Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions quickwit/quickwit-storage/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ mod cache;
mod debouncer;
mod file_descriptor_cache;
mod metrics;
mod metrics_wrappers;
mod storage;
mod timeout_and_retry_storage;
pub use debouncer::AsyncDebouncer;
Expand Down
104 changes: 37 additions & 67 deletions quickwit/quickwit-storage/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

use once_cell::sync::Lazy;
use quickwit_common::metrics::{
GaugeGuard, Histogram, IntCounter, IntCounterVec, IntGauge, new_counter, new_counter_vec,
GaugeGuard, HistogramVec, IntCounter, IntCounterVec, IntGauge, new_counter, new_counter_vec,
new_gauge, new_histogram_vec,
};

Expand All @@ -30,19 +30,13 @@ pub struct StorageMetrics {
pub searcher_split_cache: CacheMetrics,
pub get_slice_timeout_successes: [IntCounter; 3],
pub get_slice_timeout_all_timeouts: IntCounter,
pub object_storage_get_total: IntCounter,
pub object_storage_get_errors_total: IntCounterVec<1>,
pub object_storage_requests_total: IntCounterVec<2>,
pub object_storage_request_duration: HistogramVec<2>,
pub object_storage_get_slice_in_flight_count: IntGauge,
pub object_storage_get_slice_in_flight_num_bytes: IntGauge,
pub object_storage_put_total: IntCounter,
pub object_storage_put_parts: IntCounter,
pub object_storage_download_num_bytes: IntCounter,
pub object_storage_upload_num_bytes: IntCounter,

pub object_storage_delete_requests_total: IntCounter,
pub object_storage_bulk_delete_requests_total: IntCounter,
pub object_storage_delete_request_duration: Histogram,
pub object_storage_bulk_delete_request_duration: Histogram,
pub object_storage_download_num_bytes: IntCounterVec<1>,
pub object_storage_download_errors: IntCounterVec<1>,
pub object_storage_upload_num_bytes: IntCounterVec<1>,
}

impl Default for StorageMetrics {
Expand All @@ -63,31 +57,6 @@ impl Default for StorageMetrics {
let get_slice_timeout_all_timeouts =
get_slice_timeout_outcome_total_vec.with_label_values(["all_timeouts"]);

let object_storage_requests_total = new_counter_vec(
"object_storage_requests_total",
"Total number of object storage requests performed.",
"storage",
&[],
["action"],
);
let object_storage_delete_requests_total =
object_storage_requests_total.with_label_values(["delete_object"]);
let object_storage_bulk_delete_requests_total =
object_storage_requests_total.with_label_values(["delete_objects"]);

let object_storage_request_duration = new_histogram_vec(
"object_storage_request_duration_seconds",
"Duration of object storage requests in seconds.",
"storage",
&[],
["action"],
vec![0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0],
);
let object_storage_delete_request_duration =
object_storage_request_duration.with_label_values(["delete_object"]);
let object_storage_bulk_delete_request_duration =
object_storage_request_duration.with_label_values(["delete_objects"]);

StorageMetrics {
fast_field_cache: CacheMetrics::for_component("fastfields"),
fd_cache_metrics: CacheMetrics::for_component("fd"),
Expand All @@ -97,62 +66,63 @@ impl Default for StorageMetrics {
split_footer_cache: CacheMetrics::for_component("splitfooter"),
get_slice_timeout_successes,
get_slice_timeout_all_timeouts,
object_storage_get_total: new_counter(
"object_storage_gets_total",
"Number of objects fetched. Might be lower than get_slice_timeout_outcome if \
queries are debounced.",
object_storage_requests_total: new_counter_vec(
"object_storage_requests_total",
"Number of requests to the object store, by action and status. Requests are \
recorded when the response headers are returned, download failures will not \
appear as errors.",
"storage",
&[],
["action", "status"],
),
object_storage_get_errors_total: new_counter_vec::<1>(
"object_storage_get_errors_total",
"Number of GetObject errors.",
object_storage_request_duration: new_histogram_vec(
Copy link
Collaborator

@fulmicoton-dd fulmicoton-dd Dec 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it is TTFB, it should not be called duration!?

(technically I don't think this is ttfb btw :-/ )

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I agree the terminology TTFB is a bit stretched here as it also applies to other types of queries (deletes). And it's probably not a good idea to call it request duration for GET requests when it doesn't measure the whole download duration.

I still think it's valuable to measure the time it takes for get requests to get to the byte stream. It's valuable when trying to configure the StorageTimeoutPolicy, and it's also a way to measure the read performance of the object store without having to normalize by the downloaded size. Would you be fine with calling that separate metric object_storage_get_ttfb or do you still think it's not appropriate?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rdettai what are we measuring exactly?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@fmassot The storage SDK returning a ByteStream.
We can hope most of the metadata-layer work is done at this point.

@rdettai-sk

it's also a way to measure the read performance of the object store without having to normalize by the downloaded size

My comment had two parts.

  • the first one is: you called the metric duration. This needs to be renamed.
  • the second one is the TTFB part: it does not matter much, and I agree your metric is useful. I just don't want people to start comparing hyperscalers and drawing wrong conclusions due to minor implementation differences. I would edit the comment to say "TTFB-like", for instance.

I think we need to rename that metric and keep it...
And then, count + duration total + num bytes total.

With that, at least we have enough to fit a model of the form
total duration ≈ TTFB + bytes / throughput.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To summarise the changes on existing metrics:

  • I'll add a new metric called object_storage_get_slice_latency -> help: "TTFB-like latency of the object store get requests in seconds. This doesn't include the time to download the full response payload."
  • object_storage_request_duration is used only for other object store requests (only delete for now) with an updated description -> help: "Duration in seconds of the object store request"

I can add a metric to measure the total duration (download included). Should it be a counter or a histogram?

  • A histogram would be hard to leverage because you lose the connection to the actual download size. If we want the download throughput distribution, I think a histogram of the download throughput (wrapping the download itself with a timer) would be easier to use.
  • If we opt for a counter of the total duration, how would you use it to estimate the throughput?
  • We have retries when receiving an HTTP error response (e.g., a throttling response such as 429 Too Many Requests or S3's 503 Slow Down). I think failed attempts should not be included in the duration.

"object_storage_request_duration",
"Time to first byte (TTFB) for object store requests, by action and status. For \
download requests, excludes the time to download the response body.",
"storage",
&[],
["code"],
["action", "status"],
vec![0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0],
),
object_storage_get_slice_in_flight_count: new_gauge(
"object_storage_get_slice_in_flight_count",
"Number of GetObject for which the memory was allocated but the download is still \
in progress.",
"Number of get_object for which the memory was allocated but the download is \
still in progress.",
"storage",
&[],
),
object_storage_get_slice_in_flight_num_bytes: new_gauge(
"object_storage_get_slice_in_flight_num_bytes",
"Memory allocated for GetObject requests that are still in progress.",
"Memory allocated for get_object requests that are still in progress.",
"storage",
&[],
),
object_storage_put_total: new_counter(
"object_storage_puts_total",
"Number of objects uploaded. May differ from object_storage_requests_parts due to \
multipart upload.",
object_storage_download_num_bytes: new_counter_vec(
"object_storage_download_num_bytes",
"Amount of data downloaded from object storage.",
"storage",
&[],
["status"],
),
object_storage_put_parts: new_counter(
"object_storage_puts_parts",
"Number of object parts uploaded.",
"",
&[],
),
object_storage_download_num_bytes: new_counter(
"object_storage_download_num_bytes",
"Amount of data downloaded from an object storage.",
object_storage_download_errors: new_counter_vec(
"object_storage_download_errors",
// Download errors are recorded separately because the associated
// get_object requests were already recorded as successful in
// object_storage_requests_total
"Number of download requests that received successful response headers but failed \
during download.",
"storage",
&[],
["status"],
),
object_storage_upload_num_bytes: new_counter(
object_storage_upload_num_bytes: new_counter_vec(
"object_storage_upload_num_bytes",
"Amount of data uploaded to an object storage.",
"Amount of data uploaded to object storage. The value recorded for failed and \
aborted uploads is the full payload size.",
"storage",
&[],
["status"],
),
object_storage_delete_requests_total,
object_storage_bulk_delete_requests_total,
object_storage_delete_request_duration,
object_storage_bulk_delete_request_duration,
}
}
}
Expand Down
Loading