Skip to content

Commit d377ab0

Browse files
authored
feat: spill profile metrics (#19075)
* Fix spill read profile classification
* Unify spill I/O profiling by actual location
* Add spill profile unit tests
* Fix visibility and borrow issues for spill profiling
* Add spill fallback integration test
* Fix spill metrics: use actual locations and expose helper
* Carry spill locations through row group writer for profiling
* Refine AnyFileWriter to carry paths and fix window writer match
* Hook spill tests into test module
* Remove profile unit test and fix imports for spill fallback
* Guard spill fallback test when local spill is unavailable
* Ensure test config enables local spill for fallback test
* Configure test builder with local spill path via SpillConfig
* Stabilize spill fallback test
* Refine spill profiling paths
* Add spill profile tests for local/remote metrics
* Refine spill tests and move SpillTarget test to it suite
* Add spill test config helper and fix spill IT headers
* Fix clippy needless-borrow in spill read profile
* Align spill quota settings and configs
* Fix spill config mask test ratios
* Refine spill profiling and local writer
* Unify spill target and add local path string
* Add debug logging for spill temp cleanup
* Fix build after removing spill prefix debug
* Adjust spill configs and defaults
* Fix legacy spill config test expectation
* Revert HTTP spill cleanups to main behavior
* Fix goldenfile whitespace for configs_table_basic
* Adjust configs_table_basic golden trailing lines
* Revert vacuum_hook change to main
* config: disable implicit local spill cache fallback
* Update configs
* Align window spill quota default to 40
* chore: adjust default spill local reserved disk ratio to 10%
* chore: align window spill quota ratio default to 60
* Update spill adapter and defaults
1 parent 0268210 commit d377ab0

File tree

38 files changed

+709
-210
lines changed

38 files changed

+709
-210
lines changed

scripts/ci/deploy/config/databend-query-node-1.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,3 +162,6 @@ max_bytes = 21474836480
162162

163163
[spill]
164164
spill_local_disk_path = "./.databend/temp/_query_spill"
165+
# Cap local spill to 1GB; with the explicit 20% window ratio set below (the built-in default is 60%), window spills get a ~200MB quota.
166+
spill_local_disk_max_bytes = 1073741824
167+
window_partition_spilling_disk_quota_ratio = 20

scripts/databend_test_helper/databend_test_helper/configs/databend-query-node2.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,4 @@ path = "_databend_data/cache/query2"
6262
max_bytes = 1073741824 # 1GB
6363

6464
[spill]
65-
spill_local_disk_path = "_databend_data/spill/query2"
65+
spill_local_disk_path = "_databend_data/spill/query2"

scripts/databend_test_helper/databend_test_helper/configs/databend-query-node3.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,4 @@ path = "_databend_data/cache/query3"
6262
max_bytes = 1073741824 # 1GB
6363

6464
[spill]
65-
spill_local_disk_path = "_databend_data/spill/query3"
65+
spill_local_disk_path = "_databend_data/spill/query3"

scripts/test-bend-tests/configs/query/query-2.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,4 @@ path = "_databend_data/cache/query2"
6363
max_bytes = 1073741824 # 1GB
6464

6565
[spill]
66-
spill_local_disk_path = "_databend_data/spill/query2"
66+
spill_local_disk_path = "_databend_data/spill/query2"

scripts/test-bend-tests/configs/query/query-3.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,4 @@ path = "_databend_data/cache/query3"
6363
max_bytes = 1073741824 # 1GB
6464

6565
[spill]
66-
spill_local_disk_path = "_databend_data/spill/query3"
66+
spill_local_disk_path = "_databend_data/spill/query3"

scripts/test-bend-tests/configs/query/query-4.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,4 @@ path = "_databend_data/cache/query4"
6363
max_bytes = 1073741824 # 1GB
6464

6565
[spill]
66-
spill_local_disk_path = "_databend_data/spill/query4"
66+
spill_local_disk_path = "_databend_data/spill/query4"

scripts/test-bend-tests/configs/query/query-5.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,4 @@ path = "_databend_data/cache/query5"
6363
max_bytes = 1073741824 # 1GB
6464

6565
[spill]
66-
spill_local_disk_path = "_databend_data/spill/query5"
66+
spill_local_disk_path = "_databend_data/spill/query5"

src/query/config/src/config.rs

Lines changed: 43 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -798,7 +798,7 @@ pub struct FsStorageConfig {
798798

799799
impl FsStorageConfig {
800800
fn default_reserved_space_percentage() -> Option<OrderedFloat<f64>> {
801-
None // Use None as default, will use system default (30.0) if not specified
801+
None // Use None as default, will use system default (10.0) if not specified
802802
}
803803
}
804804

@@ -3585,13 +3585,32 @@ pub struct SpillConfig {
35853585
#[clap(long, value_name = "VALUE", default_value = "")]
35863586
pub spill_local_disk_path: String,
35873587

3588-
#[clap(long, value_name = "VALUE", default_value = "30")]
3588+
#[clap(long, value_name = "VALUE", default_value = "10")]
35893589
/// Percentage of reserve disk space that won't be used for spill to local disk.
35903590
pub spill_local_disk_reserved_space_percentage: OrderedFloat<f64>,
35913591

35923592
#[clap(long, value_name = "VALUE", default_value = "18446744073709551615")]
35933593
/// Allow space in bytes to spill to local disk.
35943594
pub spill_local_disk_max_bytes: u64,
3595+
3596+
/// Maximum percentage of the global local spill quota that a single sort
3597+
/// operator may use for one query.
3598+
///
3599+
/// Value range: 0-100. Effective only when local spill is enabled (there is
3600+
/// a valid local spill path and non-zero `spill_local_disk_max_bytes`).
3601+
#[clap(long, value_name = "PERCENT", default_value = "60")]
3602+
pub sort_spilling_disk_quota_ratio: u64,
3603+
3604+
/// Maximum percentage of the global local spill quota that window
3605+
/// partitioners may use for one query.
3606+
#[clap(long, value_name = "PERCENT", default_value = "60")]
3607+
pub window_partition_spilling_disk_quota_ratio: u64,
3608+
3609+
/// Maximum percentage of the global local spill quota that HTTP
3610+
/// result-set spilling may use for one query.
3611+
/// TODO: keep 0 to avoid deleting local result-set spill dir before HTTP pagination finishes.
3612+
#[clap(long, value_name = "PERCENT", default_value = "0")]
3613+
pub result_set_spilling_disk_quota_ratio: u64,
35953614
}
35963615

35973616
impl SpillConfig {
@@ -3630,8 +3649,12 @@ impl Default for SpillConfig {
36303649
Self {
36313650
storage: None,
36323651
spill_local_disk_path: String::new(),
3633-
spill_local_disk_reserved_space_percentage: OrderedFloat(30.0),
3652+
spill_local_disk_reserved_space_percentage: OrderedFloat(10.0),
36343653
spill_local_disk_max_bytes: u64::MAX,
3654+
sort_spilling_disk_quota_ratio: 60,
3655+
window_partition_spilling_disk_quota_ratio: 60,
3656+
// TODO: keep 0 to avoid deleting local result-set spill dir before HTTP pagination finishes.
3657+
result_set_spilling_disk_quota_ratio: 0,
36353658
}
36363659
}
36373660
}
@@ -3706,7 +3729,7 @@ mod cache_config_converters {
37063729
catalogs.insert(CATALOG_HIVE.to_string(), catalog);
37073730
}
37083731

3709-
let spill = convert_local_spill_config(spill, &cache.disk_cache_config)?;
3732+
let spill = convert_local_spill_config(spill)?;
37103733

37113734
Ok(InnerConfig {
37123735
query: query.try_into()?,
@@ -3803,10 +3826,7 @@ mod cache_config_converters {
38033826
}
38043827
}
38053828

3806-
fn convert_local_spill_config(
3807-
spill: SpillConfig,
3808-
cache: &DiskCacheConfig,
3809-
) -> Result<inner::SpillConfig> {
3829+
fn convert_local_spill_config(spill: SpillConfig) -> Result<inner::SpillConfig> {
38103830
// Determine configuration based on auto-detected spill type
38113831
let spill_type = spill.get_spill_type();
38123832
let (local_writeable_root, path, reserved_disk_ratio, global_bytes_limit, storage_params) =
@@ -3818,7 +3838,7 @@ mod cache_config_converters {
38183838
let reserved_ratio = storage
38193839
.fs
38203840
.reserved_space_percentage
3821-
.unwrap_or(OrderedFloat(30.0))
3841+
.unwrap_or(OrderedFloat(10.0))
38223842
/ 100.0;
38233843
let max_bytes = storage.fs.max_bytes.unwrap_or(u64::MAX);
38243844

@@ -3864,16 +3884,11 @@ mod cache_config_converters {
38643884
)
38653885
}
38663886
_ => {
3867-
// Default behavior for "default" type and any unrecognized types
3868-
// Default behavior with backward compatibility
3869-
let local_writeable_root = if cache.path != DiskCacheConfig::default().path
3870-
&& spill.spill_local_disk_path.is_empty()
3871-
{
3872-
Some(cache.path.clone())
3873-
} else {
3874-
None
3875-
};
3876-
3887+
// Default behavior for "default" type and any unrecognized types:
3888+
// do NOT implicitly reuse the data cache disk. Local spill is
3889+
// enabled only when explicitly configured via either
3890+
// - [spill.storage] with type = "fs", or
3891+
// - legacy spill_local_disk_path.
38773892
let storage_params = spill
38783893
.storage
38793894
.map(|storage| {
@@ -3883,7 +3898,7 @@ mod cache_config_converters {
38833898
.transpose()?;
38843899

38853900
(
3886-
local_writeable_root,
3901+
None,
38873902
spill.spill_local_disk_path,
38883903
spill.spill_local_disk_reserved_space_percentage / 100.0,
38893904
spill.spill_local_disk_max_bytes,
@@ -3898,6 +3913,10 @@ mod cache_config_converters {
38983913
reserved_disk_ratio,
38993914
global_bytes_limit,
39003915
storage_params,
3916+
sort_spilling_disk_quota_ratio: spill.sort_spilling_disk_quota_ratio,
3917+
window_partition_spilling_disk_quota_ratio: spill
3918+
.window_partition_spilling_disk_quota_ratio,
3919+
result_set_spilling_disk_quota_ratio: spill.result_set_spilling_disk_quota_ratio,
39013920
})
39023921
}
39033922

@@ -3916,6 +3935,10 @@ mod cache_config_converters {
39163935
spill_local_disk_path: value.path,
39173936
spill_local_disk_reserved_space_percentage: value.reserved_disk_ratio * 100.0,
39183937
spill_local_disk_max_bytes: value.global_bytes_limit,
3938+
sort_spilling_disk_quota_ratio: value.sort_spilling_disk_quota_ratio,
3939+
window_partition_spilling_disk_quota_ratio: value
3940+
.window_partition_spilling_disk_quota_ratio,
3941+
result_set_spilling_disk_quota_ratio: value.result_set_spilling_disk_quota_ratio,
39193942
}
39203943
}
39213944
}

src/query/config/src/inner.rs

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,22 @@ pub struct SpillConfig {
787787
pub global_bytes_limit: u64,
788788

789789
pub storage_params: Option<StorageParams>,
790+
791+
/// Maximum percentage of the global local spill quota that a single
792+
/// sort operator may use for one query.
793+
///
794+
/// Value range: 0-100. Effective only when local spill is enabled
795+
/// (i.e. there is a valid local spill path and non-zero global
796+
/// bytes limit).
797+
pub sort_spilling_disk_quota_ratio: u64,
798+
799+
/// Maximum percentage of the global local spill quota that window
800+
/// partitioners may use for one query.
801+
pub window_partition_spilling_disk_quota_ratio: u64,
802+
803+
/// Maximum percentage of the global local spill quota that HTTP
804+
/// result-set spilling may use for one query.
805+
pub result_set_spilling_disk_quota_ratio: u64,
790806
}
791807

792808
impl SpillConfig {
@@ -807,13 +823,56 @@ impl SpillConfig {
807823
None
808824
}
809825

826+
/// Helper to compute a per-query local spill quota (in bytes) from a
827+
/// percentage of the global local spill limit.
828+
///
829+
/// - If local spill is disabled (no local path or zero global
830+
/// limit), returns 0.
831+
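/// - `ratio` is clamped into [0, 100].
/// - NOTE(review): the product `global_bytes_limit * ratio` saturates at
///   `u64::MAX` before the division by 100, so for very large limits
///   (including the `u64::MAX` default) the result is `u64::MAX / 100`
///   rather than `ratio`% of the limit — confirm this is acceptable.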
832+
pub fn quota_bytes_from_ratio(&self, ratio: u64) -> usize {
833+
// Only effective when local spill is enabled.
834+
if self.local_path().is_none() {
835+
return 0;
836+
}
837+
838+
let ratio = std::cmp::min(ratio, 100);
839+
if ratio == 0 {
840+
return 0;
841+
}
842+
843+
let bytes = self.global_bytes_limit.saturating_mul(ratio) / 100;
844+
845+
// TempDirManager works with `usize` limits.
846+
std::cmp::min(bytes, usize::MAX as u64) as usize
847+
}
848+
849+
/// Per-query quota for sort operators.
850+
pub fn sort_spill_bytes_limit(&self) -> usize {
851+
self.quota_bytes_from_ratio(self.sort_spilling_disk_quota_ratio)
852+
}
853+
854+
/// Per-query quota for window partitioners.
855+
pub fn window_partition_spill_bytes_limit(&self) -> usize {
856+
self.quota_bytes_from_ratio(self.window_partition_spilling_disk_quota_ratio)
857+
}
858+
859+
/// Per-query quota for HTTP result-set spilling.
860+
pub fn result_set_spill_bytes_limit(&self) -> usize {
861+
self.quota_bytes_from_ratio(self.result_set_spilling_disk_quota_ratio)
862+
}
863+
810864
pub fn new_for_test(path: String, reserved_disk_ratio: f64, global_bytes_limit: u64) -> Self {
811865
Self {
812866
local_writeable_root: None,
813867
path,
814868
reserved_disk_ratio: OrderedFloat(reserved_disk_ratio),
815869
global_bytes_limit,
816870
storage_params: None,
871+
// Use the same defaults as the external config.
872+
sort_spilling_disk_quota_ratio: 60,
873+
window_partition_spilling_disk_quota_ratio: 60,
874+
// TODO: keep 0 to avoid deleting local result-set spill dir before HTTP pagination finishes.
875+
result_set_spilling_disk_quota_ratio: 0,
817876
}
818877
}
819878
}
@@ -823,9 +882,13 @@ impl Default for SpillConfig {
823882
Self {
824883
local_writeable_root: None,
825884
path: "".to_string(),
826-
reserved_disk_ratio: OrderedFloat(0.3),
885+
reserved_disk_ratio: OrderedFloat(0.1),
827886
global_bytes_limit: u64::MAX,
828887
storage_params: None,
888+
sort_spilling_disk_quota_ratio: 60,
889+
window_partition_spilling_disk_quota_ratio: 60,
890+
// TODO: keep 0 to avoid deleting local result-set spill dir before HTTP pagination finishes.
891+
result_set_spilling_disk_quota_ratio: 0,
829892
}
830893
}
831894
}

src/query/config/src/mask.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,13 +215,20 @@ impl SpillConfig {
215215
ref spill_local_disk_path,
216216
spill_local_disk_reserved_space_percentage,
217217
spill_local_disk_max_bytes,
218+
sort_spilling_disk_quota_ratio,
219+
window_partition_spilling_disk_quota_ratio,
220+
result_set_spilling_disk_quota_ratio,
218221
} = *self;
219222

220223
Self {
221224
storage: storage.as_ref().map(|storage| storage.mask_display()),
222225
spill_local_disk_path: spill_local_disk_path.clone(),
223226
spill_local_disk_reserved_space_percentage,
224227
spill_local_disk_max_bytes,
228+
sort_spilling_disk_quota_ratio,
229+
window_partition_spilling_disk_quota_ratio,
230+
// TODO: keep 0 to avoid deleting local result-set spill dir before HTTP pagination finishes.
231+
result_set_spilling_disk_quota_ratio,
225232
}
226233
}
227234
}
@@ -375,6 +382,10 @@ mod tests {
375382
spill_local_disk_path: "".to_string(),
376383
spill_local_disk_reserved_space_percentage: 30.0.into(),
377384
spill_local_disk_max_bytes: 10,
385+
sort_spilling_disk_quota_ratio: 60,
386+
window_partition_spilling_disk_quota_ratio: 30,
387+
// TODO: keep 0 to avoid deleting local result-set spill dir before HTTP pagination finishes.
388+
result_set_spilling_disk_quota_ratio: 0,
378389
storage: Some(StorageConfig {
379390
typ: "s3".to_string(),
380391
s3: S3StorageConfig {

0 commit comments

Comments
 (0)