Skip to content

Commit 8809dae

Browse files
authored
perf: Improve performance of ltrim, rtrim, btrim (#19551)
## Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. -->

- Closes #12576

## Rationale for this change

<!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. -->

Improve performance.

### ltrim

| Scenario                   | Before  | After   | Improvement |
|----------------------------|---------|---------|-------------|
| short strings, string_view | 77.0 µs | 51.2 µs | +33.6%      |
| short strings, string      | 77.9 µs | 47.9 µs | +38.5%      |
| long strings, short trim   | 77.3 µs | 49.1 µs | +36.4%      |

### rtrim

| Scenario                   | Before  | After   | Improvement |
|----------------------------|---------|---------|-------------|
| short strings, string_view | 80.6 µs | 45.6 µs | +43.4%      |
| short strings, string      | 80.6 µs | 47.4 µs | +41.3%      |
| long strings, short trim   | 78.0 µs | 44.9 µs | +42.5%      |

### btrim

| Scenario                   | Before   | After   | Improvement |
|----------------------------|----------|---------|-------------|
| short strings, string_view | 106.4 µs | 77.3 µs | +27.4%      |
| short strings, string      | 106.3 µs | 75.1 µs | +29.4%      |
| long strings, short trim   | 109.7 µs | 74.5 µs | +32.1%      |

## What changes are included in this PR?

<!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. -->

- Optimizations
  - precompute pattern instead of computing per row
- Improve benchmark to cover ltrim, rtrim, btrim

## Are these changes tested?

<!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? -->

Existing tests

## Are there any user-facing changes?

<!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. -->

No

<!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent 715962c commit 8809dae

File tree

6 files changed

+234
-208
lines changed

6 files changed

+234
-208
lines changed

datafusion/functions/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ required-features = ["unicode_expressions"]
187187

188188
[[bench]]
189189
harness = false
190-
name = "ltrim"
190+
name = "trim"
191191
required-features = ["string_expressions"]
192192

193193
[[bench]]
Lines changed: 125 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -48,28 +48,58 @@ impl fmt::Display for StringArrayType {
4848
}
4949
}
5050

51-
/// returns an array of strings, and `characters` as a ScalarValue
52-
pub fn create_string_array_and_characters(
51+
/// Identifies which trim function a benchmark case exercises.
///
/// The variant decides where the characters to be trimmed are placed in each
/// generated input string, and its `Display` text is used in the benchmark label.
#[derive(Clone, Copy)]
52+
pub enum TrimType {
53+
/// Input strings carry the trim characters as a prefix.
Ltrim,
54+
/// Input strings carry the trim characters as a suffix.
Rtrim,
55+
/// Input strings carry the trim characters at both the start and the end.
Btrim,
56+
}
57+
58+
/// Formats a `TrimType` as its lowercase function name.
impl fmt::Display for TrimType {
59+
/// Writes "ltrim", "rtrim" or "btrim" for the corresponding variant.
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60+
// Exhaustive match with no catch-all arm: adding a variant will not
// compile until it is given a name here.
match self {
61+
TrimType::Ltrim => f.write_str("ltrim"),
62+
TrimType::Rtrim => f.write_str("rtrim"),
63+
TrimType::Btrim => f.write_str("btrim"),
64+
}
65+
}
66+
}
67+
68+
/// Returns an array of strings with trim characters positioned according to trim type,
69+
/// and `characters` as a ScalarValue.
70+
///
71+
/// For ltrim: trim characters are at the start (prefix)
72+
/// For rtrim: trim characters are at the end (suffix)
73+
/// For btrim: trim characters are at both start and end
74+
fn create_string_array_and_characters(
5375
size: usize,
5476
characters: &str,
5577
trimmed: &str,
5678
remaining_len: usize,
5779
string_array_type: StringArrayType,
80+
trim_type: TrimType,
5881
) -> (ArrayRef, ScalarValue) {
5982
let rng = &mut StdRng::seed_from_u64(42);
6083

6184
// Create `size` rows:
6285
// - 10% rows will be `None`
63-
// - Other 90% will be strings with same `remaining_len` lengths
64-
// We will build the string array on it later.
86+
// - Other 90% will be strings with `remaining_len` content length
6587
let string_iter = (0..size).map(|_| {
6688
if rng.random::<f32>() < 0.1 {
6789
None
6890
} else {
69-
let mut value = trimmed.as_bytes().to_vec();
70-
let generated = rng.sample_iter(&Alphanumeric).take(remaining_len);
71-
value.extend(generated);
72-
Some(String::from_utf8(value).unwrap())
91+
let content: String = rng
92+
.sample_iter(&Alphanumeric)
93+
.take(remaining_len)
94+
.map(char::from)
95+
.collect();
96+
97+
let value = match trim_type {
98+
TrimType::Ltrim => format!("{trimmed}{content}"),
99+
TrimType::Rtrim => format!("{content}{trimmed}"),
100+
TrimType::Btrim => format!("{trimmed}{content}{trimmed}"),
101+
};
102+
Some(value)
73103
}
74104
});
75105

@@ -90,30 +120,22 @@ pub fn create_string_array_and_characters(
90120
}
91121
}
92122

93-
/// Create args for the ltrim benchmark
94-
/// Inputs:
95-
/// - size: rows num of the test array
96-
/// - characters: the characters we need to trim
97-
/// - trimmed: the part in the testing string that will be trimmed
98-
/// - remaining_len: the len of the remaining part of testing string after trimming
99-
/// - string_array_type: the method used to store the testing strings
100-
///
101-
/// Outputs:
102-
/// - testing string array
103-
/// - trimmed characters
123+
/// Create args for the trim benchmark
104124
fn create_args(
105125
size: usize,
106126
characters: &str,
107127
trimmed: &str,
108128
remaining_len: usize,
109129
string_array_type: StringArrayType,
130+
trim_type: TrimType,
110131
) -> Vec<ColumnarValue> {
111132
let (string_array, pattern) = create_string_array_and_characters(
112133
size,
113134
characters,
114135
trimmed,
115136
remaining_len,
116137
string_array_type,
138+
trim_type,
117139
);
118140
vec![
119141
ColumnarValue::Array(string_array),
@@ -124,15 +146,23 @@ fn create_args(
124146
#[allow(clippy::too_many_arguments)]
125147
fn run_with_string_type<M: Measurement>(
126148
group: &mut BenchmarkGroup<'_, M>,
127-
ltrim: &ScalarUDF,
149+
trim_func: &ScalarUDF,
150+
trim_type: TrimType,
128151
size: usize,
129-
len: usize,
152+
total_len: usize,
130153
characters: &str,
131154
trimmed: &str,
132155
remaining_len: usize,
133156
string_type: StringArrayType,
134157
) {
135-
let args = create_args(size, characters, trimmed, remaining_len, string_type);
158+
let args = create_args(
159+
size,
160+
characters,
161+
trimmed,
162+
remaining_len,
163+
string_type,
164+
trim_type,
165+
);
136166
let arg_fields = args
137167
.iter()
138168
.enumerate()
@@ -142,12 +172,12 @@ fn run_with_string_type<M: Measurement>(
142172

143173
group.bench_function(
144174
format!(
145-
"{string_type} [size={size}, len_before={len}, len_after={remaining_len}]",
175+
"{trim_type} {string_type} [size={size}, len={total_len}, remaining={remaining_len}]",
146176
),
147177
|b| {
148178
b.iter(|| {
149179
let args_cloned = args.clone();
150-
black_box(ltrim.invoke_with_args(ScalarFunctionArgs {
180+
black_box(trim_func.invoke_with_args(ScalarFunctionArgs {
151181
args: args_cloned,
152182
arg_fields: arg_fields.clone(),
153183
number_rows: size,
@@ -160,13 +190,14 @@ fn run_with_string_type<M: Measurement>(
160190
}
161191

162192
#[allow(clippy::too_many_arguments)]
163-
fn run_one_group(
193+
fn run_trim_benchmark(
164194
c: &mut Criterion,
165195
group_name: &str,
166-
ltrim: &ScalarUDF,
196+
trim_func: &ScalarUDF,
197+
trim_type: TrimType,
167198
string_types: &[StringArrayType],
168199
size: usize,
169-
len: usize,
200+
total_len: usize,
170201
characters: &str,
171202
trimmed: &str,
172203
remaining_len: usize,
@@ -178,9 +209,10 @@ fn run_one_group(
178209
for string_type in string_types {
179210
run_with_string_type(
180211
&mut group,
181-
ltrim,
212+
trim_func,
213+
trim_type,
182214
size,
183-
len,
215+
total_len,
184216
characters,
185217
trimmed,
186218
remaining_len,
@@ -193,61 +225,79 @@ fn run_one_group(
193225

194226
fn criterion_benchmark(c: &mut Criterion) {
195227
let ltrim = string::ltrim();
228+
let rtrim = string::rtrim();
229+
let btrim = string::btrim();
230+
196231
let characters = ",!()";
197232

198233
let string_types = [
199234
StringArrayType::Utf8View,
200235
StringArrayType::Utf8,
201236
StringArrayType::LargeUtf8,
202237
];
203-
for size in [1024, 4096, 8192] {
204-
// len=12, trimmed_len=4, len_after_ltrim=8
205-
let len = 12;
206-
let trimmed = characters;
207-
let remaining_len = len - trimmed.len();
208-
run_one_group(
209-
c,
210-
"INPUT LEN <= 12",
211-
&ltrim,
212-
&string_types,
213-
size,
214-
len,
215-
characters,
216-
trimmed,
217-
remaining_len,
218-
);
219238

220-
// len=64, trimmed_len=4, len_after_ltrim=60
221-
let len = 64;
222-
let trimmed = characters;
223-
let remaining_len = len - trimmed.len();
224-
run_one_group(
225-
c,
226-
"INPUT LEN > 12, OUTPUT LEN > 12",
227-
&ltrim,
228-
&string_types,
229-
size,
230-
len,
231-
characters,
232-
trimmed,
233-
remaining_len,
234-
);
239+
let trim_funcs = [
240+
(&ltrim, TrimType::Ltrim),
241+
(&rtrim, TrimType::Rtrim),
242+
(&btrim, TrimType::Btrim),
243+
];
235244

236-
// len=64, trimmed_len=56, len_after_ltrim=8
237-
let len = 64;
238-
let trimmed = characters.repeat(15);
239-
let remaining_len = len - trimmed.len();
240-
run_one_group(
241-
c,
242-
"INPUT LEN > 12, OUTPUT LEN <= 12",
243-
&ltrim,
244-
&string_types,
245-
size,
246-
len,
247-
characters,
248-
&trimmed,
249-
remaining_len,
250-
);
245+
for size in [4096] {
246+
for (trim_func, trim_type) in &trim_funcs {
247+
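// Scenario 1: Short strings (len <= 12, inline in StringView).
// NOTE: for btrim, `trimmed` is added at both ends, so the generated input is
// 4 + 8 + 4 = 16 bytes, which is longer than `total_len` and above the 12-byte limit.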
248+
// trimmed_len=4, remaining_len=8
249+
let total_len = 12;
250+
let trimmed = characters;
251+
let remaining_len = total_len - trimmed.len();
252+
run_trim_benchmark(
253+
c,
254+
"short strings (len <= 12)",
255+
trim_func,
256+
*trim_type,
257+
&string_types,
258+
size,
259+
total_len,
260+
characters,
261+
trimmed,
262+
remaining_len,
263+
);
264+
265+
// Scenario 2: Long strings, short trim (len > 12, output > 12)
266+
// trimmed_len=4, remaining_len=60
267+
let total_len = 64;
268+
let trimmed = characters;
269+
let remaining_len = total_len - trimmed.len();
270+
run_trim_benchmark(
271+
c,
272+
"long strings, short trim",
273+
trim_func,
274+
*trim_type,
275+
&string_types,
276+
size,
277+
total_len,
278+
characters,
279+
trimmed,
280+
remaining_len,
281+
);
282+
283+
// Scenario 3: Long strings, long trim (len > 12, output <= 12)
284+
// trimmed_len=56, remaining_len=8
285+
let total_len = 64;
286+
let trimmed = characters.repeat(14);
287+
let remaining_len = total_len - trimmed.len();
288+
run_trim_benchmark(
289+
c,
290+
"long strings, long trim",
291+
trim_func,
292+
*trim_type,
293+
&string_types,
294+
size,
295+
total_len,
296+
characters,
297+
&trimmed,
298+
remaining_len,
299+
);
300+
}
251301
}
252302
}
253303

datafusion/functions/src/string/btrim.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
4040
} else {
4141
args.to_owned()
4242
};
43-
general_trim::<T>(&args, TrimType::Both, use_string_view)
43+
general_trim::<T, TrimBoth>(&args, use_string_view)
4444
}
4545

4646
#[user_doc(

0 commit comments

Comments
 (0)