Skip to content

Commit bc42133

Browse files
authored
fix(query): Fix inverted index matched score caused panic (#19032)
* fix(query): Fix inverted index matched score caused panic * fix
1 parent 7ac02fa commit bc42133

File tree

10 files changed

+118
-78
lines changed

10 files changed

+118
-78
lines changed

src/query/catalog/src/plan/internal_column.rs

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,10 @@ pub struct InternalColumnMeta {
106106
pub offsets: Option<RoaringTreemap>,
107107
pub base_block_ids: Option<Scalar>,
108108
pub inner: Option<BlockMetaInfoPtr>,
109-
// The search matched rows and optional scores in the block.
110-
pub matched_rows: Option<Vec<(usize, Option<F32>)>>,
109+
// The search matched rows in the block (aligned with `matched_scores` when present).
110+
pub matched_rows: Option<Vec<usize>>,
111+
// Optional scores for the matched rows.
112+
pub matched_scores: Option<Vec<F32>>,
111113
// The vector topn rows and scores in the block.
112114
pub vector_scores: Option<Vec<(usize, F32)>>,
113115
}
@@ -280,24 +282,25 @@ impl InternalColumn {
280282
InternalColumnType::SearchMatched => {
281283
assert!(meta.matched_rows.is_some());
282284
let matched_rows = meta.matched_rows.as_ref().unwrap();
283-
284285
let mut bitmap = MutableBitmap::from_len_zeroed(num_rows);
285-
for (idx, _) in matched_rows.iter() {
286-
debug_assert!(*idx < bitmap.len());
286+
for idx in matched_rows.iter() {
287+
debug_assert!(*idx < num_rows);
287288
bitmap.set(*idx, true);
288289
}
289290
Column::Boolean(bitmap.into()).into()
290291
}
291292
InternalColumnType::SearchScore => {
292293
assert!(meta.matched_rows.is_some());
294+
assert!(meta.matched_scores.is_some());
293295
let matched_rows = meta.matched_rows.as_ref().unwrap();
296+
let matched_scores = meta.matched_scores.as_ref().unwrap();
297+
debug_assert_eq!(matched_rows.len(), matched_scores.len());
294298

295299
let mut scores = vec![F32::from(0_f32); num_rows];
296-
for (idx, score) in matched_rows.iter() {
297-
debug_assert!(*idx < scores.len());
300+
for (idx, score) in matched_rows.iter().zip(matched_scores.iter()) {
301+
debug_assert!(*idx < num_rows);
298302
if let Some(val) = scores.get_mut(*idx) {
299-
debug_assert!(score.is_some());
300-
*val = F32::from(*score.unwrap());
303+
*val = *score;
301304
}
302305
}
303306
Float32Type::from_data(scores).into()

src/query/service/tests/it/indexes/inverted_index/index_refresh.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,10 +196,10 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> {
196196
)
197197
.await?;
198198
assert!(matched_rows.is_some());
199-
let matched_rows = matched_rows.unwrap();
199+
let (matched_rows, _) = matched_rows.unwrap();
200200
assert_eq!(matched_rows.len(), ids.len());
201201
for (matched_row, id) in matched_rows.iter().zip(ids.iter()) {
202-
assert_eq!(matched_row.0, *id);
202+
assert_eq!(matched_row, id);
203203
}
204204
}
205205

src/query/service/tests/it/storages/fuse/operations/internal_column.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ fn expected_data_block(
6565
base_block_ids: None,
6666
inner: None,
6767
matched_rows: block_meta.matched_rows.clone(),
68+
matched_scores: block_meta.matched_scores.clone(),
6869
vector_scores: block_meta.vector_scores.clone(),
6970
};
7071
for internal_column in internal_columns {

src/query/storages/common/pruner/src/block_meta.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,10 @@ pub struct BlockMetaIndex {
4141
pub block_location: String,
4242
pub segment_location: String,
4343
pub snapshot_location: Option<String>,
44-
// The search matched rows and optional scores in the block.
45-
pub matched_rows: Option<Vec<(usize, Option<F32>)>>,
44+
// The search matched rows in the block (aligned with `matched_scores` when present).
45+
pub matched_rows: Option<Vec<usize>>,
46+
// Optional scores for the matched rows.
47+
pub matched_scores: Option<Vec<F32>>,
4648
// The vector topn rows and scores in the block.
4749
pub vector_scores: Option<Vec<(usize, F32)>>,
4850
// The optional meta of virtual columns.

src/query/storages/common/pruner/src/topn_pruner.rs

Lines changed: 16 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -237,16 +237,13 @@ impl TopNPruner {
237237

238238
let mut score_stats = Vec::new();
239239
for (pos, (index, _)) in metas.iter().enumerate() {
240-
let Some(rows) = &index.matched_rows else {
241-
continue;
240+
let Some(scores) = &index.matched_scores else {
241+
return Ok(metas);
242242
};
243-
if rows.is_empty() {
244-
continue;
245-
}
246-
let Some((min_score, max_score)) = block_score_range(rows) else {
243+
let Some((min_score, max_score)) = block_score_range(scores) else {
247244
return Ok(metas);
248245
};
249-
score_stats.push((pos, min_score, max_score, rows.len()));
246+
score_stats.push((pos, min_score, max_score, scores.len()));
250247
}
251248

252249
if score_stats.is_empty() {
@@ -302,29 +299,22 @@ fn index_match_count(index: &BlockMetaIndex) -> usize {
302299
0
303300
}
304301

305-
fn block_score_range(rows: &[(usize, Option<F32>)]) -> Option<(F32, F32)> {
306-
let mut min_score: Option<F32> = None;
307-
let mut max_score: Option<F32> = None;
308-
for (_, score) in rows {
309-
let score = (*score)?;
310-
min_score = Some(match min_score {
311-
Some(current) => current.min(score),
312-
None => score,
313-
});
314-
max_score = Some(match max_score {
315-
Some(current) => current.max(score),
316-
None => score,
317-
});
302+
fn block_score_range(scores: &[F32]) -> Option<(F32, F32)> {
303+
if scores.is_empty() {
304+
return None;
318305
}
319-
min_score.zip(max_score)
306+
// Scores are arranged in descending order,
307+
// so we can directly get the maximum and minimum score.
308+
let max_score = scores[0];
309+
let min_score = scores[scores.len() - 1];
310+
Some((min_score, max_score))
320311
}
321312

322313
#[cfg(test)]
323314
mod tests {
324315
use std::collections::HashMap;
325316

326317
use databend_common_expression::types::number::NumberDataType;
327-
use databend_common_expression::types::number::F32;
328318
use databend_common_expression::types::DataType;
329319
use databend_common_expression::ColumnId;
330320
use databend_common_expression::Scalar;
@@ -454,7 +444,7 @@ mod tests {
454444
let matched = if matched_rows == 0 {
455445
None
456446
} else {
457-
Some((0..matched_rows).map(|row| (row, None)).collect::<Vec<_>>())
447+
Some((0..matched_rows).collect::<Vec<_>>())
458448
};
459449

460450
let index = BlockMetaIndex {
@@ -467,6 +457,7 @@ mod tests {
467457
segment_location: "segment".to_string(),
468458
snapshot_location: None,
469459
matched_rows: matched,
460+
matched_scores: None,
470461
vector_scores: None,
471462
virtual_block_meta: None,
472463
};
@@ -482,15 +473,8 @@ mod tests {
482473
scores: &[f32],
483474
) -> (BlockMetaIndex, Arc<BlockMeta>) {
484475
let (mut index, meta) = build_block(column_id, block_id, min, max, scores.len());
485-
let matched_rows = scores
486-
.iter()
487-
.enumerate()
488-
.map(|(row, score)| {
489-
let ordered: F32 = (*score).into();
490-
(row, Some(ordered))
491-
})
492-
.collect();
493-
index.matched_rows = Some(matched_rows);
476+
let matched_scores = scores.iter().map(|v| (*v).into()).collect();
477+
index.matched_scores = Some(matched_scores);
494478
(index, meta)
495479
}
496480
}

src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ impl InvertedIndexReader {
8383
index_record: &IndexRecordOption,
8484
fuzziness: &Option<u8>,
8585
index_loc: &str,
86-
) -> Result<Option<Vec<(usize, Option<F32>)>>> {
86+
) -> Result<Option<(Vec<usize>, Option<Vec<F32>>)>> {
8787
let start = Instant::now();
8888

8989
let matched_rows = self
@@ -113,7 +113,7 @@ impl InvertedIndexReader {
113113
field_ids: &HashSet<u32>,
114114
index_record: &IndexRecordOption,
115115
fuzziness: &Option<u8>,
116-
) -> Result<Option<Vec<(usize, Option<F32>)>>> {
116+
) -> Result<Option<(Vec<usize>, Option<Vec<F32>>)>> {
117117
// read index meta.
118118
let inverted_index_meta = load_inverted_index_meta(self.dal.clone(), index_path).await?;
119119
let version = inverted_index_meta.version;
@@ -158,7 +158,7 @@ impl InvertedIndexReader {
158158
query: Box<dyn Query>,
159159
version: usize,
160160
inverted_index_meta_map: HashMap<String, SingleColumnMeta>,
161-
) -> Result<Option<Vec<(usize, Option<F32>)>>> {
161+
) -> Result<Option<(Vec<usize>, Option<Vec<F32>>)>> {
162162
let directory = load_inverted_index_directory(
163163
settings,
164164
index_path,
@@ -173,30 +173,33 @@ impl InvertedIndexReader {
173173
let reader = index.reader()?;
174174
let searcher = reader.searcher();
175175

176-
let matched_rows = if self.has_score {
176+
let (matched_rows, matched_scores) = if self.has_score {
177177
let collector = TopDocs::with_limit(self.row_count as usize);
178178
let docs = searcher.search(&query, &collector)?;
179179

180180
let mut matched_rows = Vec::with_capacity(docs.len());
181+
let mut matched_scores = Vec::with_capacity(docs.len());
181182
for (score, doc_addr) in docs {
182183
let doc_id = doc_addr.doc_id as usize;
183184
let score = F32::from(score);
184-
matched_rows.push((doc_id, Some(score)));
185+
matched_rows.push(doc_id);
186+
matched_scores.push(score);
185187
}
186-
matched_rows
188+
(matched_rows, Some(matched_scores))
187189
} else {
188190
let collector = DocSetCollector;
189191
let docs = searcher.search(&query, &collector)?;
190192

191193
let mut matched_rows = Vec::with_capacity(docs.len());
192194
for doc_addr in docs {
193195
let doc_id = doc_addr.doc_id as usize;
194-
matched_rows.push((doc_id, None));
196+
matched_rows.push(doc_id);
195197
}
196-
matched_rows
198+
(matched_rows, None)
197199
};
200+
198201
if !matched_rows.is_empty() {
199-
Ok(Some(matched_rows))
202+
Ok(Some((matched_rows, matched_scores)))
200203
} else {
201204
Ok(None)
202205
}
@@ -242,7 +245,7 @@ impl InvertedIndexReader {
242245
index_record: &IndexRecordOption,
243246
fuzziness: &Option<u8>,
244247
mut inverted_index_meta_map: HashMap<String, SingleColumnMeta>,
245-
) -> Result<Option<Vec<(usize, Option<F32>)>>> {
248+
) -> Result<Option<(Vec<usize>, Option<Vec<F32>>)>> {
246249
// 1. read fst and term files.
247250
let mut columns = Vec::with_capacity(field_ids.len() * 2);
248251
for field_id in field_ids {
@@ -479,19 +482,25 @@ impl InvertedIndexReader {
479482

480483
if let Some(matched_doc_ids) = matched_doc_ids {
481484
if !matched_doc_ids.is_empty() {
482-
let mut matched_rows = Vec::with_capacity(matched_doc_ids.len() as usize);
483-
if self.has_score {
485+
let (matched_rows, matched_scores) = if self.has_score {
484486
let scores =
485487
collector.calculate_scores(query.box_clone(), &matched_doc_ids, None)?;
486-
for (doc_id, score) in matched_doc_ids.into_iter().zip(scores.into_iter()) {
487-
matched_rows.push((doc_id as usize, Some(score)));
488-
}
488+
let mut rows_scores = matched_doc_ids
489+
.into_iter()
490+
.zip(scores.into_iter())
491+
.map(|(doc_id, score)| (doc_id as usize, score))
492+
.collect::<Vec<_>>();
493+
rows_scores.sort_by(|a, b| b.1.cmp(&a.1));
494+
let (matched_rows, matched_scores) = rows_scores.into_iter().unzip();
495+
(matched_rows, Some(matched_scores))
489496
} else {
497+
let mut matched_rows = Vec::with_capacity(matched_doc_ids.len() as usize);
490498
for doc_id in matched_doc_ids.into_iter() {
491-
matched_rows.push((doc_id as usize, None));
499+
matched_rows.push(doc_id as usize);
492500
}
493-
}
494-
return Ok(Some(matched_rows));
501+
(matched_rows, None)
502+
};
503+
return Ok(Some((matched_rows, matched_scores)));
495504
}
496505
}
497506
Ok(None)

src/query/storages/fuse/src/operations/read/util.rs

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -70,17 +70,46 @@ pub(crate) fn add_data_block_meta(
7070
let block_meta = fuse_part.block_meta_index().unwrap();
7171

7272
// Transform matched_rows indices from block-level to page-level
73-
let matched_rows = block_meta.matched_rows.clone().map(|matched_rows| {
74-
if let Some(offsets) = &offsets {
75-
matched_rows
76-
.into_iter()
77-
.filter(|(idx, _)| offsets.contains(*idx as u64))
78-
.map(|(idx, score)| ((offsets.rank(idx as u64) - 1) as usize, score))
79-
.collect::<Vec<_>>()
80-
} else {
81-
matched_rows
73+
let (matched_rows, matched_scores) = if let Some(offsets) = &offsets {
74+
match (
75+
block_meta.matched_rows.clone(),
76+
block_meta.matched_scores.clone(),
77+
) {
78+
(Some(rows), Some(scores)) => {
79+
debug_assert_eq!(rows.len(), scores.len());
80+
let mut filtered_rows = Vec::with_capacity(rows.len());
81+
let mut filtered_scores = Vec::with_capacity(scores.len());
82+
for (idx, score) in rows.into_iter().zip(scores.into_iter()) {
83+
if offsets.contains(idx as u64) {
84+
let rank = offsets.rank(idx as u64);
85+
debug_assert!(rank > 0);
86+
let new_idx = (rank - 1) as usize;
87+
filtered_rows.push(new_idx);
88+
filtered_scores.push(score);
89+
}
90+
}
91+
(Some(filtered_rows), Some(filtered_scores))
92+
}
93+
(Some(rows), None) => {
94+
let mut filtered_rows = Vec::with_capacity(rows.len());
95+
for idx in rows.into_iter() {
96+
if offsets.contains(idx as u64) {
97+
let rank = offsets.rank(idx as u64);
98+
debug_assert!(rank > 0);
99+
let new_idx = (rank - 1) as usize;
100+
filtered_rows.push(new_idx);
101+
}
102+
}
103+
(Some(filtered_rows), None)
104+
}
105+
(None, _) => (None, None),
82106
}
83-
});
107+
} else {
108+
(
109+
block_meta.matched_rows.clone(),
110+
block_meta.matched_scores.clone(),
111+
)
112+
};
84113

85114
// Transform vector_scores indices from block-level to page-level
86115
let vector_scores = block_meta.vector_scores.clone().map(|vector_scores| {
@@ -105,6 +134,7 @@ pub(crate) fn add_data_block_meta(
105134
base_block_ids,
106135
inner: meta,
107136
matched_rows,
137+
matched_scores,
108138
vector_scores,
109139
};
110140
meta = Some(Box::new(internal_column_meta));

0 commit comments

Comments
 (0)