Skip to content

Commit 423e7ab

Browse files
authored
Add validate_docs parameter to ingest settings (#5984)
1 parent e02a9fd commit 423e7ab

File tree

7 files changed

+68
-33
lines changed

7 files changed

+68
-33
lines changed

quickwit/quickwit-common/src/lib.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,18 @@ pub use socket_addr_legacy_hash::SocketAddrLegacyHash;
6565
pub use stream_utils::{BoxStream, ServiceStream};
6666
use tracing::{error, info};
6767

68+
/// Always returns `true`. Being a `const fn`, it can also be evaluated at compile time. It is mostly
69+
/// used with serde as `default = "true_fn"` to initialize missing boolean fields to true.
70+
pub const fn true_fn() -> bool {
71+
true
72+
}
73+
74+
/// Returns whether the given boolean value is true. Takes a reference so that it can be used with
75+
/// serde as `skip_serializing_if = "is_true"`, which omits boolean fields whose value is true.
76+
pub fn is_true(value: &bool) -> bool {
77+
*value
78+
}
79+
6880
pub fn chunk_range(range: Range<usize>, chunk_size: usize) -> impl Iterator<Item = Range<usize>> {
6981
range.clone().step_by(chunk_size).map(move |block_start| {
7082
let block_end = (block_start + chunk_size).min(range.end);

quickwit/quickwit-config/src/index_config/mod.rs

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use chrono::Utc;
2626
use cron::Schedule;
2727
use humantime::parse_duration;
2828
use quickwit_common::uri::Uri;
29+
use quickwit_common::{is_true, true_fn};
2930
use quickwit_doc_mapper::{DocMapper, DocMapperBuilder, DocMapping};
3031
use quickwit_proto::types::IndexId;
3132
use serde::{Deserialize, Serialize};
@@ -170,6 +171,16 @@ pub struct IngestSettings {
170171
#[schema(default = 1, value_type = usize)]
171172
#[serde(default = "IngestSettings::default_min_shards")]
172173
pub min_shards: NonZeroUsize,
174+
/// Whether to validate documents against the current doc mapping during ingestion.
175+
/// Defaults to true. When false, documents will be written directly to the WAL without
176+
/// validation, but might still be rejected during indexing when applying the doc mapping
177+
/// in the doc processor. In that case, the documents are dropped and a warning is logged.
178+
///
179+
/// Note that when a source has a VRL transform configured, documents are not validated against
180+
/// the doc mapping during ingestion either.
181+
#[schema(default = true, value_type = bool)]
182+
#[serde(default = "true_fn", skip_serializing_if = "is_true")]
183+
pub validate_docs: bool,
173184
}
174185

175186
impl IngestSettings {
@@ -182,6 +193,7 @@ impl Default for IngestSettings {
182193
fn default() -> Self {
183194
Self {
184195
min_shards: Self::default_min_shards(),
196+
validate_docs: true,
185197
}
186198
}
187199
}
@@ -481,6 +493,7 @@ impl crate::TestableForRegression for IndexConfig {
481493
};
482494
let ingest_settings = IngestSettings {
483495
min_shards: NonZeroUsize::new(12).unwrap(),
496+
validate_docs: true,
484497
};
485498
let search_settings = SearchSettings {
486499
default_search_fields: vec!["message".to_string()],
@@ -942,18 +955,30 @@ mod tests {
942955

943956
#[test]
944957
fn test_ingest_settings_serde() {
945-
let ingest_settings = IngestSettings {
958+
let settings = IngestSettings {
946959
min_shards: NonZeroUsize::MIN,
960+
validate_docs: false,
947961
};
948-
let ingest_settings_yaml = serde_yaml::to_string(&ingest_settings).unwrap();
949-
let ingest_settings_roundtrip: IngestSettings =
950-
serde_yaml::from_str(&ingest_settings_yaml).unwrap();
951-
assert_eq!(ingest_settings, ingest_settings_roundtrip);
962+
let settings_yaml = serde_yaml::to_string(&settings).unwrap();
963+
assert!(settings_yaml.contains("validate_docs"));
964+
965+
let expected_settings: IngestSettings = serde_yaml::from_str(&settings_yaml).unwrap();
966+
assert_eq!(settings, expected_settings);
967+
968+
let settings = IngestSettings {
969+
min_shards: NonZeroUsize::MIN,
970+
validate_docs: true,
971+
};
972+
let settings_yaml = serde_yaml::to_string(&settings).unwrap();
973+
assert!(!settings_yaml.contains("validate_docs"));
974+
975+
let expected_settings: IngestSettings = serde_yaml::from_str(&settings_yaml).unwrap();
976+
assert_eq!(settings, expected_settings);
952977

953-
let ingest_settings_yaml = r#"
978+
let settings_yaml = r#"
954979
min_shards: 0
955980
"#;
956-
let error = serde_yaml::from_str::<IngestSettings>(ingest_settings_yaml).unwrap_err();
981+
let error = serde_yaml::from_str::<IngestSettings>(settings_yaml).unwrap_err();
957982
assert!(error.to_string().contains("expected a nonzero"));
958983
}
959984
}

quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -789,11 +789,13 @@ impl IngestController {
789789
let index_metadata = model
790790
.index_metadata(&source_uid.index_uid)
791791
.expect("index should exist");
792-
let validate_docs = model
792+
let has_transform = model
793793
.source_metadata(source_uid)
794794
.expect("source should exist")
795795
.transform_config
796-
.is_none();
796+
.is_some();
797+
let validate_docs =
798+
index_metadata.index_config.ingest_settings.validate_docs && !has_transform;
797799
let doc_mapping = &index_metadata.index_config.doc_mapping;
798800
let doc_mapping_uid = doc_mapping.doc_mapping_uid;
799801
let doc_mapping_json = serde_utils::to_json_str(doc_mapping)?;

quickwit/quickwit-doc-mapper/src/doc_mapper/date_time_type.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,12 @@
1313
// limitations under the License.
1414

1515
use indexmap::IndexSet;
16+
use quickwit_common::true_fn;
1617
use quickwit_datetime::{DateTimeInputFormat, DateTimeOutputFormat, TantivyDateTime};
1718
use serde::{Deserialize, Deserializer, Serialize};
1819
use serde_json::Value as JsonValue;
1920
use tantivy::schema::{DateTimePrecision, OwnedValue as TantivyValue};
2021

21-
use super::default_as_true;
22-
2322
/// A struct holding DateTime field options.
2423
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
2524
#[serde(deny_unknown_fields)]
@@ -41,10 +40,10 @@ pub struct QuickwitDateTimeOptions {
4140
#[serde(alias = "precision")]
4241
pub fast_precision: DateTimePrecision,
4342

44-
#[serde(default = "default_as_true")]
43+
#[serde(default = "true_fn")]
4544
pub indexed: bool,
4645

47-
#[serde(default = "default_as_true")]
46+
#[serde(default = "true_fn")]
4847
pub stored: bool,
4948

5049
#[serde(default)]

quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use std::convert::TryFrom;
1818
use anyhow::bail;
1919
use base64::prelude::{BASE64_STANDARD, Engine};
2020
use once_cell::sync::Lazy;
21+
use quickwit_common::true_fn;
2122
use regex::Regex;
2223
use serde::{Deserialize, Serialize};
2324
use serde_json::Value as JsonValue;
@@ -26,8 +27,8 @@ use tantivy::schema::{
2627
TextOptions, Type,
2728
};
2829

30+
use super::FieldMappingType;
2931
use super::date_time_type::QuickwitDateTimeOptions;
30-
use super::{FieldMappingType, default_as_true};
3132
use crate::doc_mapper::field_mapping_type::QuickwitFieldType;
3233
use crate::{Cardinality, QW_RESERVED_FIELD_NAMES};
3334

@@ -85,13 +86,13 @@ pub struct QuickwitNumericOptions {
8586
#[serde(default)]
8687
#[serde(skip_serializing_if = "Option::is_none")]
8788
pub description: Option<String>,
88-
#[serde(default = "default_as_true")]
89+
#[serde(default = "true_fn")]
8990
pub stored: bool,
90-
#[serde(default = "default_as_true")]
91+
#[serde(default = "true_fn")]
9192
pub indexed: bool,
9293
#[serde(default)]
9394
pub fast: bool,
94-
#[serde(default = "default_as_true")]
95+
#[serde(default = "true_fn")]
9596
pub coerce: bool,
9697
#[serde(default)]
9798
pub output_format: NumericOutputFormat,
@@ -116,9 +117,9 @@ pub struct QuickwitBoolOptions {
116117
#[serde(default)]
117118
#[serde(skip_serializing_if = "Option::is_none")]
118119
pub description: Option<String>,
119-
#[serde(default = "default_as_true")]
120+
#[serde(default = "true_fn")]
120121
pub stored: bool,
121-
#[serde(default = "default_as_true")]
122+
#[serde(default = "true_fn")]
122123
pub indexed: bool,
123124
#[serde(default)]
124125
pub fast: bool,
@@ -144,10 +145,10 @@ pub struct QuickwitBytesOptions {
144145
#[serde(skip_serializing_if = "Option::is_none")]
145146
pub description: Option<String>,
146147
/// If true, the field will be stored in the doc store.
147-
#[serde(default = "default_as_true")]
148+
#[serde(default = "true_fn")]
148149
pub stored: bool,
149150
/// If true, the field will be indexed.
150-
#[serde(default = "default_as_true")]
151+
#[serde(default = "true_fn")]
151152
pub indexed: bool,
152153
/// If true, the field will be stored in columnar format.
153154
#[serde(default)]
@@ -245,9 +246,9 @@ pub struct QuickwitIpAddrOptions {
245246
#[serde(default)]
246247
#[serde(skip_serializing_if = "Option::is_none")]
247248
pub description: Option<String>,
248-
#[serde(default = "default_as_true")]
249+
#[serde(default = "true_fn")]
249250
pub stored: bool,
250-
#[serde(default = "default_as_true")]
251+
#[serde(default = "true_fn")]
251252
pub indexed: bool,
252253
#[serde(default)]
253254
pub fast: bool,
@@ -433,7 +434,7 @@ pub struct QuickwitTextOptions {
433434
deserializer = TextIndexingOptions::from_parts_text,
434435
serializer = TextIndexingOptions::to_parts_text,
435436
fields = (
436-
#[serde(default = "default_as_true")]
437+
#[serde(default = "true_fn")]
437438
pub indexed: bool,
438439
#[serde(default)]
439440
#[serde(skip_serializing_if = "Option::is_none")]
@@ -447,7 +448,7 @@ pub struct QuickwitTextOptions {
447448
),
448449
)]
449450
pub indexing_options: Option<TextIndexingOptions>,
450-
#[serde(default = "default_as_true")]
451+
#[serde(default = "true_fn")]
451452
pub stored: bool,
452453
#[serde(default)]
453454
pub fast: FastFieldOptions,
@@ -577,7 +578,7 @@ pub struct QuickwitJsonOptions {
577578
serializer = TextIndexingOptions::to_parts_json,
578579
fields = (
579580
/// If true, all of the elements in the JSON object will be indexed.
580-
#[serde(default = "default_as_true")]
581+
#[serde(default = "true_fn")]
581582
pub indexed: bool,
582583
/// Sets the tokenizer that should be used with the text fields in the
583584
/// json object.
@@ -597,10 +598,10 @@ pub struct QuickwitJsonOptions {
597598
/// Options for indexing text in a Json field.
598599
pub indexing_options: Option<TextIndexingOptions>,
599600
/// If true, the field will be stored in the doc store.
600-
#[serde(default = "default_as_true")]
601+
#[serde(default = "true_fn")]
601602
pub stored: bool,
602603
/// If true, the '.' in json keys will be expanded.
603-
#[serde(default = "default_as_true")]
604+
#[serde(default = "true_fn")]
604605
pub expand_dots: bool,
605606
/// If true, the json object will be stored in columnar format.
606607
#[serde(default)]

quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,6 @@ pub(crate) use tokenizer_entry::{
4848
};
4949
pub use tokenizer_entry::{TokenizerConfig, TokenizerEntry, analyze_text};
5050

51-
/// Function used with serde to initialize boolean value at true if there is no value in json.
52-
fn default_as_true() -> bool {
53-
true
54-
}
55-
5651
pub type Partition = u64;
5752

5853
/// An alias for serde_json's object type.

quickwit/quickwit-metastore/src/tests/index.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ pub async fn test_metastore_update_ingest_settings<
160160

161161
let ingest_settings = IngestSettings {
162162
min_shards: NonZeroUsize::new(12).unwrap(),
163+
..Default::default()
163164
};
164165
let index_update_request = UpdateIndexRequest::try_from_updates(
165166
index_uid.clone(),

0 commit comments

Comments
 (0)