From adf800b36d62b64223a5d54b73008d7a71dbd280 Mon Sep 17 00:00:00 2001 From: Paula Stachova Date: Mon, 17 Feb 2025 17:33:56 +0100 Subject: [PATCH 1/4] feat: optional field count threshold --- src/schema-analyzer.ts | 11 ++++++ test/bloated.test.ts | 82 ++++++++++++++++++++++++++---------------- 2 files changed, 62 insertions(+), 31 deletions(-) diff --git a/src/schema-analyzer.ts b/src/schema-analyzer.ts index d06a1fc..860aaa8 100644 --- a/src/schema-analyzer.ts +++ b/src/schema-analyzer.ts @@ -167,6 +167,7 @@ type AllSchemaParseOptions = { storeValues: boolean; signal?: AbortSignal; storedValuesLengthLimit: number; + distinctFieldsAbortThreshold?: number; }; export type SchemaParseOptions = Partial; @@ -469,6 +470,7 @@ export class SchemaAnalyzer { semanticTypes: SemanticTypeMap; options: AllSchemaParseOptions; documentsAnalyzed = 0; + fieldsCount = 0; schemaAnalysisRoot: SchemaAnalysisRoot = { fields: Object.create(null), count: 0 @@ -508,6 +510,14 @@ export class SchemaAnalyzer { } } + increaseFieldCount() { + if (!this.options.distinctFieldsAbortThreshold) return; + this.fieldsCount++; + if (this.fieldsCount > this.options.distinctFieldsAbortThreshold) { + throw new Error(`Schema analysis aborted: Fields count above ${this.options.distinctFieldsAbortThreshold}`); + } + } + getSemanticType(value: BSONValue, path: string[]) { // Pass value to semantic type detectors, return first match or undefined. 
const returnValue = Object.entries(this.semanticTypes) @@ -580,6 +590,7 @@ export class SchemaAnalyzer { count: 0, types: Object.create(null) }; + this.increaseFieldCount(); } const field = schema[fieldName]; diff --git a/test/bloated.test.ts b/test/bloated.test.ts index 0793ba4..8d40a7e 100644 --- a/test/bloated.test.ts +++ b/test/bloated.test.ts @@ -14,40 +14,60 @@ function generateRandomString(length: number) { } describe('bloated documents', function() { - it('really long string is cropped', async function() { - const documents = [{ - str: generateRandomString(20000) - }]; - const schema = await getSchema(documents); - const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length; - assert.ok(stringLength <= 10000); - }); + describe('sizeable sample values', function() { + it('really long string is cropped', async function() { + const documents = [{ + str: generateRandomString(20000) + }]; + const schema = await getSchema(documents); + const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length; + assert.ok(stringLength <= 10000); + }); - it('really long code is cropped', async function() { - const documents = [{ - code: new Code(generateRandomString(20000)) - }]; - const schema = await getSchema(documents); - const codeLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Code).code.length; - assert.ok(codeLength <= 10000); - }); + it('really long code is cropped', async function() { + const documents = [{ + code: new Code(generateRandomString(20000)) + }]; + const schema = await getSchema(documents); + const codeLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Code).code.length; + assert.ok(codeLength <= 10000); + }); + + it('really long binary is cropped', async function() { + const documents = [{ + binData: new Binary(Buffer.from(generateRandomString(20000)), 2) + }]; + const schema = await getSchema(documents); + const binary = 
((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Binary); + assert.ok(binary.length() <= 10000); + assert.strictEqual(binary.sub_type, 2); + }); - it('really long binary is cropped', async function() { - const documents = [{ - binData: new Binary(Buffer.from(generateRandomString(20000)), 2) - }]; - const schema = await getSchema(documents); - const binary = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Binary); - assert.ok(binary.length() <= 10000); - assert.strictEqual(binary.sub_type, 2); + it('the limit is configurable', async function() { + const documents = [{ + str: generateRandomString(20000) + }]; + const schema = await getSchema(documents, { storedValuesLengthLimit: 5 }); + const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length; + assert.ok(stringLength === 5); + }); }); - it('the limit is configurable', async function() { - const documents = [{ - str: generateRandomString(20000) - }]; - const schema = await getSchema(documents, { storedValuesLengthLimit: 5 }); - const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length; - assert.ok(stringLength === 5); + describe('high complexity', function() { + it('aborts after reaching the given limit', async function() { + const documents = [{ + field1: 'abc', + field2: 'bca', + field3: 'cba', + field4: 'cab', + field5: 'bac' + }]; + try { + await getSchema(documents, { distinctFieldsAbortThreshold: 4 }); + assert.fail('Analysis did not throw'); + } catch (error) { + assert.strictEqual((error as Error).message, 'Schema analysis aborted: Fields count above 4'); + } + }); }); }); From b184df483a6d0a68afff95f17a62faee29180455 Mon Sep 17 00:00:00 2001 From: Paula Stachova Date: Tue, 18 Feb 2025 17:07:12 +0100 Subject: [PATCH 2/4] add test --- test/bloated.test.ts | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/bloated.test.ts b/test/bloated.test.ts index 8d40a7e..55b8fc1 
100644 --- a/test/bloated.test.ts +++ b/test/bloated.test.ts @@ -69,5 +69,23 @@ describe('bloated documents', function() { assert.strictEqual((error as Error).message, 'Schema analysis aborted: Fields count above 4'); } }); + + it('aborts after reaching the given limit - nested', async function() { + const documents = [{ + field1: { + field2: { + field3: 'abc', + field4: 'bca' + }, + field5: 'cab' + } + }]; + try { + await getSchema(documents, { distinctFieldsAbortThreshold: 4 }); + assert.fail('Analysis did not throw'); + } catch (error) { + assert.strictEqual((error as Error).message, 'Schema analysis aborted: Fields count above 4'); + } + }); }); }); From 3da167f4434ef58d45959f33f58fc72150b2a02c Mon Sep 17 00:00:00 2001 From: Paula Stachova Date: Tue, 18 Feb 2025 17:14:22 +0100 Subject: [PATCH 3/4] another test --- test/bloated.test.ts | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/bloated.test.ts b/test/bloated.test.ts index 55b8fc1..8c96793 100644 --- a/test/bloated.test.ts +++ b/test/bloated.test.ts @@ -87,5 +87,27 @@ describe('bloated documents', function() { assert.strictEqual((error as Error).message, 'Schema analysis aborted: Fields count above 4'); } }); + + it('does not count the same field in different documents', async function() { + const documents = [{ + field1: { + field2: { + field3: 'abc' + } + } + }, { + field1: { + field2: { + field3: 'bca' + } + } + }]; + try { + await getSchema(documents, { distinctFieldsAbortThreshold: 4 }); + assert.ok('Analysis finished'); + } catch (error) { + assert.fail('Analysis aborted unexpectedly'); + } + }); }); }); From e5f92d7b143777635bf3d35a13bd207aa7bd6025 Mon Sep 17 00:00:00 2001 From: Paula Stachova Date: Wed, 19 Feb 2025 09:58:25 +0100 Subject: [PATCH 4/4] comment --- src/schema-analyzer.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/schema-analyzer.ts b/src/schema-analyzer.ts index 860aaa8..631cefa 100644 --- a/src/schema-analyzer.ts +++ b/src/schema-analyzer.ts 
@@ -167,6 +167,8 @@ type AllSchemaParseOptions = { storeValues: boolean; signal?: AbortSignal; storedValuesLengthLimit: number; + /** Complexity limit: + * The analysis will be aborted if the threshold is exceeded. */ distinctFieldsAbortThreshold?: number; }; export type SchemaParseOptions = Partial<AllSchemaParseOptions>;