From 9938b106bf98a9b8a612b751bb0481a7ce9956e8 Mon Sep 17 00:00:00 2001 From: David Antoon Date: Thu, 11 Dec 2025 19:41:34 +0200 Subject: [PATCH 1/7] feat: make @huggingface/transformers optional and improve error handling in EmbeddingService --- libs/enclave-vm/package.json | 3 -- libs/vectoriadb/package.json | 9 ++++-- libs/vectoriadb/src/embedding.service.ts | 36 ++++++++++++++++++++++-- package.json | 3 -- 4 files changed, 40 insertions(+), 11 deletions(-) diff --git a/libs/enclave-vm/package.json b/libs/enclave-vm/package.json index 0b1d8ff..1b306c9 100644 --- a/libs/enclave-vm/package.json +++ b/libs/enclave-vm/package.json @@ -42,9 +42,6 @@ "astring": "1.9.0", "zod": "^4.1.13" }, - "optionalDependencies": { - "@huggingface/transformers": "^3.0.0" - }, "peerDependencies": { "@huggingface/transformers": "^3.0.0" }, diff --git a/libs/vectoriadb/package.json b/libs/vectoriadb/package.json index 44ed5ed..18fda94 100644 --- a/libs/vectoriadb/package.json +++ b/libs/vectoriadb/package.json @@ -37,8 +37,13 @@ "default": "./dist/src/index.js" } }, - "dependencies": { - "@huggingface/transformers": "^3.2.2" + "peerDependencies": { + "@huggingface/transformers": "^3.0.0" + }, + "peerDependenciesMeta": { + "@huggingface/transformers": { + "optional": true + } }, "devDependencies": { "typescript": "^5.9.3" diff --git a/libs/vectoriadb/src/embedding.service.ts b/libs/vectoriadb/src/embedding.service.ts index ae2ea15..0646ad2 100644 --- a/libs/vectoriadb/src/embedding.service.ts +++ b/libs/vectoriadb/src/embedding.service.ts @@ -1,8 +1,15 @@ -import { pipeline } from '@huggingface/transformers'; -import { EmbeddingError } from './errors'; +import { EmbeddingError, ConfigurationError } from './errors'; + +// Dynamic import type for @huggingface/transformers +type PipelineFunction = typeof import('@huggingface/transformers').pipeline; /** * Service for generating embeddings using transformers.js + * + * NOTE: This service requires @huggingface/transformers to be installed. + * Install it with: npm install @huggingface/transformers + * + * For a zero-dependency alternative, use TFIDFEmbeddingService instead. */ export class EmbeddingService { private pipeline: any = null; @@ -17,6 +24,23 @@ export class EmbeddingService { this.cacheDir = cacheDir; } + /** + * Dynamically import @huggingface/transformers + * This allows the package to be optional - only loaded when actually used + */ + private async loadTransformers(): Promise { + try { + const transformers = await import('@huggingface/transformers'); + return transformers.pipeline; + } catch (error) { + throw new ConfigurationError( + '@huggingface/transformers is not installed. ' + + 'Install it with: npm install @huggingface/transformers\n' + + 'Or use TFIDFVectoria/TFIDFEmbeddingService for a zero-dependency alternative.', + ); + } + } + /** * Initialize the embedding model */ @@ -36,8 +60,11 @@ export class EmbeddingService { private async _initialize(): Promise { try { + // Dynamically load transformers + const pipelineFn = await this.loadTransformers(); + // Create feature extraction pipeline - this.pipeline = await pipeline('feature-extraction', this.modelName, { + this.pipeline = await pipelineFn('feature-extraction', this.modelName, { // Use local models directory to cache models cache_dir: this.cacheDir, // // Don't require progress bars in production @@ -54,6 +81,9 @@ export class EmbeddingService { this.isInitialized = true; } catch (error) { this.initializationPromise = null; + if (error instanceof ConfigurationError) { + throw error; + } throw new EmbeddingError( `Failed to initialize embedding model: ${error instanceof Error ? error.message : String(error)}`, error instanceof Error ? error : undefined, diff --git a/package.json b/package.json index b3f81d2..b31826e 100644 --- a/package.json +++ b/package.json @@ -19,9 +19,6 @@ "*.{json,css,scss,md,html,yml,yaml}": "prettier --write" }, "private": true, - "dependencies": { - "@huggingface/transformers": "^3.2.2" - }, "devDependencies": { "@eslint/js": "^9.8.0", "@mdx-js/mdx": "^3.1.1", From d530d9d1c7a1827d7ccbf44acbda1d5e2ee7bb09 Mon Sep 17 00:00:00 2001 From: David Antoon Date: Thu, 11 Dec 2025 20:09:46 +0200 Subject: [PATCH 2/7] fix: update @huggingface/transformers peer dependency to ^3.2.2 and refine pipeline type --- libs/vectoriadb/package.json | 2 +- libs/vectoriadb/src/embedding.service.ts | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/vectoriadb/package.json b/libs/vectoriadb/package.json index 18fda94..ad7df84 100644 --- a/libs/vectoriadb/package.json +++ b/libs/vectoriadb/package.json @@ -38,7 +38,7 @@ } }, "peerDependencies": { - "@huggingface/transformers": "^3.0.0" + "@huggingface/transformers": "^3.2.2" }, "peerDependenciesMeta": { "@huggingface/transformers": { diff --git a/libs/vectoriadb/src/embedding.service.ts b/libs/vectoriadb/src/embedding.service.ts index 0646ad2..a5cbbf1 100644 --- a/libs/vectoriadb/src/embedding.service.ts +++ b/libs/vectoriadb/src/embedding.service.ts @@ -2,6 +2,7 @@ import { EmbeddingError, ConfigurationError } from './errors'; // Dynamic import type for @huggingface/transformers type PipelineFunction = typeof import('@huggingface/transformers').pipeline; +type FeatureExtractionPipeline = Awaited>; /** * Service for generating embeddings using transformers.js @@ -12,7 +13,7 @@ type PipelineFunction = typeof import('@huggingface/transformers').pipeline; * For a zero-dependency alternative, use TFIDFEmbeddingService instead. */ export class EmbeddingService { - private pipeline: any = null; + private pipeline: FeatureExtractionPipeline | null = null; private modelName: string; private cacheDir: string; private dimensions = 384; // default for all-MiniLM-L6-v2 From 84d49b68c4547d2d2fdd6807986d6c18fdaf0013 Mon Sep 17 00:00:00 2001 From: David Antoon Date: Thu, 11 Dec 2025 20:10:18 +0200 Subject: [PATCH 3/7] fix: change pipeline type to 'any' in EmbeddingService for compatibility with complex return types --- libs/vectoriadb/src/embedding.service.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/vectoriadb/src/embedding.service.ts b/libs/vectoriadb/src/embedding.service.ts index a5cbbf1..06519f3 100644 --- a/libs/vectoriadb/src/embedding.service.ts +++ b/libs/vectoriadb/src/embedding.service.ts @@ -2,7 +2,6 @@ import { EmbeddingError, ConfigurationError } from './errors'; // Dynamic import type for @huggingface/transformers type PipelineFunction = typeof import('@huggingface/transformers').pipeline; -type FeatureExtractionPipeline = Awaited>; /** * Service for generating embeddings using transformers.js @@ -13,7 +12,9 @@ type FeatureExtractionPipeline = Awaited>; * For a zero-dependency alternative, use TFIDFEmbeddingService instead. */ export class EmbeddingService { - private pipeline: FeatureExtractionPipeline | null = null; + // Using 'any' because the pipeline return type is a complex union that TypeScript cannot represent + // eslint-disable-next-line @typescript-eslint/no-explicit-any + private pipeline: any = null; private modelName: string; private cacheDir: string; private dimensions = 384; // default for all-MiniLM-L6-v2 From fddc17df7ffc2c1dd898a81faff213f76cafcb87 Mon Sep 17 00:00:00 2001 From: David Antoon Date: Thu, 11 Dec 2025 20:10:30 +0200 Subject: [PATCH 4/7] fix: change pipeline type to 'any' in EmbeddingService for compatibility with complex return types --- docs/draft/docs/libraries/vectoriadb.mdx | 6 ++++++ docs/live/docs/libraries/vectoriadb.mdx | 6 ++++++ libs/enclave-vm/package.json | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/draft/docs/libraries/vectoriadb.mdx b/docs/draft/docs/libraries/vectoriadb.mdx index cb41ab5..8e58a41 100644 --- a/docs/draft/docs/libraries/vectoriadb.mdx +++ b/docs/draft/docs/libraries/vectoriadb.mdx @@ -53,6 +53,12 @@ const db = new VectoriaDB({ Uses Hugging Face transformers for higher quality embeddings. +**Note:** Transformer embeddings require installing the optional dependency: + +```bash +npm install @huggingface/transformers +``` + ```typescript const db = new VectoriaDB({ embedding: 'transformer', diff --git a/docs/live/docs/libraries/vectoriadb.mdx b/docs/live/docs/libraries/vectoriadb.mdx index cb41ab5..8e58a41 100644 --- a/docs/live/docs/libraries/vectoriadb.mdx +++ b/docs/live/docs/libraries/vectoriadb.mdx @@ -53,6 +53,12 @@ const db = new VectoriaDB({ Uses Hugging Face transformers for higher quality embeddings. +**Note:** Transformer embeddings require installing the optional dependency: + +```bash +npm install @huggingface/transformers +``` + ```typescript const db = new VectoriaDB({ embedding: 'transformer', diff --git a/libs/enclave-vm/package.json b/libs/enclave-vm/package.json index 1b306c9..cd498bc 100644 --- a/libs/enclave-vm/package.json +++ b/libs/enclave-vm/package.json @@ -43,7 +43,7 @@ "zod": "^4.1.13" }, "peerDependencies": { - "@huggingface/transformers": "^3.0.0" + "@huggingface/transformers": "^3.2.2" }, "peerDependenciesMeta": { "@huggingface/transformers": { From 7d2927760acd588ad36dafefa0c866f74e70476c Mon Sep 17 00:00:00 2001 From: David Antoon Date: Fri, 12 Dec 2025 01:23:27 +0200 Subject: [PATCH 5/7] fix: add retry mechanism for build step in CI pipeline with cache reset --- .github/workflows/push.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 39572d7..8dfb101 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -43,7 +43,16 @@ jobs: continue-on-error: true - name: Build libraries + id: build run: npx nx affected -t build + continue-on-error: true + + - name: Retry build with cache reset + if: steps.build.outcome == 'failure' + run: | + echo "Build failed, resetting NX cache and retrying..." + npx nx reset + npx nx affected -t build --skip-nx-cache - name: Test libraries run: npx nx affected -t test --passWithNoTests From 1419b7a30315816e68fcaaa109a2d027192a88f4 Mon Sep 17 00:00:00 2001 From: David Antoon Date: Fri, 12 Dec 2025 01:45:45 +0200 Subject: [PATCH 6/7] fix: dynamically import @huggingface/transformers to handle optional dependency --- libs/enclave-vm/eslint.config.mjs | 1 + .../src/scoring/scorers/local-llm.scorer.ts | 6 ++++-- libs/vectoriadb/eslint.config.mjs | 1 + libs/vectoriadb/src/embedding.service.ts | 13 +++++-------- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/libs/enclave-vm/eslint.config.mjs b/libs/enclave-vm/eslint.config.mjs index c334bc0..e4b94fd 100644 --- a/libs/enclave-vm/eslint.config.mjs +++ b/libs/enclave-vm/eslint.config.mjs @@ -9,6 +9,7 @@ export default [ 'error', { ignoredFiles: ['{projectRoot}/eslint.config.{js,cjs,mjs,ts,cts,mts}'], + ignoredDependencies: ['@huggingface/transformers'], // Optional peer dependency loaded dynamically }, ], }, diff --git a/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts b/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts index 002f28d..c564496 100644 --- a/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts +++ b/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts @@ -103,8 +103,10 @@ export class LocalLlmScorer extends BaseScorer { */ private async _initialize(): Promise { try { - // Dynamic import to avoid bundling issues - const { pipeline } = await import('@huggingface/transformers'); + // Dynamic import using Function to avoid TypeScript checking for the optional dependency + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const transformers = await (Function('return import("@huggingface/transformers")')() as Promise); + const { pipeline } = transformers; // Use feature-extraction pipeline for both modes // (classification mode uses embeddings + heuristics, similarity mode uses embeddings + VectoriaDB) diff --git a/libs/vectoriadb/eslint.config.mjs b/libs/vectoriadb/eslint.config.mjs index 5dccf30..5348377 100644 --- a/libs/vectoriadb/eslint.config.mjs +++ b/libs/vectoriadb/eslint.config.mjs @@ -24,6 +24,7 @@ export default [ 'error', { ignoredFiles: ['{projectRoot}/eslint.config.{js,cjs,mjs,ts,cts,mts}'], + ignoredDependencies: ['@huggingface/transformers'], // Optional peer dependency loaded dynamically }, ], }, diff --git a/libs/vectoriadb/src/embedding.service.ts b/libs/vectoriadb/src/embedding.service.ts index 06519f3..2673b50 100644 --- a/libs/vectoriadb/src/embedding.service.ts +++ b/libs/vectoriadb/src/embedding.service.ts @@ -1,8 +1,5 @@ import { EmbeddingError, ConfigurationError } from './errors'; -// Dynamic import type for @huggingface/transformers -type PipelineFunction = typeof import('@huggingface/transformers').pipeline; - /** * Service for generating embeddings using transformers.js * @@ -12,8 +9,7 @@ type PipelineFunction = typeof import('@huggingface/transformers').pipeline; * For a zero-dependency alternative, use TFIDFEmbeddingService instead. */ export class EmbeddingService { - // Using 'any' because the pipeline return type is a complex union that TypeScript cannot represent - // eslint-disable-next-line @typescript-eslint/no-explicit-any + // Using 'any' because @huggingface/transformers is an optional dependency private pipeline: any = null; private modelName: string; private cacheDir: string; @@ -30,11 +26,12 @@ export class EmbeddingService { * Dynamically import @huggingface/transformers * This allows the package to be optional - only loaded when actually used */ - private async loadTransformers(): Promise { + private async loadTransformers(): Promise { try { - const transformers = await import('@huggingface/transformers'); + // Dynamic import - package may not be installed + const transformers = await (Function('return import("@huggingface/transformers")')() as Promise); return transformers.pipeline; - } catch (error) { + } catch (_error) { throw new ConfigurationError( '@huggingface/transformers is not installed. ' + 'Install it with: npm install @huggingface/transformers\n' + From 0f87d06527e6cc892ff0d700a221b9243235ae47 Mon Sep 17 00:00:00 2001 From: David Antoon Date: Fri, 12 Dec 2025 01:55:40 +0200 Subject: [PATCH 7/7] fix: add dependency injection for transformers module in EmbeddingService for testing --- libs/vectoriadb/jest.setup.ts | 130 ++++++++++++----------- libs/vectoriadb/src/embedding.service.ts | 25 +++++ 2 files changed, 95 insertions(+), 60 deletions(-) diff --git a/libs/vectoriadb/jest.setup.ts b/libs/vectoriadb/jest.setup.ts index 4e9916f..72e44d6 100644 --- a/libs/vectoriadb/jest.setup.ts +++ b/libs/vectoriadb/jest.setup.ts @@ -1,79 +1,89 @@ /** * Jest setup file for vectoria tests - * Mocks the transformers.js library to avoid ONNX Runtime issues in test environment + * Injects a mock transformers module to avoid ONNX Runtime issues in test environment */ -// Mock the transformers pipeline -jest.mock('@huggingface/transformers', () => { - // Helper to extract and normalize words from text - const extractWords = (text: string): string[] => { - return text - .toLowerCase() - .replace(/[^\w\s]/g, '') - .split(/\s+/) - .filter((word) => word.length > 2); // Filter out short words like "a", "an", "in" - }; +import { EmbeddingService } from './src/embedding.service'; - // Normalize word forms to their root (simple stemming) - const normalizeWord = (word: string): string => { - // Simple stemming - remove common suffixes - return word.replace(/ing$/, '').replace(/s$/, '').replace(/ed$/, '').replace(/er$/, ''); - }; +// Helper to extract and normalize words from text +const extractWords = (text: string): string[] => { + return text + .toLowerCase() + .replace(/[^\w\s]/g, '') + .split(/\s+/) + .filter((word) => word.length > 2); // Filter out short words like "a", "an", "in" +}; - // Create a mock pipeline function that returns consistent embeddings - const createMockPipeline = () => { - return async (text: string | string[]) => { - const textStr = text.toString(); - const words = extractWords(textStr); - const normalizedWords = words.map(normalizeWord); +// Normalize word forms to their root (simple stemming) +const normalizeWord = (word: string): string => { + // Simple stemming - remove common suffixes + return word.replace(/ing$/, '').replace(/s$/, '').replace(/ed$/, '').replace(/er$/, ''); +}; - // Create a 384-dimensional embedding (matching all-MiniLM-L6-v2) - const embedding = new Float32Array(384); +// Create a mock pipeline function that returns consistent embeddings +const createMockPipeline = () => { + return async (text: string | string[]) => { + const textStr = text.toString(); + const words = extractWords(textStr); + const normalizedWords = words.map(normalizeWord); - // Each normalized word contributes to specific dimensions - // This ensures that texts with overlapping words have high similarity - normalizedWords.forEach((word) => { - // Calculate which dimensions this word affects - const wordHash = word.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); + // Create a 384-dimensional embedding (matching all-MiniLM-L6-v2) + const embedding = new Float32Array(384); - // Each word contributes to ~50 dimensions centered around its hash position - for (let offset = -25; offset < 25; offset++) { - const dim = (wordHash + offset) % 384; - // Use Gaussian-like contribution - const contribution = Math.exp(-(offset * offset) / 100); - embedding[dim] += contribution; - } - }); + // Each normalized word contributes to specific dimensions + // This ensures that texts with overlapping words have high similarity + normalizedWords.forEach((word) => { + // Calculate which dimensions this word affects + const wordHash = word.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); - // Add small random-like component for uniqueness - for (let i = 0; i < 384; i++) { - // Deterministic "noise" based on text length and position - const noise = Math.sin(textStr.length * 100 + i) * 0.01; - embedding[i] += noise; + // Each word contributes to ~50 dimensions centered around its hash position + for (let offset = -25; offset < 25; offset++) { + const dim = (wordHash + offset) % 384; + // Use Gaussian-like contribution + const contribution = Math.exp(-(offset * offset) / 100); + embedding[dim] += contribution; } + }); - // Normalize the embedding to unit length (as transformers.js does) - let norm = 0; - for (let i = 0; i < embedding.length; i++) { - norm += embedding[i] * embedding[i]; - } - norm = Math.sqrt(norm); + // Add small random-like component for uniqueness + for (let i = 0; i < 384; i++) { + // Deterministic "noise" based on text length and position + const noise = Math.sin(textStr.length * 100 + i) * 0.01; + embedding[i] += noise; + } + + // Normalize the embedding to unit length (as transformers.js does) + let norm = 0; + for (let i = 0; i < embedding.length; i++) { + norm += embedding[i] * embedding[i]; + } + norm = Math.sqrt(norm); - if (norm > 0) { - for (let i = 0; i < embedding.length; i++) { - embedding[i] /= norm; - } + if (norm > 0) { + for (let i = 0; i < embedding.length; i++) { + embedding[i] /= norm; } + } - return { - data: embedding, - }; + return { + data: embedding, }; }; +}; - return { - pipeline: jest.fn(async () => { - return createMockPipeline(); - }), - }; +// Create mock transformers module +const mockTransformersModule = { + pipeline: jest.fn(async () => { + return createMockPipeline(); + }), +}; + +// Inject mock transformers module before all tests +beforeAll(() => { + EmbeddingService.setTransformersModule(mockTransformersModule); +}); + +// Clear the mock after all tests +afterAll(() => { + EmbeddingService.clearTransformersModule(); }); diff --git a/libs/vectoriadb/src/embedding.service.ts b/libs/vectoriadb/src/embedding.service.ts index 2673b50..0850ff9 100644 --- a/libs/vectoriadb/src/embedding.service.ts +++ b/libs/vectoriadb/src/embedding.service.ts @@ -9,6 +9,25 @@ import { EmbeddingError, ConfigurationError } from './errors'; * For a zero-dependency alternative, use TFIDFEmbeddingService instead. */ export class EmbeddingService { + // Static transformers module for dependency injection (used in testing) + private static _transformersModule: any = null; + + /** + * Inject a transformers module (for testing purposes) + * @internal + */ + static setTransformersModule(module: any): void { + EmbeddingService._transformersModule = module; + } + + /** + * Clear the injected transformers module + * @internal + */ + static clearTransformersModule(): void { + EmbeddingService._transformersModule = null; + } + // Using 'any' because @huggingface/transformers is an optional dependency private pipeline: any = null; private modelName: string; @@ -27,8 +46,14 @@ export class EmbeddingService { * This allows the package to be optional - only loaded when actually used */ private async loadTransformers(): Promise { + // Use injected module if available (for testing) + if (EmbeddingService._transformersModule) { + return EmbeddingService._transformersModule.pipeline; + } + try { // Dynamic import - package may not be installed + // Using Function() to bypass TypeScript's static analysis for optional dependency const transformers = await (Function('return import("@huggingface/transformers")')() as Promise); return transformers.pipeline; } catch (_error) {