diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 39572d7..8dfb101 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -43,7 +43,16 @@ jobs: continue-on-error: true - name: Build libraries + id: build run: npx nx affected -t build + continue-on-error: true + + - name: Retry build with cache reset + if: steps.build.outcome == 'failure' + run: | + echo "Build failed, resetting NX cache and retrying..." + npx nx reset + npx nx affected -t build --skip-nx-cache - name: Test libraries run: npx nx affected -t test --passWithNoTests diff --git a/docs/draft/docs/libraries/vectoriadb.mdx b/docs/draft/docs/libraries/vectoriadb.mdx index cb41ab5..8e58a41 100644 --- a/docs/draft/docs/libraries/vectoriadb.mdx +++ b/docs/draft/docs/libraries/vectoriadb.mdx @@ -53,6 +53,12 @@ const db = new VectoriaDB({ Uses Hugging Face transformers for higher quality embeddings. +**Note:** Transformer embeddings require installing the optional dependency: + +```bash +npm install @huggingface/transformers +``` + ```typescript const db = new VectoriaDB({ embedding: 'transformer', diff --git a/docs/live/docs/libraries/vectoriadb.mdx b/docs/live/docs/libraries/vectoriadb.mdx index cb41ab5..8e58a41 100644 --- a/docs/live/docs/libraries/vectoriadb.mdx +++ b/docs/live/docs/libraries/vectoriadb.mdx @@ -53,6 +53,12 @@ const db = new VectoriaDB({ Uses Hugging Face transformers for higher quality embeddings. 
+**Note:** Transformer embeddings require installing the optional dependency: + +```bash +npm install @huggingface/transformers +``` + ```typescript const db = new VectoriaDB({ embedding: 'transformer', diff --git a/libs/enclave-vm/eslint.config.mjs b/libs/enclave-vm/eslint.config.mjs index c334bc0..e4b94fd 100644 --- a/libs/enclave-vm/eslint.config.mjs +++ b/libs/enclave-vm/eslint.config.mjs @@ -9,6 +9,7 @@ export default [ 'error', { ignoredFiles: ['{projectRoot}/eslint.config.{js,cjs,mjs,ts,cts,mts}'], + ignoredDependencies: ['@huggingface/transformers'], // Optional peer dependency loaded dynamically }, ], }, diff --git a/libs/enclave-vm/package.json b/libs/enclave-vm/package.json index 0b1d8ff..cd498bc 100644 --- a/libs/enclave-vm/package.json +++ b/libs/enclave-vm/package.json @@ -42,11 +42,8 @@ "astring": "1.9.0", "zod": "^4.1.13" }, - "optionalDependencies": { - "@huggingface/transformers": "^3.0.0" - }, "peerDependencies": { - "@huggingface/transformers": "^3.0.0" + "@huggingface/transformers": "^3.2.2" }, "peerDependenciesMeta": { "@huggingface/transformers": { diff --git a/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts b/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts index 002f28d..c564496 100644 --- a/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts +++ b/libs/enclave-vm/src/scoring/scorers/local-llm.scorer.ts @@ -103,8 +103,10 @@ export class LocalLlmScorer extends BaseScorer { */ private async _initialize(): Promise<void> { try { - // Dynamic import to avoid bundling issues - const { pipeline } = await import('@huggingface/transformers'); + // Dynamic import using Function to avoid TypeScript checking for the optional dependency + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const transformers = await (Function('return import("@huggingface/transformers")')() as Promise<any>); + const { pipeline } = transformers; // Use feature-extraction pipeline for both modes // (classification mode uses embeddings + heuristics,
similarity mode uses embeddings + VectoriaDB) diff --git a/libs/vectoriadb/eslint.config.mjs b/libs/vectoriadb/eslint.config.mjs index 5dccf30..5348377 100644 --- a/libs/vectoriadb/eslint.config.mjs +++ b/libs/vectoriadb/eslint.config.mjs @@ -24,6 +24,7 @@ export default [ 'error', { ignoredFiles: ['{projectRoot}/eslint.config.{js,cjs,mjs,ts,cts,mts}'], + ignoredDependencies: ['@huggingface/transformers'], // Optional peer dependency loaded dynamically }, ], }, diff --git a/libs/vectoriadb/jest.setup.ts b/libs/vectoriadb/jest.setup.ts index 4e9916f..72e44d6 100644 --- a/libs/vectoriadb/jest.setup.ts +++ b/libs/vectoriadb/jest.setup.ts @@ -1,79 +1,89 @@ /** * Jest setup file for vectoria tests - * Mocks the transformers.js library to avoid ONNX Runtime issues in test environment + * Injects a mock transformers module to avoid ONNX Runtime issues in test environment */ -// Mock the transformers pipeline -jest.mock('@huggingface/transformers', () => { - // Helper to extract and normalize words from text - const extractWords = (text: string): string[] => { - return text - .toLowerCase() - .replace(/[^\w\s]/g, '') - .split(/\s+/) - .filter((word) => word.length > 2); // Filter out short words like "a", "an", "in" - }; +import { EmbeddingService } from './src/embedding.service'; - // Normalize word forms to their root (simple stemming) - const normalizeWord = (word: string): string => { - // Simple stemming - remove common suffixes - return word.replace(/ing$/, '').replace(/s$/, '').replace(/ed$/, '').replace(/er$/, ''); - }; +// Helper to extract and normalize words from text +const extractWords = (text: string): string[] => { + return text + .toLowerCase() + .replace(/[^\w\s]/g, '') + .split(/\s+/) + .filter((word) => word.length > 2); // Filter out short words like "a", "an", "in" +}; - // Create a mock pipeline function that returns consistent embeddings - const createMockPipeline = () => { - return async (text: string | string[]) => { - const textStr = 
text.toString(); - const words = extractWords(textStr); - const normalizedWords = words.map(normalizeWord); +// Normalize word forms to their root (simple stemming) +const normalizeWord = (word: string): string => { + // Simple stemming - remove common suffixes + return word.replace(/ing$/, '').replace(/s$/, '').replace(/ed$/, '').replace(/er$/, ''); +}; - // Create a 384-dimensional embedding (matching all-MiniLM-L6-v2) - const embedding = new Float32Array(384); +// Create a mock pipeline function that returns consistent embeddings +const createMockPipeline = () => { + return async (text: string | string[]) => { + const textStr = text.toString(); + const words = extractWords(textStr); + const normalizedWords = words.map(normalizeWord); - // Each normalized word contributes to specific dimensions - // This ensures that texts with overlapping words have high similarity - normalizedWords.forEach((word) => { - // Calculate which dimensions this word affects - const wordHash = word.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); + // Create a 384-dimensional embedding (matching all-MiniLM-L6-v2) + const embedding = new Float32Array(384); - // Each word contributes to ~50 dimensions centered around its hash position - for (let offset = -25; offset < 25; offset++) { - const dim = (wordHash + offset) % 384; - // Use Gaussian-like contribution - const contribution = Math.exp(-(offset * offset) / 100); - embedding[dim] += contribution; - } - }); + // Each normalized word contributes to specific dimensions + // This ensures that texts with overlapping words have high similarity + normalizedWords.forEach((word) => { + // Calculate which dimensions this word affects + const wordHash = word.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); - // Add small random-like component for uniqueness - for (let i = 0; i < 384; i++) { - // Deterministic "noise" based on text length and position - const noise = Math.sin(textStr.length * 100 + i) * 0.01; - 
embedding[i] += noise; + // Each word contributes to ~50 dimensions centered around its hash position + for (let offset = -25; offset < 25; offset++) { + const dim = (wordHash + offset) % 384; + // Use Gaussian-like contribution + const contribution = Math.exp(-(offset * offset) / 100); + embedding[dim] += contribution; } + }); - // Normalize the embedding to unit length (as transformers.js does) - let norm = 0; - for (let i = 0; i < embedding.length; i++) { - norm += embedding[i] * embedding[i]; - } - norm = Math.sqrt(norm); + // Add small random-like component for uniqueness + for (let i = 0; i < 384; i++) { + // Deterministic "noise" based on text length and position + const noise = Math.sin(textStr.length * 100 + i) * 0.01; + embedding[i] += noise; + } + + // Normalize the embedding to unit length (as transformers.js does) + let norm = 0; + for (let i = 0; i < embedding.length; i++) { + norm += embedding[i] * embedding[i]; + } + norm = Math.sqrt(norm); - if (norm > 0) { - for (let i = 0; i < embedding.length; i++) { - embedding[i] /= norm; - } + if (norm > 0) { + for (let i = 0; i < embedding.length; i++) { + embedding[i] /= norm; } + } - return { - data: embedding, - }; + return { + data: embedding, }; }; +}; - return { - pipeline: jest.fn(async () => { - return createMockPipeline(); - }), - }; +// Create mock transformers module +const mockTransformersModule = { + pipeline: jest.fn(async () => { + return createMockPipeline(); + }), +}; + +// Inject mock transformers module before all tests +beforeAll(() => { + EmbeddingService.setTransformersModule(mockTransformersModule); +}); + +// Clear the mock after all tests +afterAll(() => { + EmbeddingService.clearTransformersModule(); }); diff --git a/libs/vectoriadb/package.json b/libs/vectoriadb/package.json index 44ed5ed..ad7df84 100644 --- a/libs/vectoriadb/package.json +++ b/libs/vectoriadb/package.json @@ -37,9 +37,14 @@ "default": "./dist/src/index.js" } }, - "dependencies": { + "peerDependencies": { 
"@huggingface/transformers": "^3.2.2" }, + "peerDependenciesMeta": { + "@huggingface/transformers": { + "optional": true + } + }, "devDependencies": { "typescript": "^5.9.3" } diff --git a/libs/vectoriadb/src/embedding.service.ts b/libs/vectoriadb/src/embedding.service.ts index ae2ea15..0850ff9 100644 --- a/libs/vectoriadb/src/embedding.service.ts +++ b/libs/vectoriadb/src/embedding.service.ts @@ -1,10 +1,34 @@ -import { pipeline } from '@huggingface/transformers'; -import { EmbeddingError } from './errors'; +import { EmbeddingError, ConfigurationError } from './errors'; /** * Service for generating embeddings using transformers.js + * + * NOTE: This service requires @huggingface/transformers to be installed. + * Install it with: npm install @huggingface/transformers + * + * For a zero-dependency alternative, use TFIDFEmbeddingService instead. */ export class EmbeddingService { + // Static transformers module for dependency injection (used in testing) + private static _transformersModule: any = null; + + /** + * Inject a transformers module (for testing purposes) + * @internal + */ + static setTransformersModule(module: any): void { + EmbeddingService._transformersModule = module; + } + + /** + * Clear the injected transformers module + * @internal + */ + static clearTransformersModule(): void { + EmbeddingService._transformersModule = null; + } + + // Using 'any' because @huggingface/transformers is an optional dependency private pipeline: any = null; private modelName: string; private cacheDir: string; @@ -17,6 +41,30 @@ export class EmbeddingService { this.cacheDir = cacheDir; } + /** + * Dynamically import @huggingface/transformers + * This allows the package to be optional - only loaded when actually used + */ + private async loadTransformers(): Promise<any> { + // Use injected module if available (for testing) + if (EmbeddingService._transformersModule) { + return EmbeddingService._transformersModule.pipeline; + } + + try { + // Dynamic import - package may not be
installed + // Using Function() to bypass TypeScript's static analysis for optional dependency + const transformers = await (Function('return import("@huggingface/transformers")')() as Promise<any>); + return transformers.pipeline; + } catch (_error) { + throw new ConfigurationError( + '@huggingface/transformers is not installed. ' + + 'Install it with: npm install @huggingface/transformers\n' + + 'Or use TFIDFVectoria/TFIDFEmbeddingService for a zero-dependency alternative.', + ); + } + } + /** * Initialize the embedding model */ @@ -36,8 +84,11 @@ private async _initialize(): Promise<void> { try { + // Dynamically load transformers + const pipelineFn = await this.loadTransformers(); + // Create feature extraction pipeline - this.pipeline = await pipeline('feature-extraction', this.modelName, { + this.pipeline = await pipelineFn('feature-extraction', this.modelName, { // Use local models directory to cache models cache_dir: this.cacheDir, // // Don't require progress bars in production @@ -54,6 +105,9 @@ this.isInitialized = true; } catch (error) { this.initializationPromise = null; + if (error instanceof ConfigurationError) { + throw error; + } throw new EmbeddingError( `Failed to initialize embedding model: ${error instanceof Error ? error.message : String(error)}`, error instanceof Error ? error : undefined, diff --git a/package.json b/package.json index b3f81d2..b31826e 100644 --- a/package.json +++ b/package.json @@ -19,9 +19,6 @@ "*.{json,css,scss,md,html,yml,yaml}": "prettier --write" }, "private": true, - "dependencies": { - "@huggingface/transformers": "^3.2.2" - }, "devDependencies": { "@eslint/js": "^9.8.0", "@mdx-js/mdx": "^3.1.1",