diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 086160f..f2f2558 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -18,4 +18,5 @@ jobs:
           base_url: https://docs.sqlitecloud.io/docs/
           database_name: aisearch-action-test.sqlite
           # only few files for testing
-          source_files: docs/sqlite-cloud/sdks/php
\ No newline at end of file
+          source_files: docs/sqlite-cloud/sdks/js
+          only_extensions: "md"
\ No newline at end of file
diff --git a/action.yaml b/action.yaml
index 69ea677..5f4946c 100644
--- a/action.yaml
+++ b/action.yaml
@@ -15,7 +15,15 @@ inputs:
   source_files:
     description: The path of the files, by default it will parse every file recursively starting from the working directory.
     required: false
-    default: $(pwd)
+    default: "./"
+  only_extensions:
+    description: Comma-separated list of file extensions to include (e.g., "md,txt,html"). If not set, all supported file types are included.
+    required: false
+    default: ""
+  exclude_extensions:
+    description: Comma-separated list of file extensions to exclude (e.g., "js,jsx"). If not set, no file types are excluded.
+    required: false
+    default: ""
   hf_model_id:
     description: The Hugging Face model ID to use for generating embeddings.
     required: false
@@ -79,7 +87,9 @@ runs:
         sqlite-rag add \
           --recursive "${{ inputs.source_files }}" \
           --metadata '{"base_url": "${{ inputs.base_url }}"}' \
-          --relative-paths
+          --relative-paths \
+          --only "${{ inputs.only_extensions }}" \
+          --exclude "${{ inputs.exclude_extensions }}"
       shell: bash
 
     - name: Upload the database to SQLite Cloud
diff --git a/search_edge_function_template/aisearch-docs.js b/search_edge_function_template/aisearch-docs.js
index b790623..c703af0 100644
--- a/search_edge_function_template/aisearch-docs.js
+++ b/search_edge_function_template/aisearch-docs.js
@@ -16,10 +16,10 @@ const sqliteAIBaseUrl = "https://aiserver.vital-rhino.eks.euc1.ryujaz.sqlite.clo
 const sqliteAIAPI = "/v1/ai/embeddings"
 //-----------------------
 
-const requestid = request.params.requestid;
 const query = request.params.query;
+const limit = Math.min(Math.max(Number.parseInt(request.params.limit, 10) || 10, 1), 100); // Number of top results to return (default 10, clamped to 1-100 so a negative value cannot disable the SQL LIMIT)
 
-// get embedding from sqlite-ai-server
+// Get embedding from sqlite-ai-server
 const data = {"text": query };
 const response = await fetch(sqliteAIBaseUrl + sqliteAIAPI, {
   method: "POST",
@@ -36,26 +36,21 @@ if (!response.ok) {
 const result = await response.json();
 const query_embedding = result.data.embedding;
 
-// clean query for full-text search
+// Clean query for full-text search
 const query_fts = (query.toLowerCase().match(/\b\w+\b/g) || []).join(" ") + "*";
 
-// --- TEST ---
-//const test_embedding = await connection.sql('SELECT embedding FROM chunks LIMIT 1;');
-//const query_embedding = test_embedding[0].embedding;
-// ------------
-
 // Vector configuration must match the embedding parameters used during database generation
 await connection.sql("SELECT vector_init('chunks', 'embedding', 'type=INT8,dimension=768,distance=cosine')");
 
 const res = await connection.sql(
   `
-    -- sqlite-vector KNN vector search results
+  -- sqlite-vector KNN vector search results
   WITH vec_matches AS (
     SELECT
       v.rowid AS chunk_id,
       row_number() OVER (ORDER BY v.distance) AS rank_number,
       v.distance
-    FROM vector_quantize_scan('chunks', 'embedding', ?, 10) AS v
+    FROM vector_quantize_scan('chunks', 'embedding', ?, ?) AS v
   ),
   -- Full-text search results
   fts_matches AS (
@@ -65,7 +60,7 @@ const res = await connection.sql(
       rank AS score
     FROM chunks_fts
     WHERE chunks_fts MATCH ?
-    LIMIT 10
+    LIMIT ?
   ),
   -- combine FTS5 + vector search results with RRF
   matches AS (
@@ -84,28 +79,70 @@ const res = await connection.sql(
     FULL OUTER JOIN fts_matches
       ON vec_matches.chunk_id = fts_matches.chunk_id
   )
-    SELECT
-      documents.id,
-      documents.uri,
-      documents.content as document_content,
-      documents.metadata,
-      chunks.content AS snippet,
-      vec_rank,
-      fts_rank,
-      combined_rank,
-      vec_distance,
-      fts_score
-    FROM matches
-    JOIN chunks ON chunks.id = matches.chunk_id
-    JOIN documents ON documents.id = chunks.document_id
+  SELECT
+    documents.id,
+    documents.uri,
+    documents.content as document_content,
+    documents.metadata,
+    chunks.content AS snippet,
+    vec_rank,
+    fts_rank,
+    combined_rank,
+    vec_distance,
+    fts_score
+  FROM matches
+  JOIN chunks ON chunks.id = matches.chunk_id
+  JOIN documents ON documents.id = chunks.document_id
   ORDER BY combined_rank DESC
   ;
-  `, query_embedding, query_fts)
+  `, query_embedding, limit, query_fts, limit)
+
+// The query may return several matching chunks for the same document.
+// We want to return one result per document, so we group by document id and keep
+// the chunk with the highest combined_rank as that document's snippet.
+const documentsChunk = new Map();
+res.forEach(item => {
+  if (!documentsChunk.has(item.id) || item.combined_rank > documentsChunk.get(item.id).combined_rank) {
+    documentsChunk.set(item.id, item);
+  }
+});
+const topResults = Array.from(documentsChunk.values()).slice(0, limit);
 
+// ----- URLs for results -----
+// Customize this section based on how URLs should be constructed for your documents.
+// This example uses 'base_url' from metadata and 'slug' if available, otherwise derives from URI.
+// ----------------------------
+const resultsWithUrls = topResults
+  .map(item => {
+    const metadata = JSON.parse(item.metadata || "{}"); // a missing/NULL metadata column is treated as an empty object
+    const baseUrl = metadata.base_url ?? ""; // avoid a literal "undefined" prefix when base_url is absent
+    const slug = metadata.extracted?.slug;
+    const uri = item.uri;
+
+    let fullUrl;
+    if (slug) {
+      fullUrl = `${baseUrl}${slug}`;
+    } else {
+      const uriWithoutExtension = uri
+        .toLowerCase()
+        .replace(/\.mdx?$/i, ''); // strip a trailing .md / .mdx extension
+      fullUrl = `${baseUrl}${uriWithoutExtension}`;
+    }
+
+    return {
+      id: item.id,
+      url: fullUrl,
+      title: metadata.extracted?.title || metadata.generated?.title,
+      snippet: item.snippet,
+    };
+  });
 
 return {
   data: {
-    search: res,
-    requestid: requestid
+    /**
+     * @type {Array<{id: number, url: string, title: string, snippet: string}>}
+     * One entry per document: constructed URL, title (may be undefined if metadata has none), and best-matching snippet.
+     */
+    search: resultsWithUrls
   }
 }