@@ -26,7 +26,7 @@ CodeGraph now writes Ollama/LM Studio embeddings directly into SurrealDB’s ded
 ```bash
 export CODEGRAPH_EMBEDDING_PROVIDER=ollama
 export CODEGRAPH_EMBEDDING_MODEL=qwen3-embedding:0.6b # or all-minilm, qwen3-embedding:4b, embeddinggemma, etc.
-export CODEGRAPH_EMBEDDING_DIMENSION=1024 # 384, 768, 1024, 2048, or 4096
+export CODEGRAPH_EMBEDDING_DIMENSION=1024 # 384, 768, 1024, 1536, 2048, 2560, 3072, or 4096 dimensions supported

 # Optional local reranking (LM Studio exposes an OpenAI-compatible reranker endpoint)
 export CODEGRAPH_RERANKING_PROVIDER=lmstudio
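Before indexing, it's worth confirming that the model you pick actually emits vectors of the size you export. A minimal check against a local Ollama instance (a sketch: it assumes Ollama on its default port and the `jq` CLI; `/api/embeddings` is Ollama's own endpoint, not part of CodeGraph):

```bash
# Request one embedding and count its components; the number printed should
# match CODEGRAPH_EMBEDDING_DIMENSION (1024 for qwen3-embedding:0.6b).
curl -s http://localhost:11434/api/embeddings \
  -d '{"model": "qwen3-embedding:0.6b", "prompt": "dimension check"}' \
  | jq '.embedding | length'
```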
@@ -170,7 +170,7 @@ Pick the setup that matches your needs:

 **Providers:**
 - **Embeddings:** Jina (you get 10 million tokens for free just for creating an account!)
-- **LLM:** Anthropic Claude or OpenAI GPT-5-*
+- **LLM:** Anthropic Claude or OpenAI GPT-5.1-*
 - **Backend:** SurrealDB graph database (you get a free cloud instance up to 1 GB, or run it completely locally!)

 **Pros:** ✅ Best quality, ✅ Fast, ✅ 1M context (sonnet[1m])
@@ -384,6 +384,7 @@ dimension = 2048
 enabled = true
 provider = "openai"
 model = "gpt-5-codex-mini"
+context_window = 200000
 openai_api_key = "sk-..."
 max_completion_token = 128000
 reasoning_effort = "medium" # reasoning models: "minimal", "medium", "high"
@@ -403,6 +404,8 @@ jina_reranking_model = "jina-reranker-v3"
 enabled = true
 provider = "anthropic"
 model = "claude-haiku"
+context_window = 200000
+max_completion_tokens = 25000
 anthropic_api_key = "sk-ant-..."
 ```

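If you hand-edit these blocks, a quick parse check catches TOML syntax slips before CodeGraph ever loads the file. A small sketch (assumes Python 3.11+ for the stdlib `tomllib`; the `codegraph.toml` filename is illustrative, point it at wherever your config lives):

```bash
# Fails loudly on malformed TOML (stray quote, duplicate key, bad table header).
python3 -c 'import sys, tomllib; tomllib.load(open(sys.argv[1], "rb")); print("config parses OK")' codegraph.toml
```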
@@ -412,7 +415,7 @@ anthropic_api_key = "sk-ant-..."
 provider = "openai" # or "jina"
 model = "text-embedding-3-small"
 openai_api_key = "sk-..."
-dimension = 2048
+dimension = 1536 # text-embedding-3-small's native output size

 [llm]
 enabled = true
@@ -472,6 +475,7 @@ dimension = 384
 enabled = true
 provider = "anthropic" # Best quality for analysis
 model = "sonnet[1m]"
+context_window = 1000000
 anthropic_api_key = "sk-ant-..."
 ```

@@ -522,7 +526,7 @@ model = "haiku"
 anthropic_api_key = "sk-ant-..."
 context_window = 200000
 temperature = 0.1
-max_completion_token = 4096
+max_completion_token = 25000

 [performance]
 num_threads = 0 # 0 = auto-detect
@@ -663,8 +667,8 @@ flowchart TD

     D -->|< 50K tokens| E1[Small Tier<br/>TERSE prompts<br/>5 max steps<br/>2,048 tokens]
     D -->|50K-150K tokens| E2[Medium Tier<br/>BALANCED prompts<br/>10 max steps<br/>4,096 tokens]
-    D -->|150K-500K tokens| E3[Large Tier<br/>DETAILED prompts<br/>15 max steps<br/>8,192 tokens]
-    D -->|> 500K tokens| E4[Massive Tier<br/>EXPLORATORY prompts<br/>20 max steps<br/>16,384 tokens]
+    D -->|150K-400K tokens| E3[Large Tier<br/>DETAILED prompts<br/>15 max steps<br/>8,192 tokens]
+    D -->|> 400K tokens| E4[Massive Tier<br/>EXPLORATORY prompts<br/>20 max steps<br/>16,384 tokens]

     E1 & E2 & E3 & E4 --> F[Load Tier-Specific<br/>System Prompt]
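The revised 400K boundary decides which prompt tier a configured model lands in. A hypothetical restatement of that bucketing in shell (not CodeGraph's actual code; it just mirrors the thresholds and budgets from the flowchart above):

```bash
# Map a [llm] context_window value to its prompt tier and budgets.
context_window=${1:-200000}

if   [ "$context_window" -lt 50000 ];  then tier="Small";   steps=5;  tokens=2048
elif [ "$context_window" -le 150000 ]; then tier="Medium";  steps=10; tokens=4096
elif [ "$context_window" -le 400000 ]; then tier="Large";   steps=15; tokens=8192
else                                        tier="Massive"; steps=20; tokens=16384
fi

echo "$tier tier: $steps max steps, $tokens max completion tokens"
```

For example, the `context_window = 1000000` set for sonnet[1m] above selects the Massive tier; before this change, anything between 400K and 500K would still have been treated as Large.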
@@ -712,10 +716,10 @@ flowchart TD
 **Key Components:**

 1. **Tier Detection**: Automatically adapts prompt complexity based on the LLM's context window
-   - Small (<50K): Fast, terse responses for limited context models f.ex. local
+   - Small (<50K): Fast, terse responses for limited-context models, e.g. local gemma3
    - Medium (50K-150K): Balanced analysis for Claude Haiku, gpt-5.1-codex-mini
-   - Large (150K-400K): Detailed exploration for Sonnet, Opus, gpt-5.1
-   - Massive (>400K): Comprehensive deep-dives for grok-4-fast, gemini-2.5-pro, Sonnet[1m]
+   - Large (150K-400K): Detailed exploration for Sonnet, Opus, gpt-5.1, qwen3:4b
+   - Massive (>400K): Comprehensive deep-dives for grok-4-fast, gemini-3.0-pro, Sonnet[1m]

 2. **Multi-Step Reasoning**: ReAct pattern with tier-specific limits
    - Each step can call internal graph analysis tools