Skip to content

Commit 583fe8b

Browse files
committed
feat: update README and code to support additional embedding dimensions and improve model defaults
1 parent 9f4c00c commit 583fe8b

File tree

6 files changed

+95
-52
lines changed

6 files changed

+95
-52
lines changed

README.md

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ CodeGraph now writes Ollama/LM Studio embeddings directly into SurrealDB’s ded
2626
```bash
2727
export CODEGRAPH_EMBEDDING_PROVIDER=ollama
2828
export CODEGRAPH_EMBEDDING_MODEL=qwen3-embedding:0.6b # or all-minilm, qwen3-embedding:4b, embeddinggemma, etc.
29-
export CODEGRAPH_EMBEDDING_DIMENSION=1024 # 384, 768, 1024, 2048, or 4096
29+
export CODEGRAPH_EMBEDDING_DIMENSION=1024 # 384, 768, 1024, 1536, 2048, 2560, 3072 or 4096 dimensions supported
3030

3131
# Optional local reranking (LM Studio exposes an OpenAI-compatible reranker endpoint)
3232
export CODEGRAPH_RERANKING_PROVIDER=lmstudio
@@ -170,7 +170,7 @@ Pick the setup that matches your needs:
170170

171171
**Providers:**
172172
- **Embeddings:** Jina (You get 10 million tokens for free when you just create an account!)
173-
- **LLM:** Anthropic Claude or OpenAI GPT-5-*
173+
- **LLM:** Anthropic Claude or OpenAI GPT-5.1-*
174174
- **Backend**: SurrealDB graph database (You get a free cloud instance up to 1 GB! Or run it completely locally!)
175175

176176
**Pros:** ✅ Best quality, ✅ Fast, ✅ 1M context (sonnet[1m])
@@ -384,6 +384,7 @@ dimension = 2048
384384
enabled = true
385385
provider = "openai"
386386
model = "gpt-5-codex-mini"
387+
context_window = 200000
387388
openai_api_key = "sk-..."
388389
max_completion_token = 128000
389390
reasoning_effort = "medium" # reasoning models: "minimal", "medium", "high"
@@ -403,6 +404,8 @@ jina_reranking_model = "jina-reranker-v3"
403404
enabled = true
404405
provider = "anthropic"
405406
model = "claude-haiku"
407+
context_window = 200000
408+
max_completion_token = 25000
406409
anthropic_api_key = "sk-ant-..."
407410
```
408411

@@ -412,7 +415,7 @@ anthropic_api_key = "sk-ant-..."
412415
provider = "openai" # or "jina"
413416
model = "text-embedding-3-small"
414417
openai_api_key = "sk-..."
415-
dimension = 2048
418+
dimension = 1536
416419

417420
[llm]
418421
enabled = true
@@ -472,6 +475,7 @@ dimension = 384
472475
enabled = true
473476
provider = "anthropic" # Best quality for analysis
474477
model = "sonnet[1m]"
478+
context_window = 1000000
475479
anthropic_api_key = "sk-ant-..."
476480
```
477481

@@ -522,7 +526,7 @@ model = "haiku"
522526
anthropic_api_key = "sk-ant-..."
523527
context_window = 200000
524528
temperature = 0.1
525-
max_completion_token = 4096
529+
max_completion_token = 25000
526530

527531
[performance]
528532
num_threads = 0 # 0 = auto-detect
@@ -663,8 +667,8 @@ flowchart TD
663667
664668
D -->|< 50K tokens| E1[Small Tier<br/>TERSE prompts<br/>5 max steps<br/>2,048 tokens]
665669
D -->|50K-150K tokens| E2[Medium Tier<br/>BALANCED prompts<br/>10 max steps<br/>4,096 tokens]
666-
D -->|150K-500K tokens| E3[Large Tier<br/>DETAILED prompts<br/>15 max steps<br/>8,192 tokens]
667-
D -->|> 500K tokens| E4[Massive Tier<br/>EXPLORATORY prompts<br/>20 max steps<br/>16,384 tokens]
670+
D -->|150K-400K tokens| E3[Large Tier<br/>DETAILED prompts<br/>15 max steps<br/>8,192 tokens]
671+
D -->|> 400K tokens| E4[Massive Tier<br/>EXPLORATORY prompts<br/>20 max steps<br/>16,384 tokens]
668672
669673
E1 & E2 & E3 & E4 --> F[Load Tier-Specific<br/>System Prompt]
670674
@@ -712,10 +716,10 @@ flowchart TD
712716
**Key Components:**
713717

714718
1. **Tier Detection**: Automatically adapts prompt complexity based on the LLM's context window
715-
- Small (<50K): Fast, terse responses for limited context models f.ex. local
719+
- Small (<50K): Fast, terse responses for limited-context models, e.g. a local gemma3
716720
- Medium (50K-150K): Balanced analysis for Claude Haiku, gpt-5.1-codex-mini
717-
- Large (150K-400K): Detailed exploration for Sonnet, Opus, gpt-5.1
718-
- Massive (>400K): Comprehensive deep-dives for grok-4-fast, gemini-2.5-pro, Sonnet[1m]
721+
- Large (150K-400K): Detailed exploration for Sonnet, Opus, gpt-5.1, qwen3:4b
722+
- Massive (>400K): Comprehensive deep-dives for grok-4-fast, gemini-3.0-pro, Sonnet[1m]
719723

720724
2. **Multi-Step Reasoning**: ReAct pattern with tier-specific limits
721725
- Each step can call internal graph analysis tools

crates/codegraph-ai/src/llm_factory.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ impl LLMProviderFactory {
125125
model: config
126126
.model
127127
.clone()
128-
.unwrap_or_else(|| "claude-3-5-sonnet-20241022".to_string()),
128+
.unwrap_or_else(|| "claude".to_string()),
129129
context_window: config.context_window,
130130
timeout_secs: config.timeout_secs,
131131
max_retries: 3,
@@ -151,7 +151,7 @@ impl LLMProviderFactory {
151151
let openai_config = OpenAIConfig {
152152
api_key,
153153
base_url: "https://api.openai.com/v1".to_string(),
154-
model: config.model.clone().unwrap_or_else(|| "gpt-4o".to_string()),
154+
model: config.model.clone().unwrap_or_else(|| "gpt-5.1-codex".to_string()),
155155
context_window: config.context_window,
156156
timeout_secs: config.timeout_secs,
157157
max_retries: 3,
@@ -261,9 +261,9 @@ mod tests {
261261
provider: "ollama".to_string(),
262262
model: Some("qwen2.5-coder:14b".to_string()),
263263
ollama_url: "http://localhost:11434".to_string(),
264-
context_window: 128_000,
264+
context_window: config.context,
265265
temperature: 0.1,
266-
max_tokens: 4096,
266+
max_tokens: config.max_tokens,
267267
timeout_secs: 120,
268268
..Default::default()
269269
};

crates/codegraph-ai/src/openai_compatible_provider.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ impl Default for OpenAICompatibleConfig {
3131
Self {
3232
base_url: "http://localhost:1234/v1".to_string(),
3333
model: "local-model".to_string(),
34-
context_window: 256_000,
34+
context_window: config.context_window,
3535
timeout_secs: 120,
3636
max_retries: 3,
3737
api_key: None,
@@ -47,7 +47,7 @@ impl OpenAICompatibleConfig {
4747
Self {
4848
base_url: "http://localhost:1234/v1".to_string(),
4949
model,
50-
context_window: 256_000,
50+
context_window: config.context_window,
5151
provider_name: "lmstudio".to_string(),
5252
use_responses_api: false, // LM Studio doesn't support Responses API
5353
..Default::default()
@@ -59,7 +59,7 @@ impl OpenAICompatibleConfig {
5959
Self {
6060
base_url: "http://localhost:11434/v1".to_string(),
6161
model,
62-
context_window: 256_000,
62+
context_window: config.context_window,
6363
provider_name: "ollama".to_string(),
6464
use_responses_api: false, // Ollama doesn't support Responses API
6565
..Default::default()

crates/codegraph-ai/src/openai_llm_provider.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize};
66
use std::time::{Duration, Instant};
77

88
const OPENAI_API_BASE: &str = "https://api.openai.com/v1";
9-
const DEFAULT_MODEL: &str = "gpt-4o";
9+
const DEFAULT_MODEL: &str = "gpt-5.1-codex";
1010

1111
/// Configuration for OpenAI provider
1212
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -15,7 +15,7 @@ pub struct OpenAIConfig {
1515
pub api_key: String,
1616
/// Base URL for API (default: https://api.openai.com/v1)
1717
pub base_url: String,
18-
/// Model to use (e.g., "gpt-4o", "o3-mini", "o1")
18+
/// Model to use (e.g., "gpt-5.1", "gpt-5.1-codex", "gpt-5.1-codex-mini")
1919
pub model: String,
2020
/// Maximum context window
2121
pub context_window: usize,
@@ -33,7 +33,7 @@ impl Default for OpenAIConfig {
3333
api_key: std::env::var("OPENAI_API_KEY").unwrap_or_default(),
3434
base_url: OPENAI_API_BASE.to_string(),
3535
model: DEFAULT_MODEL.to_string(),
36-
context_window: 400000,
36+
context_window: config.context_window,
3737
timeout_secs: 120,
3838
max_retries: 3,
3939
organization: std::env::var("OPENAI_ORG_ID").ok(),
@@ -212,7 +212,7 @@ impl LLMProvider for OpenAIProvider {
212212
let response = self.send_request(messages, config).await?;
213213

214214
// Extract text from output array
215-
// OpenAI GPT-5 returns: output[{type: "message", content: [{type: "output_text", text: "..."}]}]
215+
// OpenAI GPT-5.1 returns: output[{type: "message", content: [{type: "output_text", text: "..."}]}]
216216
let content = response
217217
.output
218218
.iter()
@@ -427,7 +427,7 @@ mod tests {
427427

428428
#[test]
429429
fn test_reasoning_model_detection() {
430-
let models = vec!["gpt-5"];
430+
let models = vec!["gpt-5.1"];
431431
for model in models {
432432
let config = OpenAIConfig {
433433
api_key: "test".to_string(),

crates/codegraph-mcp/src/indexer.rs

Lines changed: 60 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1249,43 +1249,84 @@ impl ProjectIndexer {
12491249
self.flush_surreal_writer().await?;
12501250

12511251
// COMPREHENSIVE INDEXING COMPLETION SUMMARY
1252+
let avg_nodes_per_file = if stats.files > 0 {
1253+
total_nodes_extracted as f64 / stats.files as f64
1254+
} else {
1255+
0.0
1256+
};
1257+
let avg_edges_per_file = if stats.files > 0 {
1258+
total_edges_extracted as f64 / stats.files as f64
1259+
} else {
1260+
0.0
1261+
};
1262+
let avg_embeddings_per_node = if total_nodes_extracted > 0 {
1263+
stats.embeddings as f64 / total_nodes_extracted as f64
1264+
} else {
1265+
0.0
1266+
};
1267+
12521268
info!("🎉 INDEXING COMPLETE - REVOLUTIONARY AI DEVELOPMENT PLATFORM READY!");
1253-
info!("┌─────────────────────────────────────────────────────────────────┐");
1254-
info!("│ 📊 COMPREHENSIVE INDEXING STATISTICS │");
1255-
info!("├─────────────────────────────────────────────────────────────────┤");
1269+
info!("┌────────────────────────────────────────────────────────────────────────────┐");
1270+
info!("│ 📊 COMPREHENSIVE INDEXING STATISTICS │");
1271+
info!("├────────────────────────────────────────────────────────────────────────────┤");
12561272
info!(
1257-
"│ 📄 Files processed: {} ({} languages supported) │",
1273+
"│ 📂 Files scanned: {:>5} total | {:>5} parsed | {:>5} skipped │",
1274+
pstats.total_files,
12581275
stats.files,
1259-
file_config.languages.len()
1276+
stats.skipped
12601277
);
12611278
info!(
1262-
"│ 📝 Lines analyzed: {} (TreeSitter AST parsing) │",
1263-
stats.lines
1279+
"│ ✅ Parser success: {:>5.1}% ({} / {} files) │",
1280+
success_rate,
1281+
pstats.parsed_files,
1282+
pstats.total_files
12641283
);
12651284
info!(
1266-
"│ 🌳 Semantic nodes: {} (functions: {}, structs: {}, traits: {}) │",
1267-
total_nodes_extracted, stats.functions, stats.structs, stats.traits
1285+
"│ 🗣️ Languages targeted: {:>3} | Batch (embed) {:>3} | Concurrency {:>3} │",
1286+
file_config.languages.len(),
1287+
batch,
1288+
self.config.max_concurrent
12681289
);
12691290
info!(
1270-
"│ 🔗 Code relationships: {} extracted (calls, imports, deps) │",
1271-
total_edges_extracted
1291+
"│ 📝 Lines analyzed: {:>10} | Avg nodes/file {:>5.1} | Avg deps/file {:>5.1} │",
1292+
stats.lines,
1293+
avg_nodes_per_file,
1294+
avg_edges_per_file
12721295
);
12731296
info!(
1274-
"│ 💾 Vector embeddings: {} ({}-dim {}) │",
1275-
stats.embeddings, self.vector_dim, provider
1297+
"│ 🌳 Semantic nodes: {:>8} | funcs {:>6} | structs {:>5} | traits {:>5} │",
1298+
total_nodes_extracted,
1299+
stats.functions,
1300+
stats.structs,
1301+
stats.traits
12761302
);
12771303
info!(
1278-
"│ 🎯 Dependency resolution: {:.1}% success ({}/{} edges stored) │",
1279-
resolution_rate, stored_edges, edge_count
1304+
"│ 🔗 Dependencies: {:>8} extracted | {:>8} stored (resolved {:.1}%) │",
1305+
total_edges_extracted,
1306+
stored_edges,
1307+
resolution_rate
1308+
);
1309+
info!(
1310+
"│ 💾 Vector embeddings: {:>8} ({:>4}-dim {}, {:.1} per node) │",
1311+
stats.embeddings,
1312+
self.vector_dim,
1313+
provider,
1314+
avg_embeddings_per_node
1315+
);
1316+
info!(
1317+
"│ 📦 Metadata persisted: {:>5} files | {:>5} edges | {:>5} nodes │",
1318+
stats.files,
1319+
stored_edges,
1320+
total_nodes_extracted
12801321
);
1281-
info!("├─────────────────────────────────────────────────────────────────┤");
1282-
info!("│ 🚀 CAPABILITIES UNLOCKED │");
1322+
info!("├────────────────────────────────────────────────────────────────────────────┤");
1323+
info!("│ 🚀 CAPABILITIES UNLOCKED │");
12831324
info!(
1284-
"│ ✅ Vector similarity search across {} embedded entities │",
1325+
"│ ✅ Vector similarity search across {:>8} embedded entities │",
12851326
stats.embeddings
12861327
);
12871328
info!(
1288-
"│ ✅ Graph traversal with {} real dependency relationships │",
1329+
"│ ✅ Graph traversal with {:>8} real dependency relationships │",
12891330
stored_edges
12901331
);
12911332
info!("│ ✅ AI-powered semantic analysis with Qwen2.5-Coder integration │");

schema/codegraph.surql

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,6 @@ DEFINE ANALYZER code_analyzer
2727
-- CORE TABLES
2828
-- =============================================================================
2929

30-
-- =============================================================================
31-
-- ANALYZER
32-
-- =============================================================================
33-
34-
DEFINE ANALYZER code_analyzer
35-
TOKENIZERS blank,class
36-
FILTERS lowercase,snowball(english);
37-
38-
-- =============================================================================
39-
-- CORE TABLES
40-
-- =============================================================================
41-
4230
-- -----------------------------------------------------------------------------
4331
-- TABLE: nodes
4432
-- -----------------------------------------------------------------------------
@@ -64,6 +52,8 @@ DEFINE FIELD IF NOT EXISTS embedding_1536 ON TABLE nodes TYPE option<array<floa
6452
ASSERT $value = NONE OR array::len($value) = 1536;
6553
DEFINE FIELD IF NOT EXISTS embedding_2048 ON TABLE nodes TYPE option<array<float>>
6654
ASSERT $value = NONE OR array::len($value) = 2048;
55+
DEFINE FIELD IF NOT EXISTS embedding_2560 ON TABLE nodes TYPE option<array<float>>
56+
ASSERT $value = NONE OR array::len($value) = 2560;
6757
DEFINE FIELD IF NOT EXISTS embedding_3072 ON TABLE nodes TYPE option<array<float>>
6858
ASSERT $value = NONE OR array::len($value) = 3072;
6959
DEFINE FIELD IF NOT EXISTS embedding_4096 ON TABLE nodes TYPE option<array<float>>
@@ -99,6 +89,8 @@ DEFINE INDEX IF NOT EXISTS idx_nodes_embedding_1536
9989
ON TABLE nodes FIELDS embedding_1536 HNSW DIMENSION 1536 DIST COSINE EFC 200 M 16;
10090
DEFINE INDEX IF NOT EXISTS idx_nodes_embedding_2048
10191
ON TABLE nodes FIELDS embedding_2048 HNSW DIMENSION 2048 DIST COSINE EFC 200 M 16;
92+
DEFINE INDEX IF NOT EXISTS idx_nodes_embedding_2560
93+
ON TABLE nodes FIELDS embedding_2560 HNSW DIMENSION 2560 DIST COSINE EFC 200 M 16;
10294
DEFINE INDEX IF NOT EXISTS idx_nodes_embedding_3072
10395
ON TABLE nodes FIELDS embedding_3072 HNSW DIMENSION 3072 DIST COSINE EFC 200 M 16;
10496
DEFINE INDEX IF NOT EXISTS idx_nodes_embedding_4096
@@ -259,6 +251,8 @@ DEFINE FIELD IF NOT EXISTS embedding_1536 ON TABLE symbol_embeddings TYPE optio
259251
ASSERT $value = NONE OR array::len($value) = 1536;
260252
DEFINE FIELD IF NOT EXISTS embedding_2048 ON TABLE symbol_embeddings TYPE option<array<float>>
261253
ASSERT $value = NONE OR array::len($value) = 2048;
254+
DEFINE FIELD IF NOT EXISTS embedding_2560 ON TABLE symbol_embeddings TYPE option<array<float>>
255+
ASSERT $value = NONE OR array::len($value) = 2560;
262256
DEFINE FIELD IF NOT EXISTS embedding_3072 ON TABLE symbol_embeddings TYPE option<array<float>>
263257
ASSERT $value = NONE OR array::len($value) = 3072;
264258
DEFINE FIELD IF NOT EXISTS embedding_4096 ON TABLE symbol_embeddings TYPE option<array<float>>
@@ -270,6 +264,7 @@ DEFINE FIELD IF NOT EXISTS embedding_768[*] ON TABLE symbol_embeddings TYPE fl
270264
DEFINE FIELD IF NOT EXISTS embedding_1024[*] ON TABLE symbol_embeddings TYPE float;
271265
DEFINE FIELD IF NOT EXISTS embedding_1536[*] ON TABLE symbol_embeddings TYPE float;
272266
DEFINE FIELD IF NOT EXISTS embedding_2048[*] ON TABLE symbol_embeddings TYPE float;
267+
DEFINE FIELD IF NOT EXISTS embedding_2560[*] ON TABLE symbol_embeddings TYPE float;
273268
DEFINE FIELD IF NOT EXISTS embedding_3072[*] ON TABLE symbol_embeddings TYPE float;
274269
DEFINE FIELD IF NOT EXISTS embedding_4096[*] ON TABLE symbol_embeddings TYPE float;
275270

@@ -299,6 +294,9 @@ DEFINE INDEX IF NOT EXISTS idx_symbol_embeddings_vector_1536
299294
DEFINE INDEX IF NOT EXISTS idx_symbol_embeddings_vector_2048
300295
ON TABLE symbol_embeddings FIELDS embedding_2048
301296
HNSW DIMENSION 2048 DIST COSINE EFC 200 M 16;
297+
DEFINE INDEX IF NOT EXISTS idx_symbol_embeddings_vector_2560
298+
ON TABLE symbol_embeddings FIELDS embedding_2560
299+
HNSW DIMENSION 2560 DIST COSINE EFC 200 M 16;
302300
DEFINE INDEX IF NOT EXISTS idx_symbol_embeddings_vector_3072
303301
ON TABLE symbol_embeddings FIELDS embedding_3072
304302
HNSW DIMENSION 3072 DIST COSINE EFC 200 M 16;

0 commit comments

Comments
 (0)