@@ -314,9 +314,62 @@ function parseMarkdownContent(filePath) {
314314}
315315
/**
 * Calculate the target podcast token budget from source material complexity.
 *
 * The budget starts at 0.6x the estimated source token count (~4 characters
 * per token), grows for structural elements that need extra narration, and is
 * finally clamped to the range [3000, 15000]. A summary of the analysis is
 * written to the console.
 *
 * @param {string} sourceContent - Raw lesson source text to analyse.
 * @returns {number} Integer token target between 3000 and 15000 (inclusive).
 */
function calculateTargetTokens(sourceContent) {
  const MIN_TOKENS = 3000;
  const MAX_TOKENS = 15000;

  const countMatches = (pattern) => (sourceContent.match(pattern) || []).length;

  // Estimate source token count (rough: ~4 chars per token)
  const sourceTokenCount = Math.ceil(sourceContent.length / 4);

  // Base scaling: 0.6x source tokens (allows expansion for dialogue format)
  let target = Math.floor(sourceTokenCount * 0.6);

  // A code block needs an opening and a closing fence; floor the pair count so
  // a stray/unterminated fence is not reported or weighted as half a block.
  const codeBlockCount = Math.floor(countMatches(/```/g) / 2);
  // Counts every line that starts with "|", i.e. table rows (header and
  // separator rows included), not distinct tables.
  const tableRowCount = countMatches(/^\|/gm);
  // Also matches <details> tags that carry attributes, e.g. <details open>.
  const deepDiveCount = countMatches(/<details\b[^>]*>/gi);
  const pedagogicalNoteCount = countMatches(/:::(tip|warning|info|note)/gi);

  // Add tokens for complex content that needs narration
  target += codeBlockCount * 200; // Each code block needs explanation
  target += tableRowCount * 150; // Tables need verbal description
  target += deepDiveCount * 500; // Deep dives = high information density
  target += pedagogicalNoteCount * 100; // Pedagogical notes add context

  // Clamp to reasonable bounds
  const finalTarget = Math.max(MIN_TOKENS, Math.min(MAX_TOKENS, target));

  console.log(`  📊 Source complexity analysis:`);
  console.log(`     - Estimated source tokens: ${sourceTokenCount}`);
  console.log(`     - Code blocks: ${codeBlockCount}`);
  console.log(`     - Tables: ${tableRowCount}`);
  console.log(`     - Deep dives: ${deepDiveCount}`);
  console.log(`     - Pedagogical notes: ${pedagogicalNoteCount}`);
  console.log(`     - Target podcast tokens: ${finalTarget}`);

  return finalTarget;
}
354+
/**
 * Select the model alias to use based on content complexity.
 *
 * Returns 'sonnet' when either count is strictly above its threshold and
 * 'haiku' otherwise; the choice is also logged to the console.
 *
 * @param {number} targetTokenCount - Target token budget for the dialog.
 * @param {number} sourceTokenCount - Estimated token count of the source.
 * @param {{targetThreshold?: number, sourceThreshold?: number}} [thresholds]
 *   Optional overrides for the cut-offs (defaults: 8000 target / 6000 source).
 * @returns {'sonnet'|'haiku'} Model alias.
 */
function selectModel(
  targetTokenCount,
  sourceTokenCount,
  { targetThreshold = 8000, sourceThreshold = 6000 } = {}
) {
  const isHighComplexity =
    targetTokenCount > targetThreshold || sourceTokenCount > sourceThreshold;

  // Use Sonnet for complex lessons requiring depth
  if (isHighComplexity) {
    console.log(`  🤖 Selected model: Sonnet (high complexity)`);
    return 'sonnet';
  }

  // Haiku for shorter, simpler content
  console.log(`  🤖 Selected model: Haiku (standard complexity)`);
  return 'haiku';
}
368+
369+ /**
370+ * Generate podcast dialog prompt optimized for Claude Haiku 4.5 or Sonnet
371+ */
372+ function buildDialogPrompt ( content , fileName , outputPath , targetTokens , sourceTokens ) {
320373 // Special handling for intro.md - add brief meta-acknowledgement
321374 const isIntro = fileName === 'intro' ;
322375 const metaCommentary = isIntro ? `
@@ -412,6 +465,40 @@ Alex: "Exactly. Now look at the effective version: 'Write a TypeScript function
412465
413466Sam: "That's night and day. The second version gives the AI everything it needs - language, standard, edge cases, return type."
414467
468+ CRITICAL: PRESERVE TECHNICAL SPECIFICITY
469+
470+ The source material contains actionable technical details that MUST be preserved in the podcast:
471+
472+ ✓ PRESERVE: Exact numbers (token counts, LOC thresholds, dimensions, percentages, ratios)
473+ Example: "60-120K tokens reliable attention" NOT "somewhat less than advertised"
474+ Example: "<10K LOC use agentic search, 10-100K use semantic search" NOT "different tools for different sizes"
475+
476+ ✓ PRESERVE: Tool and product names with brief context
477+ Example: "ChunkHound for structured multi-hop traversal" NOT just "a tool"
478+ Example: "ChromaDB, pgvector, or Qdrant vector databases" NOT just "vector databases"
479+
480+ ✓ PRESERVE: Decision matrices and selection criteria
481+ Example: "Under 10K lines use X, 10-100K lines use Y, above 100K use Z with reason" NOT just "pick the right tool"
482+
483+ ✓ PRESERVE: Technical architecture details
484+ Example: "768-1536 dimensional vectors with cosine similarity" NOT just "high-dimensional vectors"
485+
486+ ✓ PRESERVE: Concrete examples with specific numbers
487+ Example: "10 chunks at 15K tokens plus 25K for files equals 40K" NOT just "uses a lot of tokens"
488+
489+ ✗ DO NOT: Replace specific numbers with vague descriptors ("a lot", "many", "significant")
490+ ✗ DO NOT: Skip tool names - always mention them with 1-sentence context of what they do
491+ ✗ DO NOT: Simplify decision criteria into generic advice
492+ ✗ DO NOT: Omit architectural details that explain how things work
493+
494+ EXAMPLE - BAD (too vague):
495+ "You need different approaches for different codebase sizes to get good results."
496+
497+ EXAMPLE - GOOD (preserves specifics):
498+ "For codebases under 10,000 lines, agentic search with Grep and Read works well. Between 10,000 and 100,000 lines,
499+ switch to semantic search - tools like ChunkHound or Claude Context via MCP servers. Above 100,000 lines, you need
500+ ChunkHound's structured multi-hop traversal because autonomous agents start missing connections."
501+
415502CRITICAL: CONTENT DEDUPLICATION REQUIREMENTS
416503
417504The source material uses pedagogical reinforcement patterns designed for written learning:
@@ -484,9 +571,14 @@ Alex: [natural dialog here]
484571</podcast_dialog>${ metaCommentary }
485572
486573LENGTH CONSTRAINT:
487- Target 6,000-7,500 tokens for the complete dialog. This ensures it fits within TTS API limits while maintaining quality.
488- Given the deduplication requirements above, your podcast will likely be SHORTER than the source material - this is expected and desirable.
489- Focus on depth and clarity over comprehensive coverage. Prioritize depth over breadth.
574+ Target ${ targetTokens } -${ targetTokens + 1500 } tokens for the complete dialog (dynamically calculated based on source complexity).
575+ This lesson has ${ sourceTokens } estimated source tokens with specific complexity factors considered.
576+
577+ IMPORTANT: Depth is prioritized over compression for this content.
578+ - Preserve ALL technical specifics, numbers, tool names, and decision criteria
579+ - The token budget is adaptive - complex lessons get more space to preserve detail
580+ - Deduplication is for removing redundancy, NOT for cutting essential technical information
581+ - Focus on making content clear and complete, not artificially short
490582
491583TECHNICAL CONTENT TITLE: ${ fileName }
492584
@@ -502,17 +594,17 @@ Just write the raw dialog to the file now.`;
502594/**
503595 * Call Claude Code CLI in headless mode to generate dialog
504596 */
505- async function generateDialogWithClaude ( prompt , outputPath ) {
597+ async function generateDialogWithClaude ( prompt , outputPath , model = 'haiku' ) {
506598 return new Promise ( ( resolve , reject ) => {
507- console . log ( ` 🤖 Calling Claude Code CLI (Haiku 4.5 )...` ) ;
599+ console . log ( ` 🤖 Calling Claude Code CLI (${ model } )...` ) ;
508600
509601 // Ensure output directory exists before Claude tries to write
510602 mkdirSync ( dirname ( outputPath ) , { recursive : true } ) ;
511603
512604 // Spawn claude process with headless mode
513605 const claude = spawn ( 'claude' , [
514606 '-p' , // Headless mode (non-interactive)
515- '--model' , 'haiku' , // Use Haiku 4.5
607+ '--model' , model , // Use specified model (haiku or sonnet)
516608 '--allowedTools' , 'Edit' , 'Write' // Allow file editing and writing only
517609 ] ) ;
518610
@@ -587,6 +679,50 @@ async function generateDialogWithClaude(prompt, outputPath) {
587679 } ) ;
588680}
589681
/**
 * Validate technical depth and information preservation.
 *
 * Runs three heuristic checks of the generated dialog against the source and
 * returns a human-readable warning for each one that fails:
 *   1. Numbers     - at least 40% of the distinct numbers found in the source
 *                    must also appear in the dialog ("10K" and "10,000" are
 *                    treated as the same value).
 *   2. Tool names  - no more than 30% of the tool/product names found in the
 *                    source may be absent from the dialog (case-insensitive).
 *   3. Tables      - if the source has table rows, the dialog must use at
 *                    least one table-related keyword.
 *
 * @param {string} dialog - Generated podcast dialog text.
 * @param {string} sourceContent - Source text the dialog was generated from.
 * @returns {string[]} Warning messages; empty when every check passes.
 */
function validateTechnicalDepth(dialog, sourceContent) {
  const warnings = [];

  // Extract numbers (including LOC like "10K", thousands separators, etc.)
  const numberPattern = /\b\d+[KM]?(?:%|K|M|,\d{3})*\b/g;

  // Reduce a matched token to a canonical plain-digit string so equivalent
  // spellings ("10K", "10,000") compare equal.
  const canonicalNumber = (token) => {
    const [, digits, suffix] = /^([\d,]+)([KM]?)/.exec(token);
    let scale = 1;
    if (suffix === 'K') {
      scale = 1e3;
    } else if (suffix === 'M') {
      scale = 1e6;
    }
    return String(Number(digits.replace(/,/g, '')) * scale);
  };
  const collectNumbers = (text) =>
    new Set((text.match(numberPattern) || []).map(canonicalNumber));

  const sourceNumbers = collectNumbers(sourceContent);
  const dialogNumbers = collectNumbers(dialog);

  // Measure actual overlap rather than comparing raw match counts: a dialog
  // full of unrelated numbers must not count as "preserving" the source's.
  const preservedCount = [...sourceNumbers].filter((n) => dialogNumbers.has(n)).length;

  // Should preserve at least 40% of specific numbers
  // (never true when the source has no numbers, so no division by zero below)
  if (preservedCount < sourceNumbers.size * 0.4) {
    const percent = ((preservedCount / sourceNumbers.size) * 100).toFixed(0);
    warnings.push(
      `⚠️ Low number preservation: ${preservedCount}/${sourceNumbers.size} numbers mentioned ` +
        `(${percent}%)`
    );
  }

  // Extract tool/product names (capitalized technical terms)
  const toolPattern =
    /\b(?:[A-Z][a-z]+(?:[A-Z][a-z]+)*(?:DB|RAG|Search|Agent|Hound|Seek|Context|MCP|Serena|Perplexity|ChunkHound|ArguSeek)|ChunkHound|ArguSeek|ChromaDB|pgvector|Qdrant)\b/g;
  const sourceTools = new Set(sourceContent.match(toolPattern) || []);

  // Compare case-insensitively so "chunkhound" in the dialog still counts as
  // a mention of "ChunkHound" from the source.
  const dialogLower = dialog.toLowerCase();
  const missingTools = [...sourceTools].filter(
    (tool) => !dialogLower.includes(tool.toLowerCase())
  );
  if (missingTools.length > sourceTools.size * 0.3) {
    const overflow = missingTools.length > 5 ? ` (+ ${missingTools.length - 5} more)` : '';
    warnings.push(`⚠️ Missing important tools: ${missingTools.slice(0, 5).join(', ')}${overflow}`);
  }

  // Check for decision matrices / thresholds (lines with multiple | symbols)
  const sourceTableRows = (sourceContent.match(/^\|.*\|.*\|/gm) || []).length;
  if (sourceTableRows > 0) {
    // Tables should be mentioned or narrated somehow. Keywords are anchored at
    // a word start so "comfortable" or "frontier" do not pass as "table"/"tier";
    // no `g` flag, since `.test()` on a global regex is stateful.
    const tableKeywords = /\b(?:matrix|matrices|table|comparison|threshold|scale|tier)/i;
    if (!tableKeywords.test(dialog)) {
      warnings.push(
        `⚠️ Source contains ${sourceTableRows} table rows but podcast doesn't narrate them`
      );
    }
  }

  return warnings;
}
725+
590726/**
591727 * Validate dialog for repetition patterns
592728 */
@@ -1166,14 +1302,24 @@ async function generateScript(filePath, scriptManifest, config) {
11661302 console . log ( `\n📄 Generating script: ${ relativePath } ` ) ;
11671303
11681304 try {
1169- // Parse content
1305+ // Read raw content first for complexity analysis
1306+ const rawContent = readFileSync ( filePath , 'utf-8' ) ;
1307+
1308+ // Parse content for podcast generation
11701309 const content = parseMarkdownContent ( filePath ) ;
11711310
11721311 if ( content . length < 100 ) {
11731312 console . log ( ` ⚠️ Skipping - content too short` ) ;
11741313 return null ;
11751314 }
11761315
1316+ // Calculate dynamic token budget based on RAW content complexity (before parsing)
1317+ const targetTokens = calculateTargetTokens ( rawContent ) ;
1318+ const sourceTokens = Math . ceil ( rawContent . length / 4 ) ;
1319+
1320+ // Select appropriate model
1321+ const model = selectModel ( targetTokens , sourceTokens ) ;
1322+
11771323 // Determine output path
11781324 const outputFileName = `${ fileName } .md` ;
11791325 const outputPath = join ( SCRIPT_OUTPUT_DIR , dirname ( relativePath ) , outputFileName ) ;
@@ -1184,8 +1330,8 @@ async function generateScript(filePath, scriptManifest, config) {
11841330 console . log ( ` 🗑️ Deleted existing file for fresh generation` ) ;
11851331 }
11861332
1187- // Build prompt
1188- const prompt = buildDialogPrompt ( content , fileName , outputPath ) ;
1333+ // Build prompt with dynamic parameters
1334+ const prompt = buildDialogPrompt ( content , fileName , outputPath , targetTokens , sourceTokens ) ;
11891335
11901336 // Debug mode: save prompt
11911337 if ( config . debug ) {
@@ -1194,14 +1340,24 @@ async function generateScript(filePath, scriptManifest, config) {
11941340 console . log ( ` 🔍 Debug prompt saved: ${ debugPath } ` ) ;
11951341 }
11961342
1197- // Generate dialog using Claude
1198- const dialog = await generateDialogWithClaude ( prompt , outputPath ) ;
1343+ // Generate dialog using Claude with selected model
1344+ const dialog = await generateDialogWithClaude ( prompt , outputPath , model ) ;
1345+
1346+ // Validate technical depth and information preservation (against raw content)
1347+ console . log ( ` 🔍 Validating technical depth...` ) ;
1348+ const technicalWarnings = validateTechnicalDepth ( dialog , rawContent ) ;
1349+ if ( technicalWarnings . length > 0 ) {
1350+ console . log ( ` ⚠️ Technical depth warnings:` ) ;
1351+ technicalWarnings . forEach ( w => console . log ( ` ${ w } ` ) ) ;
1352+ } else {
1353+ console . log ( ` ✅ Technical depth validation passed` ) ;
1354+ }
11991355
1200- // Validate dialog quality
1201- const warnings = validateDialogQuality ( dialog ) ;
1202- if ( warnings . length > 0 ) {
1356+ // Validate dialog quality (repetition)
1357+ const qualityWarnings = validateDialogQuality ( dialog ) ;
1358+ if ( qualityWarnings . length > 0 ) {
12031359 console . log ( ` ⚠️ Quality warnings detected:` ) ;
1204- warnings . forEach ( w => console . log ( ` - ${ w } ` ) ) ;
1360+ qualityWarnings . forEach ( w => console . log ( ` - ${ w } ` ) ) ;
12051361 console . log ( ` 💡 Consider regenerating if repetition is significant` ) ;
12061362 } else {
12071363 console . log ( ` ✅ Quality validation passed - no repetition detected` ) ;
0 commit comments