Skip to content

Commit 5337153

Browse files
committed
Updated podcasts
1 parent 668d3c2 commit 5337153

24 files changed

+943
-629
lines changed

scripts/generate-podcast.js

Lines changed: 173 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,62 @@ function parseMarkdownContent(filePath) {
314314
}
315315

316316
/**
317-
* Generate podcast dialog prompt optimized for Claude Haiku 4.5
317+
* Calculate target token count based on source material complexity
318318
*/
319-
function buildDialogPrompt(content, fileName, outputPath) {
319+
function calculateTargetTokens(sourceContent) {
  // Derives a token budget for the generated dialog from the size and
  // structure of the source text.
  //
  // @param {string} sourceContent - Source text to analyse. Non-string
  //   values (e.g. a Buffer, null, undefined) are coerced to a string so the
  //   analysis never throws on `.match` / `.length`.
  // @returns {number} Integer token target clamped to [MIN_TOKENS, MAX_TOKENS].
  //
  // Side effect: prints the complexity breakdown with console.log.
  const MIN_TOKENS = 3000;
  const MAX_TOKENS = 15000;

  const text = typeof sourceContent === 'string' ? sourceContent : String(sourceContent ?? '');
  const countMatches = (pattern) => (text.match(pattern) || []).length;

  // Estimate source token count (rough: ~4 chars per token)
  const sourceTokenCount = Math.ceil(text.length / 4);

  // Base scaling: 0.6x source tokens (allows expansion for dialogue format)
  let target = Math.floor(sourceTokenCount * 0.6);

  // Complexity multipliers - count structural elements.
  // Two fences delimit one code block; floor the quotient so an unbalanced
  // (odd) number of fences does not yield a fractional block count.
  const hasCodeBlocks = Math.floor(countMatches(/```/g) / 2);
  // Counts every line that starts with `|`, i.e. individual table rows.
  const hasTables = countMatches(/^\|/gm);
  const hasDeepDives = countMatches(/<details>/g);
  const hasPedagogicalNotes = countMatches(/:::(tip|warning|info|note)/gi);

  // Add tokens for complex content that needs narration
  target += hasCodeBlocks * 200; // Each code block needs explanation
  target += hasTables * 150; // Tables need verbal description
  target += hasDeepDives * 500; // Deep dives = high information density
  target += hasPedagogicalNotes * 100; // Pedagogical notes add context

  // Clamp to reasonable bounds
  const finalTarget = Math.max(MIN_TOKENS, Math.min(MAX_TOKENS, target));

  console.log(` 📊 Source complexity analysis:`);
  console.log(` - Estimated source tokens: ${sourceTokenCount}`);
  console.log(` - Code blocks: ${hasCodeBlocks}`);
  console.log(` - Tables: ${hasTables}`);
  console.log(` - Deep dives: ${hasDeepDives}`);
  console.log(` - Pedagogical notes: ${hasPedagogicalNotes}`);
  console.log(` - Target podcast tokens: ${finalTarget}`);

  return finalTarget;
}
354+
355+
/**
 * Choose which Claude model alias to use for a lesson.
 *
 * @param {number} targetTokenCount - Token budget computed for the dialog.
 * @param {number} sourceTokenCount - Estimated token count of the source.
 * @returns {'sonnet'|'haiku'} 'sonnet' when the target exceeds 8000 tokens or
 *   the source exceeds 6000 tokens; otherwise 'haiku'. The choice is also
 *   printed with console.log.
 */
function selectModel(targetTokenCount, sourceTokenCount) {
  const needsDepth = targetTokenCount > 8000 || sourceTokenCount > 6000;

  // Shorter, simpler content stays on Haiku
  if (!needsDepth) {
    console.log(` 🤖 Selected model: Haiku (standard complexity)`);
    return 'haiku';
  }

  // Complex lessons requiring depth go to Sonnet
  console.log(` 🤖 Selected model: Sonnet (high complexity)`);
  return 'sonnet';
}
368+
369+
/**
370+
* Generate podcast dialog prompt optimized for Claude Haiku 4.5 or Sonnet
371+
*/
372+
function buildDialogPrompt(content, fileName, outputPath, targetTokens, sourceTokens) {
320373
// Special handling for intro.md - add brief meta-acknowledgement
321374
const isIntro = fileName === 'intro';
322375
const metaCommentary = isIntro ? `
@@ -412,6 +465,40 @@ Alex: "Exactly. Now look at the effective version: 'Write a TypeScript function
412465
413466
Sam: "That's night and day. The second version gives the AI everything it needs - language, standard, edge cases, return type."
414467
468+
CRITICAL: PRESERVE TECHNICAL SPECIFICITY
469+
470+
The source material contains actionable technical details that MUST be preserved in the podcast:
471+
472+
✓ PRESERVE: Exact numbers (token counts, LOC thresholds, dimensions, percentages, ratios)
473+
Example: "60-120K tokens reliable attention" NOT "somewhat less than advertised"
474+
Example: "<10K LOC use agentic search, 10-100K use semantic search" NOT "different tools for different sizes"
475+
476+
✓ PRESERVE: Tool and product names with brief context
477+
Example: "ChunkHound for structured multi-hop traversal" NOT just "a tool"
478+
Example: "ChromaDB, pgvector, or Qdrant vector databases" NOT just "vector databases"
479+
480+
✓ PRESERVE: Decision matrices and selection criteria
481+
Example: "Under 10K lines use X, 10-100K lines use Y, above 100K use Z with reason" NOT just "pick the right tool"
482+
483+
✓ PRESERVE: Technical architecture details
484+
Example: "768-1536 dimensional vectors with cosine similarity" NOT just "high-dimensional vectors"
485+
486+
✓ PRESERVE: Concrete examples with specific numbers
487+
Example: "10 chunks at 15K tokens plus 25K for files equals 40K" NOT just "uses a lot of tokens"
488+
489+
✗ DO NOT: Replace specific numbers with vague descriptors ("a lot", "many", "significant")
490+
✗ DO NOT: Skip tool names - always mention them with 1-sentence context of what they do
491+
✗ DO NOT: Simplify decision criteria into generic advice
492+
✗ DO NOT: Omit architectural details that explain how things work
493+
494+
EXAMPLE - BAD (too vague):
495+
"You need different approaches for different codebase sizes to get good results."
496+
497+
EXAMPLE - GOOD (preserves specifics):
498+
"For codebases under 10,000 lines, agentic search with Grep and Read works well. Between 10,000 and 100,000 lines,
499+
switch to semantic search - tools like ChunkHound or Claude Context via MCP servers. Above 100,000 lines, you need
500+
ChunkHound's structured multi-hop traversal because autonomous agents start missing connections."
501+
415502
CRITICAL: CONTENT DEDUPLICATION REQUIREMENTS
416503
417504
The source material uses pedagogical reinforcement patterns designed for written learning:
@@ -484,9 +571,14 @@ Alex: [natural dialog here]
484571
</podcast_dialog>${metaCommentary}
485572
486573
LENGTH CONSTRAINT:
487-
Target 6,000-7,500 tokens for the complete dialog. This ensures it fits within TTS API limits while maintaining quality.
488-
Given the deduplication requirements above, your podcast will likely be SHORTER than the source material - this is expected and desirable.
489-
Focus on depth and clarity over comprehensive coverage. Prioritize depth over breadth.
574+
Target ${targetTokens}-${targetTokens + 1500} tokens for the complete dialog (dynamically calculated based on source complexity).
575+
This lesson has ${sourceTokens} estimated source tokens with specific complexity factors considered.
576+
577+
IMPORTANT: Depth is prioritized over compression for this content.
578+
- Preserve ALL technical specifics, numbers, tool names, and decision criteria
579+
- The token budget is adaptive - complex lessons get more space to preserve detail
580+
- Deduplication is for removing redundancy, NOT for cutting essential technical information
581+
- Focus on making content clear and complete, not artificially short
490582
491583
TECHNICAL CONTENT TITLE: ${fileName}
492584
@@ -502,17 +594,17 @@ Just write the raw dialog to the file now.`;
502594
/**
503595
* Call Claude Code CLI in headless mode to generate dialog
504596
*/
505-
async function generateDialogWithClaude(prompt, outputPath) {
597+
async function generateDialogWithClaude(prompt, outputPath, model = 'haiku') {
506598
return new Promise((resolve, reject) => {
507-
console.log(` 🤖 Calling Claude Code CLI (Haiku 4.5)...`);
599+
console.log(` 🤖 Calling Claude Code CLI (${model})...`);
508600

509601
// Ensure output directory exists before Claude tries to write
510602
mkdirSync(dirname(outputPath), { recursive: true });
511603

512604
// Spawn claude process with headless mode
513605
const claude = spawn('claude', [
514606
'-p', // Headless mode (non-interactive)
515-
'--model', 'haiku', // Use Haiku 4.5
607+
'--model', model, // Use specified model (haiku or sonnet)
516608
'--allowedTools', 'Edit', 'Write' // Allow file editing and writing only
517609
]);
518610

@@ -587,6 +679,50 @@ async function generateDialogWithClaude(prompt, outputPath) {
587679
});
588680
}
589681

682+
/**
 * Validate technical depth and information preservation.
 *
 * Runs three heuristic checks comparing the generated dialog with the source:
 *   1. the dialog contains at least 40% as many numeric tokens as the source
 *      (a count comparison, not a check that the same numbers appear);
 *   2. no more than 30% of the distinct tool/product names found in the
 *      source are absent from the dialog;
 *   3. if the source has markdown table rows, the dialog uses at least one
 *      table-related keyword.
 *
 * @param {string} dialog - Generated podcast dialog text.
 * @param {string} sourceContent - Source text the dialog was generated from.
 * @returns {string[]} Human-readable warnings; empty when every check passes.
 */
function validateTechnicalDepth(dialog, sourceContent) {
  const warnings = [];

  // Extract numbers from source and dialog (including LOC like "10K", percentages, dimensions)
  const numberPattern = /\b\d+[KM]?(?:%|K|M|,\d{3})*\b/g;
  const sourceNumbers = sourceContent.match(numberPattern) || [];
  const dialogNumbers = dialog.match(numberPattern) || [];

  // Should preserve at least 40% of specific numbers.
  // The condition can only hold when sourceNumbers is non-empty, so the
  // percentage below never divides by zero.
  if (dialogNumbers.length < sourceNumbers.length * 0.4) {
    warnings.push(
      `⚠️ Low number preservation: ${dialogNumbers.length}/${sourceNumbers.length} numbers mentioned ` +
      `(${((dialogNumbers.length / sourceNumbers.length) * 100).toFixed(0)}%)`
    );
  }

  // Extract tool/product names (capitalized technical terms).
  // The first alternative needs a capitalised prefix before the suffix, so
  // names that stand alone (Serena, Perplexity, ...) must also be listed as
  // standalone alternatives or they are never detected.
  const toolPattern = /\b(?:[A-Z][a-z]+(?:[A-Z][a-z]+)*(?:DB|RAG|Search|Agent|Hound|Seek|Context|MCP|Serena|Perplexity|ChunkHound|ArguSeek)|ChunkHound|ArguSeek|ChromaDB|pgvector|Qdrant|Serena|Perplexity)\b/g;
  const sourceTools = new Set(sourceContent.match(toolPattern) || []);
  const dialogTools = new Set(dialog.match(toolPattern) || []);

  const missingTools = [...sourceTools].filter(t => !dialogTools.has(t));
  if (missingTools.length > sourceTools.size * 0.3) {
    warnings.push(
      `⚠️ Missing important tools: ${missingTools.slice(0, 5).join(', ')}` +
      `${missingTools.length > 5 ? ` (+ ${missingTools.length - 5} more)` : ''}`
    );
  }

  // Check for decision matrices / thresholds (lines with multiple | symbols)
  const sourceTables = (sourceContent.match(/^\|.*\|.*\|/gm) || []).length;
  if (sourceTables > 0) {
    // Tables should be mentioned or narrated somehow.
    // No `g` flag: the regex is only used with .test(), where a global flag
    // would make the result depend on lastIndex state.
    const tableKeywords = /(matrix|table|comparison|threshold|scale|tier)/i;
    if (!tableKeywords.test(dialog)) {
      warnings.push(`⚠️ Source contains ${sourceTables} table rows but podcast doesn't narrate them`);
    }
  }

  return warnings;
}
725+
590726
/**
591727
* Validate dialog for repetition patterns
592728
*/
@@ -1166,14 +1302,24 @@ async function generateScript(filePath, scriptManifest, config) {
11661302
console.log(`\n📄 Generating script: ${relativePath}`);
11671303

11681304
try {
1169-
// Parse content
1305+
// Read raw content first for complexity analysis
1306+
const rawContent = readFileSync(filePath, 'utf-8');
1307+
1308+
// Parse content for podcast generation
11701309
const content = parseMarkdownContent(filePath);
11711310

11721311
if (content.length < 100) {
11731312
console.log(` ⚠️ Skipping - content too short`);
11741313
return null;
11751314
}
11761315

1316+
// Calculate dynamic token budget based on RAW content complexity (before parsing)
1317+
const targetTokens = calculateTargetTokens(rawContent);
1318+
const sourceTokens = Math.ceil(rawContent.length / 4);
1319+
1320+
// Select appropriate model
1321+
const model = selectModel(targetTokens, sourceTokens);
1322+
11771323
// Determine output path
11781324
const outputFileName = `${fileName}.md`;
11791325
const outputPath = join(SCRIPT_OUTPUT_DIR, dirname(relativePath), outputFileName);
@@ -1184,8 +1330,8 @@ async function generateScript(filePath, scriptManifest, config) {
11841330
console.log(` 🗑️ Deleted existing file for fresh generation`);
11851331
}
11861332

1187-
// Build prompt
1188-
const prompt = buildDialogPrompt(content, fileName, outputPath);
1333+
// Build prompt with dynamic parameters
1334+
const prompt = buildDialogPrompt(content, fileName, outputPath, targetTokens, sourceTokens);
11891335

11901336
// Debug mode: save prompt
11911337
if (config.debug) {
@@ -1194,14 +1340,24 @@ async function generateScript(filePath, scriptManifest, config) {
11941340
console.log(` 🔍 Debug prompt saved: ${debugPath}`);
11951341
}
11961342

1197-
// Generate dialog using Claude
1198-
const dialog = await generateDialogWithClaude(prompt, outputPath);
1343+
// Generate dialog using Claude with selected model
1344+
const dialog = await generateDialogWithClaude(prompt, outputPath, model);
1345+
1346+
// Validate technical depth and information preservation (against raw content)
1347+
console.log(` 🔍 Validating technical depth...`);
1348+
const technicalWarnings = validateTechnicalDepth(dialog, rawContent);
1349+
if (technicalWarnings.length > 0) {
1350+
console.log(` ⚠️ Technical depth warnings:`);
1351+
technicalWarnings.forEach(w => console.log(` ${w}`));
1352+
} else {
1353+
console.log(` ✅ Technical depth validation passed`);
1354+
}
11991355

1200-
// Validate dialog quality
1201-
const warnings = validateDialogQuality(dialog);
1202-
if (warnings.length > 0) {
1356+
// Validate dialog quality (repetition)
1357+
const qualityWarnings = validateDialogQuality(dialog);
1358+
if (qualityWarnings.length > 0) {
12031359
console.log(` ⚠️ Quality warnings detected:`);
1204-
warnings.forEach(w => console.log(` - ${w}`));
1360+
qualityWarnings.forEach(w => console.log(` - ${w}`));
12051361
console.log(` 💡 Consider regenerating if repetition is significant`);
12061362
} else {
12071363
console.log(` ✅ Quality validation passed - no repetition detected`);

0 commit comments

Comments
 (0)