
Commit e8cdd62

Updated lesson 5 podcast + reduced repetitions
1 parent e4632df commit e8cdd62

File tree

5 files changed: +301 -165 lines changed

scripts/generate-podcast.js

Lines changed: 223 additions & 103 deletions
@@ -86,6 +86,63 @@ function parseArgs() {
 // SCRIPT GENERATION - From generate-podcast-script.js
 // ============================================================================
 
+/**
+ * Calculate semantic overlap between two text segments using word-based Jaccard similarity
+ * This is a lightweight approach that doesn't require external NLP libraries
+ *
+ * @param {string} text1 - First text segment
+ * @param {string} text2 - Second text segment
+ * @param {number} threshold - Similarity threshold (0-1), default 0.75
+ * @returns {boolean} - True if texts are semantically similar above threshold
+ */
+function detectSemanticOverlap(text1, text2, threshold = 0.75) {
+  // Normalize: lowercase, remove punctuation, split into words
+  const normalize = (text) => {
+    return text
+      .toLowerCase()
+      .replace(/[^\w\s]/g, ' ')
+      .split(/\s+/)
+      .filter(word => word.length > 3); // Ignore short words (a, the, is, etc.)
+  };
+
+  const words1 = new Set(normalize(text1));
+  const words2 = new Set(normalize(text2));
+
+  if (words1.size === 0 || words2.size === 0) {
+    return false;
+  }
+
+  // Jaccard similarity: intersection / union
+  const intersection = new Set([...words1].filter(word => words2.has(word)));
+  const union = new Set([...words1, ...words2]);
+
+  const similarity = intersection.size / union.size;
+
+  return similarity >= threshold;
+}
+
+/**
+ * Extract unique sentences from text2 that are not semantically covered in text1
+ * Used to preserve novel information from pedagogical notes
+ *
+ * @param {string} text1 - Main text (reference)
+ * @param {string} text2 - Secondary text (to filter)
+ * @returns {string} - Sentences from text2 not covered in text1
+ */
+function extractUniqueSentences(text1, text2) {
+  const sentences2 = text2.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 20);
+  const uniqueSentences = [];
+
+  for (const sentence of sentences2) {
+    // Check if this sentence is already covered in text1
+    if (!detectSemanticOverlap(sentence, text1, 0.6)) {
+      uniqueSentences.push(sentence);
+    }
+  }
+
+  return uniqueSentences.join('. ');
+}
+
 /**
  * Analyze code block context and generate audio-appropriate description
  * Transforms code into natural language that preserves pedagogical value
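
A minimal usage sketch of the new helper (the example strings are invented for illustration):

// Near-identical wording: the two sentences share every word longer than 3 characters,
// so the Jaccard similarity is 1.0 and the default 0.75 threshold is cleared.
detectSemanticOverlap(
  "Autonomous mode's real advantage is working on multiple projects simultaneously",
  "Working on multiple projects simultaneously is autonomous mode's real advantage"
); // => true

// Unrelated wording: no words longer than 3 characters are shared, so similarity is 0.
detectSemanticOverlap(
  "Autonomous mode's real advantage is working on multiple projects simultaneously",
  "Semantic search tools like ChunkHound help navigate large codebases"
); // => false

// extractUniqueSentences applies the same check per sentence at a looser 0.6 threshold,
// keeping only the sentences of its second argument that are not already covered by the first.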
@@ -250,31 +307,148 @@ function parseMarkdownContent(filePath) {
   // Remove frontmatter
   let cleaned = content.replace(/^---[\s\S]*?---\n/, '');
 
-  // Remove JSX components (simple approach - remove anything with <>)
+  // DEDUPLICATION PHASE 1: Handle Deep Dive sections (<details> tags)
+  // Do this BEFORE removing HTML tags so we can detect and process them
+  // These often duplicate main explanations - extract and deduplicate first
+  const detailsRegex = /<details>\s*<summary>([\s\S]*?)<\/summary>\s*([\s\S]*?)<\/details>/gi;
+  let detailsMatch;
+  const detailsToProcess = [];
+
+  console.log(` 🔍 Scanning for Deep Dive sections...`);
+
+  while ((detailsMatch = detailsRegex.exec(cleaned)) !== null) {
+    detailsToProcess.push({
+      fullMatch: detailsMatch[0],
+      title: detailsMatch[1].trim(),
+      content: detailsMatch[2].trim(),
+      index: detailsMatch.index
+    });
+  }
+
+  // Process details sections in reverse order to maintain correct indices
+  for (let i = detailsToProcess.length - 1; i >= 0; i--) {
+    const detail = detailsToProcess[i];
+
+    // Extract broader context (1000 chars before - deep dives cover broader topics)
+    const contextStart = Math.max(0, detail.index - 1000);
+    const precedingContext = cleaned.substring(contextStart, detail.index);
+
+    // Check overlap with preceding content
+    const overlapHigh = detectSemanticOverlap(detail.content, precedingContext, 0.70);
+    const overlapMedium = detectSemanticOverlap(detail.content, precedingContext, 0.45);
+
+    let replacement;
+    if (overlapHigh) {
+      // >70% overlap: Deep dive is redundant, remove entirely
+      console.log(` 🔧 Deduplication: Removed redundant Deep Dive "${detail.title}" (>70% overlap with main content)`);
+      replacement = '';
+    } else if (overlapMedium) {
+      // 45-70% overlap: Keep only unique sentences
+      const uniqueContent = extractUniqueSentences(precedingContext, detail.content);
+      if (uniqueContent.length > 30) {
+        console.log(` 🔧 Deduplication: Condensed Deep Dive "${detail.title}" (45-70% overlap, kept unique parts)`);
+        replacement = `\n[DEEP DIVE: ${detail.title}]\n${uniqueContent}\n[END DEEP DIVE]\n`;
+      } else {
+        console.log(` 🔧 Deduplication: Removed Deep Dive "${detail.title}" (no unique content after filtering)`);
+        replacement = '';
+      }
+    } else {
+      // <45% overlap: Keep entire deep dive (genuinely new information)
+      replacement = `\n[DEEP DIVE: ${detail.title}]\n${detail.content}\n[END DEEP DIVE]\n`;
+    }
+
+    // Replace in cleaned content
+    cleaned = cleaned.substring(0, detail.index) +
+              replacement +
+              cleaned.substring(detail.index + detail.fullMatch.length);
+  }
+
+  console.log(` 🔍 Found ${detailsToProcess.length} Deep Dive section(s)`);
+
+  // DEDUPLICATION PHASE 2: Handle pedagogical note boxes (:::tip, :::warning, etc.)
+  // Extract and deduplicate against surrounding context to prevent repetition
+  console.log(` 🔍 Scanning for pedagogical note boxes...`);
+  // Match both formats: ":::tip Title" and ":::tip[Title]"
+  const pedagogicalNoteRegex = /:::(tip|warning|info|note|caution)\s*(?:\[([^\]]*)\]|([^\n]*))\s*\n([\s\S]*?)\n:::/gi;
+  let noteMatch;
+  const notesToProcess = [];
+
+  while ((noteMatch = pedagogicalNoteRegex.exec(cleaned)) !== null) {
+    notesToProcess.push({
+      fullMatch: noteMatch[0],
+      type: noteMatch[1],
+      title: (noteMatch[2] || noteMatch[3] || 'Note').trim(),
+      content: noteMatch[4].trim(),
+      index: noteMatch.index
+    });
+  }
+
+  // Process pedagogical notes in reverse order to maintain correct indices
+  for (let i = notesToProcess.length - 1; i >= 0; i--) {
+    const note = notesToProcess[i];
+
+    // Extract surrounding context (500 chars before and after)
+    const contextStart = Math.max(0, note.index - 500);
+    const contextEnd = Math.min(cleaned.length, note.index + note.fullMatch.length + 500);
+    const surroundingContext = cleaned.substring(contextStart, note.index) +
+                               cleaned.substring(note.index + note.fullMatch.length, contextEnd);
+
+    // Check overlap with surrounding context
+    const overlapHigh = detectSemanticOverlap(note.content, surroundingContext, 0.75);
+    const overlapMedium = detectSemanticOverlap(note.content, surroundingContext, 0.50);
+
+    let replacement;
+    if (overlapHigh) {
+      // >75% overlap: Completely redundant, remove entirely
+      console.log(` 🔧 Deduplication: Removed redundant ${note.type} note (>75% overlap)`);
+      replacement = '';
+    } else if (overlapMedium) {
+      // 50-75% overlap: Keep only unique sentences
+      const uniqueContent = extractUniqueSentences(surroundingContext, note.content);
+      if (uniqueContent.length > 20) {
+        console.log(` 🔧 Deduplication: Condensed ${note.type} note (50-75% overlap, kept unique parts)`);
+        replacement = `\n[PEDAGOGICAL ${note.type.toUpperCase()}: ${note.title}] ${uniqueContent}\n`;
+      } else {
+        console.log(` 🔧 Deduplication: Removed ${note.type} note (no unique content after filtering)`);
+        replacement = '';
+      }
+    } else {
+      // <50% overlap: Keep entire note (genuinely new information)
+      replacement = `\n[PEDAGOGICAL ${note.type.toUpperCase()}: ${note.title}]\n${note.content}\n[END NOTE]\n`;
+    }
+
+    // Replace in cleaned content
+    cleaned = cleaned.substring(0, note.index) +
+              replacement +
+              cleaned.substring(note.index + note.fullMatch.length);
+  }
+
+  console.log(` 🔍 Found ${notesToProcess.length} pedagogical note(s)`);
+
+  // NOW safe to remove remaining JSX/HTML components after deduplication
   cleaned = cleaned.replace(/<[^>]+>/g, '');
 
-  // First pass: Find all code blocks and their contexts BEFORE transformation
-  // This prevents context pollution from replaced blocks
+  // Process code blocks: Find all code blocks and their contexts
   const codeBlocks = [];
-  const regex = /```[\s\S]*?```/g;
-  let match;
+  const codeRegex = /```[\s\S]*?```/g;
+  let codeMatch;
 
-  while ((match = regex.exec(cleaned)) !== null) {
-    const precedingStart = Math.max(0, match.index - 200);
-    const precedingContext = cleaned.substring(precedingStart, match.index);
+  while ((codeMatch = codeRegex.exec(cleaned)) !== null) {
+    const precedingStart = Math.max(0, codeMatch.index - 200);
+    const precedingContext = cleaned.substring(precedingStart, codeMatch.index);
 
-    const followingEnd = Math.min(cleaned.length, match.index + match[0].length + 200);
-    const followingContext = cleaned.substring(match.index + match[0].length, followingEnd);
+    const followingEnd = Math.min(cleaned.length, codeMatch.index + codeMatch[0].length + 200);
+    const followingContext = cleaned.substring(codeMatch.index + codeMatch[0].length, followingEnd);
 
     codeBlocks.push({
-      original: match[0],
-      index: match.index,
+      original: codeMatch[0],
+      index: codeMatch.index,
       precedingContext,
      followingContext
    });
  }
 
-  // Second pass: Replace code blocks with descriptions
+  // Replace code blocks with descriptions
  let offset = 0;
  for (const block of codeBlocks) {
    const description = describeCodeBlock(block.original, block.precedingContext, block.followingContext);
@@ -299,14 +473,6 @@ function parseMarkdownContent(filePath) {
   // Remove HTML comments
   cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, '');
 
-  // Tag admonition boxes so Claude knows they're pedagogical reinforcement
-  // Match :::tip[Title], :::warning, :::info, etc.
-  cleaned = cleaned.replace(/:::(tip|warning|info|note|caution)\s*(?:\[([^\]]*)\])?\s*/gi, (match, type, title) => {
-    return `\n[PEDAGOGICAL ${type.toUpperCase()}: ${title || 'Note'}]\n`;
-  });
-  // Mark end of admonition boxes
-  cleaned = cleaned.replace(/^:::$/gm, '\n[END NOTE]\n');
-
   // Clean up excessive whitespace
   cleaned = cleaned.replace(/\n{3,}/g, '\n\n').trim();
 
@@ -499,63 +665,21 @@ EXAMPLE - GOOD (preserves specifics):
 switch to semantic search - tools like ChunkHound or Claude Context via MCP servers. Above 100,000 lines, you need
 ChunkHound's structured multi-hop traversal because autonomous agents start missing connections."
 
-CRITICAL: CONTENT DEDUPLICATION REQUIREMENTS
-
-The source material uses pedagogical reinforcement patterns designed for written learning:
-- Main explanatory text
-- [PEDAGOGICAL TIP/WARNING/INFO/NOTE] boxes that often RESTATE key concepts from the main text
-- Repeated summaries and transitions for visual learners
-
-For linear podcast format, you MUST deduplicate to avoid boring repetition:
-
-✓ SYNTHESIZE: When a concept appears in both main text and [PEDAGOGICAL NOTE] sections, create ONE clear, cohesive explanation that integrates both perspectives naturally
-✓ MERGE: Combine multiple mentions of the same idea into a single well-developed discussion with depth
-✓ ADVANCE: Only return to a concept if you're significantly advancing understanding with genuinely new context, examples, or implications
-✓ FLOW: Prioritize natural conversational progression over written pedagogical reinforcement patterns
-✓ TRIM: Remove redundant restatements - podcast listeners cannot re-read like readers can, so repetition feels tedious rather than reinforcing
-
-✗ DO NOT: Discuss the same point multiple times without adding substantial new insight
-✗ DO NOT: Treat [PEDAGOGICAL NOTE] sections as separate topics requiring their own discussion - they're reinforcement of concepts already covered in main text
-✗ DO NOT: Create circular discussions that return to the same idea repeatedly without meaningful progression
-✗ DO NOT: Feel obligated to cover every sentence - synthesize into the most compelling narrative
-✗ DO NOT: Use transition phrases like "going back to", "as I mentioned", "to circle back" - these signal repetition
-
-DETECT REPETITION PATTERNS - CRITICAL:
-
-Pattern 1: Main text + [PEDAGOGICAL TIP] saying the same thing differently
-Example:
-- Main text: "The real productivity gain is working on multiple projects simultaneously"
-- [PEDAGOGICAL TIP]: "Autonomous mode's power is parallel work, not speed per task"
-→ These are IDENTICAL concepts with different wording. Merge into ONE statement.
-→ DO NOT discuss "productivity gain" then later return to "parallel work advantage"
-
-Pattern 2: Synonym clusters that mean the same thing
-WATCH FOR THESE EQUIVALENT PHRASES:
-- "parallel work" = "working on multiple tasks simultaneously" = "three agents running" = "concurrent projects"
-- "10x productivity" = "actual game changer" = "real gain" = "where productivity explodes" = "genuine productivity improvement"
-- "speed per task" vs "throughput" vs "finishing faster" = same concept, different framing
-
-ANTI-REPETITION RULE:
-If you mention "parallel work" in one exchange, DO NOT return to "working on multiple tasks simultaneously"
-as if it's a new topic. You already covered it. Move forward to genuinely new insights.
-
-Pattern 3: Circular explanations
-BAD (circular):
-- Exchange 1: "Autonomous mode's advantage is you can work on three projects at once"
-- Exchange 3: "The real productivity gain isn't about speed"
-- Exchange 4: "The game changer is parallel work - running multiple agents simultaneously"
-→ This repeats the same point 3 times within close proximity
-
-GOOD (progressive):
-- Exchange 1: "Autonomous mode's advantage is parallel work - three projects simultaneously while living your life"
-- [Move to next concept - don't return to parallel work unless adding 50%+ new insight]
-
-EXAMPLE OF GOOD DEDUPLICATION:
-If main text explains "autonomous mode lets you work on multiple projects simultaneously" and a
-[PEDAGOGICAL TIP] says "the real 10x productivity gain is parallel work, not speed per task",
-synthesize these into ONE cohesive explanation: "Autonomous mode's real power isn't speed - it's
-working on three projects simultaneously while living your life. That's the actual 10x gain."
-Don't discuss autonomous mode, then later discuss "the real 10x gain" as if it's a separate concept.
+CRITICAL: CONTENT HAS BEEN PRE-DEDUPLICATED
+
+The source content has been programmatically deduplicated during preprocessing:
+- Redundant pedagogical note boxes (:::tip, :::warning) have been removed or condensed
+- Duplicate deep dive sections have been filtered out
+- Only unique information remains in [PEDAGOGICAL NOTE] and [DEEP DIVE] tags
+
+Your job is to transform this already-clean content into engaging dialog:
+
+✓ TRUST THE PREPROCESSING: Content is already deduplicated - focus on dialog quality
+✓ NATURAL FLOW: Create conversational progression without forced repetition checks
+✓ AVOID CIRCULAR PHRASES: Don't use "going back to", "as I mentioned", "to circle back"
+✓ PROGRESSIVE DISCUSSION: Each exchange should advance understanding, not restate
+
+The heavy lifting of deduplication is done. Focus on creating engaging, technically accurate dialog.
 
 OUTPUT FORMAT:
 Use clear speaker labels followed by natural dialog. Structure your output within XML tags:
@@ -724,7 +848,7 @@ function validateTechnicalDepth(dialog, sourceContent) {
 }
 
 /**
- * Validate dialog for repetition patterns
+ * Validate dialog for repetition patterns using semantic similarity
  */
 function validateDialogQuality(dialog) {
   const warnings = [];
@@ -736,34 +860,30 @@ function validateDialogQuality(dialog) {
     return warnings; // Can't validate empty dialog
   }
 
-  // Key phrases to watch for repetition (case-insensitive)
-  const keyPhrases = [
-    'parallel work',
-    '10x productivity',
-    'autonomous mode',
-    'game changer',
-    'real gain',
-    'actual productivity',
-    'speed per task',
-    'throughput',
-    'multiple projects',
-    'three agents'
-  ];
+  // SEMANTIC REPETITION DETECTION: Check for exchanges covering the same concepts
+  // Use sliding window of 10 exchanges (broader than old 5-exchange window)
+  const windowSize = 10;
+  const similarityThreshold = 0.65; // 65% semantic overlap = likely repetition
+
+  for (let i = 0; i < exchanges.length - 2; i++) {
+    const currentExchange = exchanges[i];
 
-  // Check for repeated key phrases within close proximity (sliding window of 5 exchanges)
-  const windowSize = 5;
-  for (let i = 0; i < exchanges.length - windowSize + 1; i++) {
-    const window = exchanges.slice(i, i + windowSize).join(' ').toLowerCase();
+    // Check if this exchange is semantically similar to any of the next few exchanges
+    for (let j = i + 2; j < Math.min(i + windowSize, exchanges.length); j++) {
+      const laterExchange = exchanges[j];
 
-    for (const phrase of keyPhrases) {
-      const regex = new RegExp(phrase, 'gi');
-      const occurrences = (window.match(regex) || []).length;
+      if (detectSemanticOverlap(currentExchange, laterExchange, similarityThreshold)) {
+        // Extract preview of both exchanges (first 60 chars)
+        const preview1 = currentExchange.substring(0, 60).replace(/\n/g, ' ') + '...';
+        const preview2 = laterExchange.substring(0, 60).replace(/\n/g, ' ') + '...';
 
-      if (occurrences >= 3) {
         warnings.push(
-          `Potential repetition: "${phrase}" appears ${occurrences} times within 5 exchanges ` +
-          `(exchanges ${i + 1}-${i + windowSize})`
+          `⚠️ Semantic repetition detected:\n` +
+          ` Exchange ${i + 1}: "${preview1}"\n` +
+          ` Exchange ${j + 1}: "${preview2}"\n` +
+          ` (>65% semantic overlap - likely discussing same concept)`
         );
+        break; // Only report first occurrence for this exchange
      }
    }
  }
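
For intuition about the widened window, a small sketch of the index pairs the new nested loops visit (illustration only; it assumes exchanges is an array of speaker-turn strings, which is how the surrounding function treats it):

// With 6 exchanges and windowSize = 10, the loops skip the immediately following
// exchange (j starts at i + 2) and compare these pairs:
// (0,2) (0,3) (0,4) (0,5)  (1,3) (1,4) (1,5)  (2,4) (2,5)  (3,5)
const pairs = [];
const total = 6;
const windowSize = 10;
for (let i = 0; i < total - 2; i++) {
  for (let j = i + 2; j < Math.min(i + windowSize, total); j++) {
    pairs.push([i, j]);
  }
}
console.log(pairs.length); // 10 - each pair goes through detectSemanticOverlap at the 0.65 threshold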
@@ -782,7 +902,7 @@ function validateDialogQuality(dialog) {
   for (const phrase of circularPhrases) {
     if (fullText.includes(phrase)) {
       warnings.push(
-        `Circular transition detected: "${phrase}" - this often signals unnecessary repetition`
+        `⚠️ Circular transition detected: "${phrase}" - this often signals unnecessary repetition`
       );
     }
   }
