@@ -86,6 +86,63 @@ function parseArgs() {
8686// SCRIPT GENERATION - From generate-podcast-script.js
8787// ============================================================================
8888
89+ /**
90+ * Calculate semantic overlap between two text segments using word-based Jaccard similarity
91+ * This is a lightweight approach that doesn't require external NLP libraries
92+ *
93+ * @param {string } text1 - First text segment
94+ * @param {string } text2 - Second text segment
95+ * @param {number } threshold - Similarity threshold (0-1), default 0.75
96+ * @returns {boolean } - True if texts are semantically similar above threshold
97+ */
98+ function detectSemanticOverlap ( text1 , text2 , threshold = 0.75 ) {
99+ // Normalize: lowercase, remove punctuation, split into words
100+ const normalize = ( text ) => {
101+ return text
102+ . toLowerCase ( )
103+ . replace ( / [ ^ \w \s ] / g, ' ' )
104+ . split ( / \s + / )
105+ . filter ( word => word . length > 3 ) ; // Ignore short words (a, the, is, etc.)
106+ } ;
107+
108+ const words1 = new Set ( normalize ( text1 ) ) ;
109+ const words2 = new Set ( normalize ( text2 ) ) ;
110+
111+ if ( words1 . size === 0 || words2 . size === 0 ) {
112+ return false ;
113+ }
114+
115+ // Jaccard similarity: intersection / union
116+ const intersection = new Set ( [ ...words1 ] . filter ( word => words2 . has ( word ) ) ) ;
117+ const union = new Set ( [ ...words1 , ...words2 ] ) ;
118+
119+ const similarity = intersection . size / union . size ;
120+
121+ return similarity >= threshold ;
122+ }
123+
/**
 * Filter text2 down to the sentences that are not already semantically
 * covered by text1. Used to keep only the novel parts of pedagogical notes
 * and deep-dive sections when they partially duplicate the main content.
 *
 * @param {string} text1 - Reference text (the main content)
 * @param {string} text2 - Candidate text whose sentences are filtered
 * @returns {string} Sentences from text2 absent from text1, joined with '. '
 */
function extractUniqueSentences(text1, text2) {
  // Split on sentence terminators and keep only substantive sentences
  // (longer than 20 chars) — fragments and headings are not worth preserving.
  const candidates = text2
    .split(/[.!?]+/)
    .map((sentence) => sentence.trim())
    .filter((sentence) => sentence.length > 20);

  // A sentence is "novel" when it does not overlap the reference text at the
  // (looser) 0.6 threshold.
  const novel = candidates.filter(
    (sentence) => !detectSemanticOverlap(sentence, text1, 0.6)
  );

  return novel.join('. ');
}
145+
89146/**
90147 * Analyze code block context and generate audio-appropriate description
91148 * Transforms code into natural language that preserves pedagogical value
@@ -250,31 +307,148 @@ function parseMarkdownContent(filePath) {
250307 // Remove frontmatter
251308 let cleaned = content . replace ( / ^ - - - [ \s \S ] * ?- - - \n / , '' ) ;
252309
253- // Remove JSX components (simple approach - remove anything with <>)
310+ // DEDUPLICATION PHASE 1: Handle Deep Dive sections (<details> tags)
311+ // Do this BEFORE removing HTML tags so we can detect and process them
312+ // These often duplicate main explanations - extract and deduplicate first
313+ const detailsRegex = / < d e t a i l s > \s * < s u m m a r y > ( [ \s \S ] * ?) < \/ s u m m a r y > \s * ( [ \s \S ] * ?) < \/ d e t a i l s > / gi;
314+ let detailsMatch ;
315+ const detailsToProcess = [ ] ;
316+
317+ console . log ( ` 🔍 Scanning for Deep Dive sections...` ) ;
318+
319+ while ( ( detailsMatch = detailsRegex . exec ( cleaned ) ) !== null ) {
320+ detailsToProcess . push ( {
321+ fullMatch : detailsMatch [ 0 ] ,
322+ title : detailsMatch [ 1 ] . trim ( ) ,
323+ content : detailsMatch [ 2 ] . trim ( ) ,
324+ index : detailsMatch . index
325+ } ) ;
326+ }
327+
328+ // Process details sections in reverse order to maintain correct indices
329+ for ( let i = detailsToProcess . length - 1 ; i >= 0 ; i -- ) {
330+ const detail = detailsToProcess [ i ] ;
331+
332+ // Extract broader context (1000 chars before - deep dives cover broader topics)
333+ const contextStart = Math . max ( 0 , detail . index - 1000 ) ;
334+ const precedingContext = cleaned . substring ( contextStart , detail . index ) ;
335+
336+ // Check overlap with preceding content
337+ const overlapHigh = detectSemanticOverlap ( detail . content , precedingContext , 0.70 ) ;
338+ const overlapMedium = detectSemanticOverlap ( detail . content , precedingContext , 0.45 ) ;
339+
340+ let replacement ;
341+ if ( overlapHigh ) {
342+ // >70% overlap: Deep dive is redundant, remove entirely
343+ console . log ( ` 🔧 Deduplication: Removed redundant Deep Dive "${ detail . title } " (>70% overlap with main content)` ) ;
344+ replacement = '' ;
345+ } else if ( overlapMedium ) {
346+ // 45-70% overlap: Keep only unique sentences
347+ const uniqueContent = extractUniqueSentences ( precedingContext , detail . content ) ;
348+ if ( uniqueContent . length > 30 ) {
349+ console . log ( ` 🔧 Deduplication: Condensed Deep Dive "${ detail . title } " (45-70% overlap, kept unique parts)` ) ;
350+ replacement = `\n[DEEP DIVE: ${ detail . title } ]\n${ uniqueContent } \n[END DEEP DIVE]\n` ;
351+ } else {
352+ console . log ( ` 🔧 Deduplication: Removed Deep Dive "${ detail . title } " (no unique content after filtering)` ) ;
353+ replacement = '' ;
354+ }
355+ } else {
356+ // <45% overlap: Keep entire deep dive (genuinely new information)
357+ replacement = `\n[DEEP DIVE: ${ detail . title } ]\n${ detail . content } \n[END DEEP DIVE]\n` ;
358+ }
359+
360+ // Replace in cleaned content
361+ cleaned = cleaned . substring ( 0 , detail . index ) +
362+ replacement +
363+ cleaned . substring ( detail . index + detail . fullMatch . length ) ;
364+ }
365+
366+ console . log ( ` 🔍 Found ${ detailsToProcess . length } Deep Dive section(s)` ) ;
367+
368+ // DEDUPLICATION PHASE 2: Handle pedagogical note boxes (:::tip, :::warning, etc.)
369+ // Extract and deduplicate against surrounding context to prevent repetition
370+ console . log ( ` 🔍 Scanning for pedagogical note boxes...` ) ;
371+ // Match both formats: ":::tip Title" and ":::tip[Title]"
372+ const pedagogicalNoteRegex = / : : : ( t i p | w a r n i n g | i n f o | n o t e | c a u t i o n ) \s * (?: \[ ( [ ^ \] ] * ) \] | ( [ ^ \n ] * ) ) \s * \n ( [ \s \S ] * ?) \n : : : / gi;
373+ let noteMatch ;
374+ const notesToProcess = [ ] ;
375+
376+ while ( ( noteMatch = pedagogicalNoteRegex . exec ( cleaned ) ) !== null ) {
377+ notesToProcess . push ( {
378+ fullMatch : noteMatch [ 0 ] ,
379+ type : noteMatch [ 1 ] ,
380+ title : ( noteMatch [ 2 ] || noteMatch [ 3 ] || 'Note' ) . trim ( ) ,
381+ content : noteMatch [ 4 ] . trim ( ) ,
382+ index : noteMatch . index
383+ } ) ;
384+ }
385+
386+ // Process pedagogical notes in reverse order to maintain correct indices
387+ for ( let i = notesToProcess . length - 1 ; i >= 0 ; i -- ) {
388+ const note = notesToProcess [ i ] ;
389+
390+ // Extract surrounding context (500 chars before and after)
391+ const contextStart = Math . max ( 0 , note . index - 500 ) ;
392+ const contextEnd = Math . min ( cleaned . length , note . index + note . fullMatch . length + 500 ) ;
393+ const surroundingContext = cleaned . substring ( contextStart , note . index ) +
394+ cleaned . substring ( note . index + note . fullMatch . length , contextEnd ) ;
395+
396+ // Check overlap with surrounding context
397+ const overlapHigh = detectSemanticOverlap ( note . content , surroundingContext , 0.75 ) ;
398+ const overlapMedium = detectSemanticOverlap ( note . content , surroundingContext , 0.50 ) ;
399+
400+ let replacement ;
401+ if ( overlapHigh ) {
402+ // >75% overlap: Completely redundant, remove entirely
403+ console . log ( ` 🔧 Deduplication: Removed redundant ${ note . type } note (>75% overlap)` ) ;
404+ replacement = '' ;
405+ } else if ( overlapMedium ) {
406+ // 50-75% overlap: Keep only unique sentences
407+ const uniqueContent = extractUniqueSentences ( surroundingContext , note . content ) ;
408+ if ( uniqueContent . length > 20 ) {
409+ console . log ( ` 🔧 Deduplication: Condensed ${ note . type } note (50-75% overlap, kept unique parts)` ) ;
410+ replacement = `\n[PEDAGOGICAL ${ note . type . toUpperCase ( ) } : ${ note . title } ] ${ uniqueContent } \n` ;
411+ } else {
412+ console . log ( ` 🔧 Deduplication: Removed ${ note . type } note (no unique content after filtering)` ) ;
413+ replacement = '' ;
414+ }
415+ } else {
416+ // <50% overlap: Keep entire note (genuinely new information)
417+ replacement = `\n[PEDAGOGICAL ${ note . type . toUpperCase ( ) } : ${ note . title } ]\n${ note . content } \n[END NOTE]\n` ;
418+ }
419+
420+ // Replace in cleaned content
421+ cleaned = cleaned . substring ( 0 , note . index ) +
422+ replacement +
423+ cleaned . substring ( note . index + note . fullMatch . length ) ;
424+ }
425+
426+ console . log ( ` 🔍 Found ${ notesToProcess . length } pedagogical note(s)` ) ;
427+
428+ // NOW safe to remove remaining JSX/HTML components after deduplication
254429 cleaned = cleaned . replace ( / < [ ^ > ] + > / g, '' ) ;
255430
256- // First pass: Find all code blocks and their contexts BEFORE transformation
257- // This prevents context pollution from replaced blocks
431+ // Process code blocks: Find all code blocks and their contexts
258432 const codeBlocks = [ ] ;
259- const regex = / ` ` ` [ \s \S ] * ?` ` ` / g;
260- let match ;
433+ const codeRegex = / ` ` ` [ \s \S ] * ?` ` ` / g;
434+ let codeMatch ;
261435
262- while ( ( match = regex . exec ( cleaned ) ) !== null ) {
263- const precedingStart = Math . max ( 0 , match . index - 200 ) ;
264- const precedingContext = cleaned . substring ( precedingStart , match . index ) ;
436+ while ( ( codeMatch = codeRegex . exec ( cleaned ) ) !== null ) {
437+ const precedingStart = Math . max ( 0 , codeMatch . index - 200 ) ;
438+ const precedingContext = cleaned . substring ( precedingStart , codeMatch . index ) ;
265439
266- const followingEnd = Math . min ( cleaned . length , match . index + match [ 0 ] . length + 200 ) ;
267- const followingContext = cleaned . substring ( match . index + match [ 0 ] . length , followingEnd ) ;
440+ const followingEnd = Math . min ( cleaned . length , codeMatch . index + codeMatch [ 0 ] . length + 200 ) ;
441+ const followingContext = cleaned . substring ( codeMatch . index + codeMatch [ 0 ] . length , followingEnd ) ;
268442
269443 codeBlocks . push ( {
270- original : match [ 0 ] ,
271- index : match . index ,
444+ original : codeMatch [ 0 ] ,
445+ index : codeMatch . index ,
272446 precedingContext,
273447 followingContext
274448 } ) ;
275449 }
276450
277- // Second pass: Replace code blocks with descriptions
451+ // Replace code blocks with descriptions
278452 let offset = 0 ;
279453 for ( const block of codeBlocks ) {
280454 const description = describeCodeBlock ( block . original , block . precedingContext , block . followingContext ) ;
@@ -299,14 +473,6 @@ function parseMarkdownContent(filePath) {
299473 // Remove HTML comments
300474 cleaned = cleaned . replace ( / < ! - - [ \s \S ] * ?- - > / g, '' ) ;
301475
302- // Tag admonition boxes so Claude knows they're pedagogical reinforcement
303- // Match :::tip[Title], :::warning, :::info, etc.
304- cleaned = cleaned . replace ( / : : : ( t i p | w a r n i n g | i n f o | n o t e | c a u t i o n ) \s * (?: \[ ( [ ^ \] ] * ) \] ) ? \s * / gi, ( match , type , title ) => {
305- return `\n[PEDAGOGICAL ${ type . toUpperCase ( ) } : ${ title || 'Note' } ]\n` ;
306- } ) ;
307- // Mark end of admonition boxes
308- cleaned = cleaned . replace ( / ^ : : : $ / gm, '\n[END NOTE]\n' ) ;
309-
310476 // Clean up excessive whitespace
311477 cleaned = cleaned . replace ( / \n { 3 , } / g, '\n\n' ) . trim ( ) ;
312478
@@ -499,63 +665,21 @@ EXAMPLE - GOOD (preserves specifics):
499665switch to semantic search - tools like ChunkHound or Claude Context via MCP servers. Above 100,000 lines, you need
500666ChunkHound's structured multi-hop traversal because autonomous agents start missing connections."
501667
502- CRITICAL: CONTENT DEDUPLICATION REQUIREMENTS
503-
504- The source material uses pedagogical reinforcement patterns designed for written learning:
505- - Main explanatory text
506- - [PEDAGOGICAL TIP/WARNING/INFO/NOTE] boxes that often RESTATE key concepts from the main text
507- - Repeated summaries and transitions for visual learners
508-
509- For linear podcast format, you MUST deduplicate to avoid boring repetition:
510-
511- ✓ SYNTHESIZE: When a concept appears in both main text and [PEDAGOGICAL NOTE] sections, create ONE clear, cohesive explanation that integrates both perspectives naturally
512- ✓ MERGE: Combine multiple mentions of the same idea into a single well-developed discussion with depth
513- ✓ ADVANCE: Only return to a concept if you're significantly advancing understanding with genuinely new context, examples, or implications
514- ✓ FLOW: Prioritize natural conversational progression over written pedagogical reinforcement patterns
515- ✓ TRIM: Remove redundant restatements - podcast listeners cannot re-read like readers can, so repetition feels tedious rather than reinforcing
516-
517- ✗ DO NOT: Discuss the same point multiple times without adding substantial new insight
518- ✗ DO NOT: Treat [PEDAGOGICAL NOTE] sections as separate topics requiring their own discussion - they're reinforcement of concepts already covered in main text
519- ✗ DO NOT: Create circular discussions that return to the same idea repeatedly without meaningful progression
520- ✗ DO NOT: Feel obligated to cover every sentence - synthesize into the most compelling narrative
521- ✗ DO NOT: Use transition phrases like "going back to", "as I mentioned", "to circle back" - these signal repetition
522-
523- DETECT REPETITION PATTERNS - CRITICAL:
524-
525- Pattern 1: Main text + [PEDAGOGICAL TIP] saying the same thing differently
526- Example:
527- - Main text: "The real productivity gain is working on multiple projects simultaneously"
528- - [PEDAGOGICAL TIP]: "Autonomous mode's power is parallel work, not speed per task"
529- → These are IDENTICAL concepts with different wording. Merge into ONE statement.
530- → DO NOT discuss "productivity gain" then later return to "parallel work advantage"
531-
532- Pattern 2: Synonym clusters that mean the same thing
533- WATCH FOR THESE EQUIVALENT PHRASES:
534- - "parallel work" = "working on multiple tasks simultaneously" = "three agents running" = "concurrent projects"
535- - "10x productivity" = "actual game changer" = "real gain" = "where productivity explodes" = "genuine productivity improvement"
536- - "speed per task" vs "throughput" vs "finishing faster" = same concept, different framing
537-
538- ANTI-REPETITION RULE:
539- If you mention "parallel work" in one exchange, DO NOT return to "working on multiple tasks simultaneously"
540- as if it's a new topic. You already covered it. Move forward to genuinely new insights.
541-
542- Pattern 3: Circular explanations
543- BAD (circular):
544- - Exchange 1: "Autonomous mode's advantage is you can work on three projects at once"
545- - Exchange 3: "The real productivity gain isn't about speed"
546- - Exchange 4: "The game changer is parallel work - running multiple agents simultaneously"
547- → This repeats the same point 3 times within close proximity
548-
549- GOOD (progressive):
550- - Exchange 1: "Autonomous mode's advantage is parallel work - three projects simultaneously while living your life"
551- - [Move to next concept - don't return to parallel work unless adding 50%+ new insight]
552-
553- EXAMPLE OF GOOD DEDUPLICATION:
554- If main text explains "autonomous mode lets you work on multiple projects simultaneously" and a
555- [PEDAGOGICAL TIP] says "the real 10x productivity gain is parallel work, not speed per task",
556- synthesize these into ONE cohesive explanation: "Autonomous mode's real power isn't speed - it's
557- working on three projects simultaneously while living your life. That's the actual 10x gain."
558- Don't discuss autonomous mode, then later discuss "the real 10x gain" as if it's a separate concept.
668+ CRITICAL: CONTENT HAS BEEN PRE-DEDUPLICATED
669+
670+ The source content has been programmatically deduplicated during preprocessing:
671+ - Redundant pedagogical note boxes (:::tip, :::warning) have been removed or condensed
672+ - Duplicate deep dive sections have been filtered out
673+ - Only unique information remains in [PEDAGOGICAL NOTE] and [DEEP DIVE] tags
674+
675+ Your job is to transform this already-clean content into engaging dialog:
676+
677+ ✓ TRUST THE PREPROCESSING: Content is already deduplicated - focus on dialog quality
678+ ✓ NATURAL FLOW: Create conversational progression without forced repetition checks
679+ ✓ AVOID CIRCULAR PHRASES: Don't use "going back to", "as I mentioned", "to circle back"
680+ ✓ PROGRESSIVE DISCUSSION: Each exchange should advance understanding, not restate
681+
682+ The heavy lifting of deduplication is done. Focus on creating engaging, technically accurate dialog.
559683
560684OUTPUT FORMAT:
561685Use clear speaker labels followed by natural dialog. Structure your output within XML tags:
@@ -724,7 +848,7 @@ function validateTechnicalDepth(dialog, sourceContent) {
724848}
725849
726850/**
727- * Validate dialog for repetition patterns
851+ * Validate dialog for repetition patterns using semantic similarity
728852 */
729853function validateDialogQuality ( dialog ) {
730854 const warnings = [ ] ;
@@ -736,34 +860,30 @@ function validateDialogQuality(dialog) {
736860 return warnings ; // Can't validate empty dialog
737861 }
738862
739- // Key phrases to watch for repetition (case-insensitive)
740- const keyPhrases = [
741- 'parallel work' ,
742- '10x productivity' ,
743- 'autonomous mode' ,
744- 'game changer' ,
745- 'real gain' ,
746- 'actual productivity' ,
747- 'speed per task' ,
748- 'throughput' ,
749- 'multiple projects' ,
750- 'three agents'
751- ] ;
863+ // SEMANTIC REPETITION DETECTION: Check for exchanges covering the same concepts
864+ // Use sliding window of 10 exchanges (broader than old 5-exchange window)
865+ const windowSize = 10 ;
866+ const similarityThreshold = 0.65 ; // 65% semantic overlap = likely repetition
867+
868+ for ( let i = 0 ; i < exchanges . length - 2 ; i ++ ) {
869+ const currentExchange = exchanges [ i ] ;
752870
753- // Check for repeated key phrases within close proximity (sliding window of 5 exchanges)
754- const windowSize = 5 ;
755- for ( let i = 0 ; i < exchanges . length - windowSize + 1 ; i ++ ) {
756- const window = exchanges . slice ( i , i + windowSize ) . join ( ' ' ) . toLowerCase ( ) ;
871+ // Check if this exchange is semantically similar to any of the next few exchanges
872+ for ( let j = i + 2 ; j < Math . min ( i + windowSize , exchanges . length ) ; j ++ ) {
873+ const laterExchange = exchanges [ j ] ;
757874
758- for ( const phrase of keyPhrases ) {
759- const regex = new RegExp ( phrase , 'gi' ) ;
760- const occurrences = ( window . match ( regex ) || [ ] ) . length ;
875+ if ( detectSemanticOverlap ( currentExchange , laterExchange , similarityThreshold ) ) {
876+ // Extract preview of both exchanges (first 60 chars)
877+ const preview1 = currentExchange . substring ( 0 , 60 ) . replace ( / \n / g, ' ' ) + '...' ;
878+ const preview2 = laterExchange . substring ( 0 , 60 ) . replace ( / \n / g, ' ' ) + '...' ;
761879
762- if ( occurrences >= 3 ) {
763880 warnings . push (
764- `Potential repetition: "${ phrase } " appears ${ occurrences } times within 5 exchanges ` +
765- `(exchanges ${ i + 1 } -${ i + windowSize } )`
881+ `⚠️ Semantic repetition detected:\n` +
882+ ` Exchange ${ i + 1 } : "${ preview1 } "\n` +
883+ ` Exchange ${ j + 1 } : "${ preview2 } "\n` +
884+ ` (>65% semantic overlap - likely discussing same concept)`
766885 ) ;
886+ break ; // Only report first occurrence for this exchange
767887 }
768888 }
769889 }
@@ -782,7 +902,7 @@ function validateDialogQuality(dialog) {
782902 for ( const phrase of circularPhrases ) {
783903 if ( fullText . includes ( phrase ) ) {
784904 warnings . push (
785- `Circular transition detected: "${ phrase } " - this often signals unnecessary repetition`
905+ `⚠️ Circular transition detected: "${ phrase } " - this often signals unnecessary repetition`
786906 ) ;
787907 }
788908 }
0 commit comments