Add radio show audio processor and post-show workflow

- Audio processor CLI tool with 6-stage pipeline: transcribe (faster-whisper GPU), diarize (pyannote), detect segments (multi-signal classifier), remove commercials, split segments, analyze content (Ollama)
- Post-show workflow doc for episode posts, forum threads, deep-dive blog posts
- Training plan for using 579-episode archive for voice profiles and commercial detection
- Successful test: 45min episode transcribed in 2:37 on RTX 5070 Ti
- Sample transcript output from S7E30 (March 2015)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 11:51:59 -07:00
parent a8c8c6b7b6
commit a1e0442d8b
17 changed files with 58344 additions and 0 deletions
--- a/projects/radio-show/audio-processor/src/segment_detector.py
+++ b/projects/radio-show/audio-processor/src/segment_detector.py
@@ -0,0 +1,419 @@
+"""Stage 3: Segment detection — multi-signal commercial/show content classifier."""
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from enum import Enum
+
+import numpy as np
+from rich.console import Console
+from rich.table import Table
+
+console = Console()
+
+
+class SegmentType(Enum):
+    """Categories a span of audio can be classified as.
+
+    The string values are what ``SegmentDetectionResult.to_dict`` writes
+    under the ``"type"`` key of each segment record.
+    """
+
+    # Assigned by SegmentDetector.detect when the combined commercial score
+    # is below the configured confidence threshold.
+    SHOW_CONTENT = "show_content"
+    # Assigned by SegmentDetector.detect when the combined commercial score
+    # reaches the configured confidence threshold.
+    COMMERCIAL = "commercial"
+    # Not assigned anywhere in this module; only filtered on and displayed.
+    SHOW_ELEMENT = "show_element"  # intro, outro, bumper
+    # Not assigned anywhere in this module; only displayed.
+    SILENCE = "silence"
+    # Initial type of a candidate segment before it has been scored.
+    UNKNOWN = "unknown"
+
+
+@dataclass
+class DetectedSegment:
+    """A contiguous span of the audio and the classification assigned to it.
+
+    Candidates are created as ``SegmentType.UNKNOWN`` with a confidence of
+    0.0 and are then filled in by ``SegmentDetector.detect``.
+    """
+
+    start: float  # offset from the beginning of the audio, in seconds
+    end: float  # offset from the beginning of the audio, in seconds
+    segment_type: SegmentType
+    # SegmentDetector.detect stores the combined weighted *commercial* score
+    # here (higher = more likely commercial), regardless of the final type.
+    confidence: float
+    label: str = ""  # "Segment 1: The Week That Was", "Commercial Break 1", etc.
+    signals: dict = None  # Individual signal scores
+
+    def __post_init__(self):
+        # A None default plus this hook gives every instance its own dict
+        # rather than one mutable default shared between instances.
+        if self.signals is None:
+            self.signals = {}
+
+    @property
+    def duration(self) -> float:
+        """Length of the segment in seconds (``end - start``)."""
+        return self.end - self.start
+
+
+@dataclass
+class SegmentDetectionResult:
+    """Outcome of one detection run.
+
+    ``segments`` is the full ordered list; the three ``*_segments`` lists are
+    the same objects filtered by type, and the two totals are summed
+    durations in seconds.
+    """
+
+    segments: list[DetectedSegment]
+    show_segments: list[DetectedSegment]
+    commercial_segments: list[DetectedSegment]
+    element_segments: list[DetectedSegment]
+    total_show_time: float
+    total_commercial_time: float
+
+    def to_dict(self) -> dict:
+        """Return a JSON-serialisable summary of the totals and all segments."""
+
+        def _segment_record(seg) -> dict:
+            # One flat record per segment; the enum is stored by its value.
+            return {
+                "start": seg.start,
+                "end": seg.end,
+                "type": seg.segment_type.value,
+                "confidence": seg.confidence,
+                "label": seg.label,
+                "signals": seg.signals,
+            }
+
+        return {
+            "total_show_time": self.total_show_time,
+            "total_commercial_time": self.total_commercial_time,
+            "segments": [_segment_record(seg) for seg in self.segments],
+        }
+
+    def save(self, output_dir: Path):
+        """Write ``detection-report.json`` into *output_dir*, creating it if needed."""
+        output_dir.mkdir(parents=True, exist_ok=True)
+        report_path = output_dir / "detection-report.json"
+        with open(report_path, "w") as handle:
+            json.dump(self.to_dict(), handle, indent=2)
+
+    def print_summary(self):
+        """Print a table of all segments followed by the show/commercial totals."""
+        # Rich markup for the "Type" cell, keyed by segment type.
+        markup_by_type = {
+            SegmentType.SHOW_CONTENT: "[green]SHOW[/green]",
+            SegmentType.COMMERCIAL: "[red]COMMERCIAL[/red]",
+            SegmentType.SHOW_ELEMENT: "[blue]ELEMENT[/blue]",
+            SegmentType.SILENCE: "[dim]SILENCE[/dim]",
+            SegmentType.UNKNOWN: "[yellow]UNKNOWN[/yellow]",
+        }
+
+        table = Table(title="Segment Detection Results")
+        styled_columns = (
+            ("Time", "cyan"),
+            ("Duration", "magenta"),
+            ("Type", "green"),
+            ("Confidence", "yellow"),
+        )
+        for heading, colour in styled_columns:
+            table.add_column(heading, style=colour)
+        table.add_column("Label")
+
+        for seg in self.segments:
+            table.add_row(
+                _format_time(seg.start),
+                f"{seg.duration:.0f}s",
+                markup_by_type.get(seg.segment_type, str(seg.segment_type)),
+                f"{seg.confidence:.2f}",
+                seg.label,
+            )
+
+        console.print(table)
+        console.print(f"\nShow content: {self.total_show_time / 60:.1f} min")
+        console.print(f"Commercials: {self.total_commercial_time / 60:.1f} min")
+
+
+def _format_time(seconds: float) -> str:
+    """Format an offset in seconds as zero-padded ``MM:SS``.
+
+    Fractions of a second are truncated, and minutes are not wrapped into
+    hours (3600 seconds renders as ``60:00``).
+    """
+    minutes, remainder = divmod(seconds, 60)
+    return f"{int(minutes):02d}:{int(remainder):02d}"
+
+
+class SegmentDetector:
+    """Multi-signal commercial/show content detector."""
+
+    def __init__(self, config):
+        """Store the configuration and keep a shortcut to the signal weights.
+
+        Args:
+            config: object exposing a ``segment_detection`` attribute, whose
+                ``weights`` attribute holds the per-signal weights.
+        """
+        self.config = config
+        detection_settings = config.segment_detection
+        self.weights = detection_settings.weights
+
+    def detect(self, audio_path: Path, transcript=None, diarization=None,
+               show_prep=None) -> SegmentDetectionResult:
+        """Classify an audio file into show-content and commercial segments.
+
+        The audio is cut at detected silences, each resulting candidate is
+        scored by four signals, and the weighted sum of those scores is
+        compared against ``config.segment_detection.confidence_threshold``.
+
+        Args:
+            audio_path: file to analyse; decoded through ``_load_audio``.
+            transcript: optional transcript; when falsy the structural signal
+                is the neutral 0.5.
+            diarization: optional diarization; when falsy the speaker signal
+                is the neutral 0.5.
+            show_prep: optional show prep; segments are only labelled when
+                this is truthy.
+
+        Returns:
+            The detection result, which is also printed as a summary table.
+        """
+        console.print(f"[bold]Detecting segments:[/bold] {audio_path.name}")
+
+        # Decode once; every audio-based step works on the same samples.
+        audio_data, sample_rate = self._load_audio(audio_path)
+        duration = len(audio_data) / sample_rate
+
+        # Silence gaps provide the candidate cut points.
+        boundaries = self._detect_silence_boundaries(audio_data, sample_rate)
+        console.print(f"[dim]Found {len(boundaries)} silence boundaries[/dim]")
+
+        candidates = self._create_candidate_segments(boundaries, duration)
+
+        for candidate in candidates:
+            # Key order matters: it is the order written to the JSON report.
+            scores = {
+                # Signal 1: fingerprint matching (if library available)
+                "fingerprint": self._score_fingerprint(
+                    audio_data, sample_rate, candidate
+                ),
+                # Signal 2: speaker identity, neutral without diarization
+                "speaker": (
+                    self._score_speaker_identity(diarization, candidate)
+                    if diarization else 0.5
+                ),
+                # Signal 3: audio characteristics
+                "audio_chars": self._score_audio_characteristics(
+                    audio_data, sample_rate, candidate
+                ),
+                # Signal 4: structural heuristics, neutral without transcript
+                "structural": (
+                    self._score_structural(transcript, candidate)
+                    if transcript else 0.5
+                ),
+            }
+
+            # Weighted combination (higher = more likely commercial).
+            weights = self.weights
+            commercial_score = (
+                weights.fingerprint_match * scores["fingerprint"]
+                + weights.speaker_identity * scores["speaker"]
+                + weights.audio_characteristics * scores["audio_chars"]
+                + weights.structural_heuristic * scores["structural"]
+            )
+
+            candidate.signals = scores
+            candidate.confidence = commercial_score
+
+            reaches_threshold = (
+                commercial_score >= self.config.segment_detection.confidence_threshold
+            )
+            candidate.segment_type = (
+                SegmentType.COMMERCIAL if reaches_threshold
+                else SegmentType.SHOW_CONTENT
+            )
+
+        # Merge same-type neighbours first, then demote too-short commercials.
+        # NOTE(review): segments demoted by the constraint step are not merged
+        # again with neighbouring show segments - confirm this is intended.
+        final = self._apply_constraints(self._merge_adjacent(candidates))
+
+        if show_prep:
+            self._label_from_prep(final, transcript, show_prep)
+
+        def _of_type(kind):
+            return [seg for seg in final if seg.segment_type == kind]
+
+        show_segs = _of_type(SegmentType.SHOW_CONTENT)
+        comm_segs = _of_type(SegmentType.COMMERCIAL)
+        elem_segs = _of_type(SegmentType.SHOW_ELEMENT)
+
+        result = SegmentDetectionResult(
+            segments=final,
+            show_segments=show_segs,
+            commercial_segments=comm_segs,
+            element_segments=elem_segs,
+            total_show_time=sum(seg.duration for seg in show_segs),
+            total_commercial_time=sum(seg.duration for seg in comm_segs),
+        )
+
+        result.print_summary()
+        return result
+
+    def _load_audio(self, audio_path: Path) -> tuple[np.ndarray, int]:
+        """Decode an audio file to a mono 16 kHz float32 array using ffmpeg.
+
+        Args:
+            audio_path: path of the file handed to ``ffmpeg -i``.
+
+        Returns:
+            ``(samples, sample_rate)`` where ``samples`` is a 1-D float32
+            array scaled to [-1.0, 1.0) and ``sample_rate`` is 16000.
+
+        Raises:
+            RuntimeError: if ffmpeg exits with a non-zero status. Without this
+                check a failed decode would silently become empty audio and
+                an empty detection report.
+            subprocess.TimeoutExpired: if decoding takes longer than 300 s.
+        """
+        import subprocess
+
+        target_rate = 16000
+
+        # Decode to raw signed 16-bit little-endian PCM on stdout.
+        # -nostdin keeps ffmpeg from reading the caller's stdin; -loglevel
+        # error keeps stderr down to messages worth reporting.
+        result = subprocess.run(
+            ["ffmpeg", "-nostdin", "-loglevel", "error",
+             "-i", str(audio_path), "-f", "s16le", "-ac", "1",
+             "-ar", str(target_rate), "-"],
+            capture_output=True, timeout=300,
+        )
+        if result.returncode != 0:
+            detail = result.stderr.decode("utf-8", errors="replace").strip()
+            raise RuntimeError(
+                f"ffmpeg failed to decode {audio_path} "
+                f"(exit code {result.returncode}): {detail[-500:]}"
+            )
+
+        # Drop a dangling byte so a truncated stream cannot make
+        # np.frombuffer raise on a size that is not a multiple of 2.
+        pcm = result.stdout
+        pcm = pcm[:len(pcm) - len(pcm) % 2]
+
+        audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
+        return audio, target_rate
+
+    def _detect_silence_boundaries(self, audio: np.ndarray, sr: int,
+                                   min_silence_ms: int = 500) -> list[float]:
+        """Detect silence gaps in audio that likely indicate segment boundaries.
+
+        The audio is scanned in 25 ms frames with a 10 ms hop. A frame is
+        silent when its RMS is below
+        ``config.segment_detection.silence_threshold_db`` converted to linear
+        amplitude. Each run of silent frames lasting at least
+        *min_silence_ms* that is followed by sound yields one boundary, at
+        the start time of the run's middle frame. Silence that lasts until
+        the end of the audio yields no boundary.
+
+        Args:
+            audio: 1-D sample array.
+            sr: sample rate of *audio* in Hz.
+            min_silence_ms: minimum length of a silent run, in milliseconds.
+
+        Returns:
+            Boundary times in seconds, in ascending order.
+        """
+        frame_size = max(1, int(sr * 0.025))  # 25ms frames
+        hop_size = max(1, int(sr * 0.010))    # 10ms hop
+        # Derive all timing from the real hop so frame counts and boundary
+        # times stay consistent if the hop or sample rate ever changes.
+        hop_seconds = hop_size / sr
+        hop_ms = 1000.0 * hop_size / sr
+
+        threshold_db = self.config.segment_detection.silence_threshold_db
+        threshold_amp = 10 ** (threshold_db / 20)
+        # At least one frame: a minimum of zero would make every non-silent
+        # frame count as the end of a (zero-length) silence.
+        min_silence_frames = max(1, int(min_silence_ms / hop_ms))
+
+        # RMS energy per frame; "+ 1" includes the last full frame.
+        energies = [
+            np.sqrt(np.mean(audio[offset:offset + frame_size] ** 2))
+            for offset in range(0, len(audio) - frame_size + 1, hop_size)
+        ]
+
+        boundaries = []
+        silent_run = 0
+        for index, energy in enumerate(energies):
+            if energy < threshold_amp:
+                silent_run += 1
+                continue
+            if silent_run >= min_silence_frames:
+                # Mark the midpoint of the silence as a boundary
+                mid_frame = index - silent_run // 2
+                boundaries.append(mid_frame * hop_seconds)
+            silent_run = 0
+
+        return boundaries
+
+    def _create_candidate_segments(self, boundaries: list[float],
+                                   total_duration: float) -> list[DetectedSegment]:
+        """Turn silence boundaries into unclassified candidate segments.
+
+        Consecutive boundaries delimit the spans, starting at 0.0 and ending
+        at *total_duration*. A span of one second or less produces no
+        candidate, but the next span still starts at that span's end.
+        """
+        spans = []
+        cursor = 0.0
+
+        # The end of the audio acts as one final boundary.
+        for edge in [*boundaries, total_duration]:
+            if edge - cursor > 1.0:  # Ignore segments < 1 second
+                spans.append(DetectedSegment(
+                    start=cursor,
+                    end=edge,
+                    segment_type=SegmentType.UNKNOWN,
+                    confidence=0.0,
+                ))
+            cursor = edge
+
+        return spans
+
+    def _score_fingerprint(self, audio: np.ndarray, sr: int,
+                           segment: DetectedSegment) -> float:
+        """Score based on audio fingerprint matching against element library.
+
+        Not implemented yet: all three arguments are ignored and every
+        segment receives the neutral score 0.5.
+
+        Intended range once implemented: 0.0 (no match / definitely show)
+        to 1.0 (definite commercial boundary).
+        """
+        # TODO: Implement fingerprint matching against element-library/fingerprints.db
+        # For now, return neutral score
+        return 0.5
+
+    def _score_speaker_identity(self, diarization, segment: DetectedSegment) -> float:
+        """Score based on how much of the segment the host is speaking.
+
+        A turn counts as the host when its speaker label contains "host"
+        (case-insensitive). Overlapping host turns are merged before being
+        measured, so time is never counted twice and the result always
+        stays within [0.0, 1.0].
+
+        Args:
+            diarization: object with a ``turns`` iterable; each turn has
+                ``start``, ``end`` and ``speaker``.
+            segment: the segment being scored.
+
+        Returns:
+            0.0 (host speaking throughout = show content) to 1.0 (host
+            absent = likely commercial); 0.5 for a segment with no duration.
+        """
+        total_time = segment.duration
+        if total_time <= 0:
+            return 0.5
+
+        # Collect the part of every host turn that falls inside the segment.
+        host_spans = []
+        for turn in diarization.turns:
+            if "host" not in turn.speaker.lower():
+                continue
+            span_start = max(turn.start, segment.start)
+            span_end = min(turn.end, segment.end)
+            if span_end > span_start:
+                host_spans.append((span_start, span_end))
+
+        # Measure the union of the spans so overlapping turns count once.
+        host_spans.sort()
+        host_time = 0.0
+        covered_until = segment.start
+        for span_start, span_end in host_spans:
+            if span_end <= covered_until:
+                continue
+            host_time += span_end - max(span_start, covered_until)
+            covered_until = span_end
+
+        host_fraction = min(1.0, host_time / total_time)
+        # Invert: high host presence = low commercial score
+        return 1.0 - host_fraction
+
+    def _score_audio_characteristics(self, audio: np.ndarray, sr: int,
+                                     segment: DetectedSegment) -> float:
+        """Score how much a segment sounds like produced commercial audio.
+
+        Two cues each add 0.15 to a neutral 0.5: an overall RMS above 0.15,
+        and a ratio below 5.0 between the loudest and quietest 50 ms frame.
+        Segments with less than one second of samples score the neutral 0.5.
+
+        Returns:
+            0.5 (no commercial cue) up to 0.8 (both cues present).
+        """
+        first = int(segment.start * sr)
+        last = min(int(segment.end * sr), len(audio))
+        clip = audio[first:last]
+
+        if len(clip) < sr:  # Less than 1 second
+            return 0.5
+
+        # Overall loudness (commercials tend to be louder)
+        overall_rms = np.sqrt(np.mean(clip ** 2))
+
+        # Loudness of consecutive, non-overlapping 50ms frames
+        window = int(sr * 0.050)
+        window_levels = [
+            np.sqrt(np.mean(clip[offset:offset + window] ** 2))
+            for offset in range(0, len(clip) - window, window)
+        ]
+        if not window_levels:
+            return 0.5
+
+        # Loudest / quietest frame (commercials tend to be more compressed);
+        # the epsilon avoids dividing by zero for a fully silent frame.
+        dynamic_range = max(window_levels) / (min(window_levels) + 1e-8)
+
+        louder_than_speech = overall_rms > 0.15
+        heavily_compressed = dynamic_range < 5.0
+        score = (
+            0.5
+            + (0.15 if louder_than_speech else 0.0)
+            + (0.15 if heavily_compressed else 0.0)
+        )
+
+        return min(1.0, max(0.0, score))
+
+    def _score_structural(self, transcript, segment: DetectedSegment) -> float:
+        """Score a segment from cue phrases in its transcript text.
+
+        The text is fetched with ``transcript.text_at(start, end)`` and
+        matched case-insensitively by substring. Break cues take precedence
+        over show cues.
+
+        Returns:
+            0.8 if any break cue occurs, otherwise 0.2 if any show cue
+            occurs, otherwise the neutral 0.5.
+        """
+        text = transcript.text_at(segment.start, segment.end).lower()
+
+        # Phrases a host uses while the show is running
+        show_cues = (
+            "welcome back", "let's move on", "next up", "our next topic",
+            "let's talk about", "as i mentioned", "the question is",
+            "caller", "what do you think", "here's the thing",
+        )
+        # Phrases that announce an upcoming break
+        break_cues = (
+            "we'll be right back", "stay tuned", "don't go anywhere",
+            "after the break", "when we come back",
+        )
+
+        if any(cue in text for cue in break_cues):
+            return 0.8  # Likely near a break
+        if any(cue in text for cue in show_cues):
+            return 0.2  # Likely show content
+        return 0.5  # Neutral
+
+    def _merge_adjacent(self, segments: list[DetectedSegment]) -> list[DetectedSegment]:
+        """Merge adjacent segments of the same type.
+
+        A segment is folded into its predecessor when both have the same
+        type and the gap between them is under 2 seconds. The merged
+        confidence is the duration-weighted mean of the two parts, so a long
+        segment is not outvoted by a short one and the result does not
+        depend on how many pieces a span was cut into.
+
+        The surviving segment objects are modified in place; the merged
+        segment keeps the label and signals of its first part.
+
+        Returns:
+            A new list containing the merged segments in their original order.
+        """
+        if not segments:
+            return []
+
+        merged = [segments[0]]
+        for seg in segments[1:]:
+            prev = merged[-1]
+            same_type = prev.segment_type == seg.segment_type
+            if same_type and abs(seg.start - prev.end) < 2.0:  # Within 2 seconds
+                # prev already spans everything merged so far, so its length
+                # is the accumulated weight of its confidence.
+                prev_len = prev.end - prev.start
+                seg_len = seg.end - seg.start
+                combined_len = prev_len + seg_len
+                if combined_len > 0:
+                    prev.confidence = (
+                        prev.confidence * prev_len + seg.confidence * seg_len
+                    ) / combined_len
+                else:
+                    # Degenerate zero-length parts: fall back to a plain mean.
+                    prev.confidence = (prev.confidence + seg.confidence) / 2
+                # Extend previous segment
+                prev.end = seg.end
+            else:
+                merged.append(seg)
+
+        return merged
+
+    def _apply_constraints(self, segments: list[DetectedSegment]) -> list[DetectedSegment]:
+        """Reclassify commercial segments that are too short to be a real break.
+
+        Commercial segments shorter than
+        ``config.segment_detection.min_break_duration_s`` become show content
+        and get an explanatory label. The segments are modified in place and
+        the same list object is returned.
+        """
+        shortest_break = self.config.segment_detection.min_break_duration_s
+
+        too_short = [
+            seg for seg in segments
+            if seg.segment_type == SegmentType.COMMERCIAL
+            and seg.duration < shortest_break
+        ]
+        for seg in too_short:
+            seg.segment_type = SegmentType.SHOW_CONTENT
+            seg.label = "(reclassified: too short for commercial)"
+
+        return segments
+
+    def _label_from_prep(self, segments: list[DetectedSegment],
+                         transcript, show_prep: str):
+        """Label show and commercial segments with sequential numbers.
+
+        Placeholder: *transcript* and *show_prep* are not used yet. Show
+        segments and commercial segments are numbered independently from 1,
+        in list order; segments of any other type keep their label.
+        """
+        # TODO: Use Ollama to match transcript sections against show prep segment titles
+        # For now, number them sequentially
+        numbering = {"show": 0, "commercial": 0}
+        for seg in segments:
+            kind = seg.segment_type
+            if kind == SegmentType.SHOW_CONTENT:
+                numbering["show"] += 1
+                position = numbering["show"]
+                seg.label = f"Show Segment {position}"
+            elif kind == SegmentType.COMMERCIAL:
+                numbering["commercial"] += 1
+                position = numbering["commercial"]
+                seg.label = f"Commercial Break {position}"