radio show: co-host voice profile, Q&A extraction fixes, archive index

- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike)
- diarizer.py: add CO-HOST speaker label for cohost-role profiles
- voice_profiler.py: emit "Cohost: <name>" label for cohost role
- qa_extractor.py: overlap resolution at load time (midpoint boundary split), 4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns), _preceded_by_caller_intro() helper, _PHONE_GREETING pattern, 751-1041 + "we'll get your problem solved" promo signatures
- benchmark.py: use src.transcriber.transcribe with batch_size=16
- add index_test_episodes.py and build_cohost_profile.py scripts
- add .gitignore (exclude episodes, transcripts, *.db, .venv)
- session log: 2026-04-27-qa-extraction-cohost-indexing.md

Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs.
archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 14:41:04 -07:00
parent 79abef9dc9
commit e9ac607500
55 changed files with 649 additions and 100 deletions
--- a/projects/radio-show/audio-processor/src/transcriber.py
+++ b/projects/radio-show/audio-processor/src/transcriber.py
@@ -113,61 +113,60 @@ def _format_srt_time(seconds: float) -> str:


 def transcribe(audio_path: str | Path, model_size: str = "large-v3",
-               language: str = "en", device: str = "cuda") -> Transcript:
-    """Transcribe an audio file using faster-whisper."""
-    from faster_whisper import WhisperModel
+               language: str = "en", device: str = "cuda",
+               batch_size: int = 16) -> Transcript:
+    """Transcribe an audio file using faster-whisper.
+
+    Uses BatchedInferencePipeline + int8_float16 + VAD for archive/batch work.
+    Word timestamps are skipped in batch mode (not needed for segment-level search).
+    Pass batch_size=0 to fall back to sequential WhisperModel with word timestamps.
+    """
+    from faster_whisper import WhisperModel, BatchedInferencePipeline

    audio_path = Path(audio_path)
+    use_batched = batch_size > 0
+
    console.print(f"[bold]Transcribing:[/bold] {audio_path.name}")
-    console.print(f"[dim]Model: {model_size}, Device: {device}[/dim]")
-
-    model = WhisperModel(model_size, device=device, compute_type="float16")
-
-    segments_raw, info = model.transcribe(
-        str(audio_path),
-        language=language,
-        word_timestamps=True,
-        vad_filter=True,
-        vad_parameters=dict(
-            min_silence_duration_ms=500,
-            speech_pad_ms=200,
-        ),
+    console.print(
+        f"[dim]Model: {model_size} | "
+        f"{'batched x' + str(batch_size) + ' int8_float16' if use_batched else 'sequential float16'} | "
+        f"Device: {device}[/dim]"
    )

-    console.print(f"[dim]Detected language: {info.language} "
-                  f"(probability: {info.language_probability:.2f})[/dim]")
-    console.print(f"[dim]Duration: {info.duration:.1f}s "
-                  f"({info.duration / 60:.1f} min)[/dim]")
+    if use_batched:
+        base_model = WhisperModel(model_size, device=device, compute_type="int8_float16")
+        model = BatchedInferencePipeline(model=base_model)
+        segments_raw, info = model.transcribe(
+            str(audio_path),
+            language=language,
+            batch_size=batch_size,
+        )
+    else:
+        model = WhisperModel(model_size, device=device, compute_type="float16")
+        segments_raw, info = model.transcribe(
+            str(audio_path),
+            language=language,
+            word_timestamps=True,
+            vad_filter=True,
+            vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
+        )
+
+    console.print(f"[dim]Duration: {info.duration:.1f}s ({info.duration / 60:.1f} min)[/dim]")

    segments = []
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        BarColumn(),
-        TextColumn("{task.completed} segments"),
-        TimeElapsedColumn(),
-        console=console,
-    ) as progress:
-        task = progress.add_task("Transcribing...", total=None)
-
-        for i, seg in enumerate(segments_raw):
+    for i, seg in enumerate(segments_raw):
+        words = []
+        if not use_batched:
            words = [
-                TranscriptWord(
-                    word=w.word,
-                    start=w.start,
-                    end=w.end,
-                    probability=w.probability,
-                )
+                TranscriptWord(word=w.word, start=w.start,
+                               end=w.end, probability=w.probability)
                for w in (seg.words or [])
            ]
-            segments.append(TranscriptSegment(
-                id=i,
-                text=seg.text,
-                start=seg.start,
-                end=seg.end,
-                words=words,
-            ))
-            progress.update(task, completed=i + 1)
+        segments.append(TranscriptSegment(
+            id=i, text=seg.text, start=seg.start, end=seg.end, words=words,
+        ))
+        if i % 50 == 0:
+            console.print(f"[dim]  {i} segments... ({seg.end:.0f}s)[/dim]")

    console.print(f"[green]Transcription complete: {len(segments)} segments[/green]")