radio show: co-host voice profile, Q&A extraction fixes, archive index

- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike)
- diarizer.py: add CO-HOST speaker label for cohost-role profiles
- voice_profiler.py: emit "Cohost: <name>" label for cohost role
- qa_extractor.py: overlap resolution at load time (midpoint boundary split),
  4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns),
  _preceded_by_caller_intro() helper, _PHONE_GREETING pattern, and two new
  promo signatures ("751-1041" and "we'll get your problem solved")
- benchmark.py: use src.transcriber.transcribe with batch_size=16
- add index_test_episodes.py and build_cohost_profile.py scripts
- add .gitignore (exclude episodes, transcripts, *.db, .venv)
- session log: 2026-04-27-qa-extraction-cohost-indexing.md

Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs.
archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-27 14:41:04 -07:00
parent 79abef9dc9
commit e9ac607500
55 changed files with 649 additions and 100 deletions

View File

@@ -113,61 +113,60 @@ def _format_srt_time(seconds: float) -> str:
def transcribe(audio_path: str | Path, model_size: str = "large-v3",
language: str = "en", device: str = "cuda") -> Transcript:
"""Transcribe an audio file using faster-whisper."""
from faster_whisper import WhisperModel
language: str = "en", device: str = "cuda",
batch_size: int = 16) -> Transcript:
"""Transcribe an audio file using faster-whisper.
Uses BatchedInferencePipeline + int8_float16 + VAD for archive/batch work.
Word timestamps are skipped in batch mode (not needed for segment-level search).
Pass batch_size=0 to fall back to sequential WhisperModel with word timestamps.
"""
from faster_whisper import WhisperModel, BatchedInferencePipeline
audio_path = Path(audio_path)
use_batched = batch_size > 0
console.print(f"[bold]Transcribing:[/bold] {audio_path.name}")
console.print(f"[dim]Model: {model_size}, Device: {device}[/dim]")
model = WhisperModel(model_size, device=device, compute_type="float16")
segments_raw, info = model.transcribe(
str(audio_path),
language=language,
word_timestamps=True,
vad_filter=True,
vad_parameters=dict(
min_silence_duration_ms=500,
speech_pad_ms=200,
),
console.print(
f"[dim]Model: {model_size} | "
f"{'batched x' + str(batch_size) + ' int8_float16' if use_batched else 'sequential float16'} | "
f"Device: {device}[/dim]"
)
console.print(f"[dim]Detected language: {info.language} "
f"(probability: {info.language_probability:.2f})[/dim]")
console.print(f"[dim]Duration: {info.duration:.1f}s "
f"({info.duration / 60:.1f} min)[/dim]")
if use_batched:
base_model = WhisperModel(model_size, device=device, compute_type="int8_float16")
model = BatchedInferencePipeline(model=base_model)
segments_raw, info = model.transcribe(
str(audio_path),
language=language,
batch_size=batch_size,
)
else:
model = WhisperModel(model_size, device=device, compute_type="float16")
segments_raw, info = model.transcribe(
str(audio_path),
language=language,
word_timestamps=True,
vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
)
console.print(f"[dim]Duration: {info.duration:.1f}s ({info.duration / 60:.1f} min)[/dim]")
segments = []
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("{task.completed} segments"),
TimeElapsedColumn(),
console=console,
) as progress:
task = progress.add_task("Transcribing...", total=None)
for i, seg in enumerate(segments_raw):
for i, seg in enumerate(segments_raw):
words = []
if not use_batched:
words = [
TranscriptWord(
word=w.word,
start=w.start,
end=w.end,
probability=w.probability,
)
TranscriptWord(word=w.word, start=w.start,
end=w.end, probability=w.probability)
for w in (seg.words or [])
]
segments.append(TranscriptSegment(
id=i,
text=seg.text,
start=seg.start,
end=seg.end,
words=words,
))
progress.update(task, completed=i + 1)
segments.append(TranscriptSegment(
id=i, text=seg.text, start=seg.start, end=seg.end, words=words,
))
if i % 50 == 0:
console.print(f"[dim] {i} segments... ({seg.end:.0f}s)[/dim]")
console.print(f"[green]Transcription complete: {len(segments)} segments[/green]")