radio show: co-host voice profile, Q&A extraction fixes, archive index

- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike)
- diarizer.py: add CO-HOST speaker label for cohost-role profiles
- voice_profiler.py: emit "Cohost: <name>" label for cohost role
- qa_extractor.py: overlap resolution at load time (midpoint boundary split),
  4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns),
  _preceded_by_caller_intro() helper, _PHONE_GREETING pattern,
  751-1041 + "we'll get your problem solved" promo signatures
- benchmark.py: use src.transcriber.transcribe with batch_size=16
- add index_test_episodes.py and build_cohost_profile.py scripts
- add .gitignore (exclude episodes, transcripts, *.db, .venv)
- session log: 2026-04-27-qa-extraction-cohost-indexing.md

Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs.
archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-27 14:41:04 -07:00
parent 79abef9dc9
commit e9ac607500
55 changed files with 649 additions and 100 deletions

View File

@@ -0,0 +1,25 @@
# Python
__pycache__/
*.pyc
*.pyo
.venv/
*.egg-info/
# Large data files
test-data/episodes/
test-data/transcripts/
episodes/
processed/
# Databases (regenerable)
*.db
*.sqlite
# Model cache
.cache/
*.pt
*.bin
# OS
.DS_Store
Thumbs.db

View File

@@ -57,13 +57,15 @@ trans_results = []
trans_total_audio = 0.0
trans_total_wall = 0.0
import json
from src.transcriber import transcribe as _transcribe
for ep in EPISODES:
trans_ep_dir = TRANS_DIR / ep.stem
trans_ep_dir.mkdir(parents=True, exist_ok=True)
transcript_path = trans_ep_dir / "transcript.json"
if transcript_path.exists():
import json
with open(transcript_path) as f:
td = json.load(f)
dur = td.get("duration", 0)
@@ -74,30 +76,15 @@ for ep in EPISODES:
console.print(f" Transcribing {ep.name}...")
t0 = time.monotonic()
from faster_whisper import WhisperModel
if not hasattr(sys, "_whisper_model"):
console.print(" [dim]Loading Whisper large-v3...[/dim]")
sys._whisper_model = WhisperModel("large-v3", device=device, compute_type="float16")
model = sys._whisper_model
segments_iter, info = model.transcribe(str(ep), language="en", beam_size=5)
import json
segs = []
for seg in segments_iter:
segs.append({"id": seg.id, "start": seg.start, "end": seg.end, "text": seg.text})
duration = info.duration
transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
wall = time.monotonic() - t0
rtf = duration / wall
rtf = transcript.duration / wall
result = {"duration": duration, "language": "en", "segments": segs}
with open(transcript_path, "w") as f:
json.dump(result, f)
transcript.save(trans_ep_dir)
console.print(f" [green]{ep.stem}: {duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
trans_results.append((ep, transcript_path, duration, wall))
trans_total_audio += duration
console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
trans_results.append((ep, transcript_path, transcript.duration, wall))
trans_total_audio += transcript.duration
trans_total_wall += wall
if trans_total_wall > 0:

View File

@@ -0,0 +1,115 @@
"""
Build voice profile for Tom (co-host) from known co-host speech windows.
Uses CALLER-labeled windows from the first 60 min of co-host-era episodes,
before any real callers would have called in.
"""
import os, sys
os.environ["PYTHONIOENCODING"] = "utf-8"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8")
from pathlib import Path
import json
import numpy as np
from src.gpu import ensure_cuda_libs
ensure_cuda_libs()
import torch
from src.voice_profiler import VoiceProfiler, SpeakerProfile
from rich.console import Console
console = Console()
BASE = Path(__file__).parent
PROFILES_DIR = BASE / "voice-profiles"
EPISODES_DIR = BASE / "test-data" / "episodes"
TRANS_DIR = BASE / "test-data" / "transcripts"
device = "cuda" if torch.cuda.is_available() else "cpu"
console.print(f"Device: {device}")
profiler = VoiceProfiler(PROFILES_DIR, device=device)
# Hand-picked windows of Tom's speech, keyed by episode file name.
# Each entry is a (start, end) pair in seconds from the start of the episode.
# They are CALLER turns from diarization that fall in the first 60 min
# (before real callers). Windows at 0-40s are left out: that stretch is
# promo/jingle, not Tom's voice.
TOM_WINDOWS: dict[str, list[tuple[int, int]]] = {
    "2014-s6e19.mp3": [(195, 260), (320, 425), (600, 650), (675, 710)],
    "2016-s8e43.mp3": [
        (100, 115), (135, 160), (270, 295), (575, 605),
        (1185, 1235), (1790, 1870), (2020, 2055),
    ],
}
COHOST_NAME = "Tom"

# Chunking parameters. They do not change per episode, so they are set once here.
SAMPLE_RATE = 16000  # samples per second; assumes _load_full_audio returns 16 kHz audio — TODO confirm
chunk_s = 10  # length in seconds of each chunk passed to the embedder
chunk_samples = chunk_s * SAMPLE_RATE

# Create the co-host profile only if the profiler does not already hold one.
# NOTE(review): when a "Tom" profile already exists, the loop below appends to
# its embeddings. Re-running with unchanged windows therefore adds the same
# chunks a second time — confirm whether the profile should be reset first.
if COHOST_NAME not in profiler.profiles:
    profiler.profiles[COHOST_NAME] = SpeakerProfile(
        name=COHOST_NAME,
        role="cohost",
        embeddings=[],
        source_episodes=[],
    )
profile = profiler.profiles[COHOST_NAME]
console.print(f"\n[bold]Building co-host profile for: {COHOST_NAME}[/bold]")

for ep_name, windows in TOM_WINDOWS.items():
    ep_path = EPISODES_DIR / ep_name
    if not ep_path.exists():
        console.print(f"[yellow] Skipping {ep_name} — not found[/yellow]")
        continue
    console.print(f"\n Loading {ep_name}...")
    audio = profiler._load_full_audio(ep_path)
    # Called for its side effect only; presumably makes sure the embedding
    # model is loaded before the first _embed_audio_np call — verify.
    profiler._get_model()

    added = 0  # embeddings contributed by this episode
    for win_start, win_end in windows:
        # NOTE(review): range() excludes its stop value, so a window whose
        # length is an exact multiple of chunk_s yields one chunk fewer than
        # would fit (e.g. 600-650 gives 4 chunks, not 5). Left unchanged so
        # the set of collected embeddings stays the same; confirm intent.
        for chunk_start in range(win_start, win_end - chunk_s, chunk_s):
            s = chunk_start * SAMPLE_RATE
            e = s + chunk_samples
            if e > len(audio):
                # Chunk would run past the end of the audio; later chunks of
                # this window would too, so stop with this window.
                break
            try:
                emb = profiler._embed_audio_np(audio[s:e])
            except Exception as ex:
                # Best effort: report the failed chunk and keep going.
                console.print(f" [red]Failed @ {chunk_start}s: {ex}[/red]")
            else:
                profile.embeddings.append(emb)
                added += 1
                console.print(f" [dim]+1 embedding @ {chunk_start}s[/dim]")

    # Record the episode only if it actually contributed, and only once, so
    # repeated runs do not pile up duplicate names in source_episodes.
    if added and ep_name not in profile.source_episodes:
        profile.source_episodes.append(ep_name)
# Abort with a non-zero exit status when nothing was collected, so an empty
# profile is never composited or saved.
if not profile.embeddings:
    console.print("[red]No embeddings collected — check episode paths[/red]")
    sys.exit(1)

profile.compute_composite()
console.print(
    f"\n[green]Tom profile built: {profile.num_samples} embeddings "
    f"from {len(profile.source_episodes)} episodes[/green]"
)

# Sanity check: cosine similarity between the host's composite embedding and
# the new co-host one. It is reported only; it does not gate the save below.
mike = profiler.profiles.get("Mike Swanson")
if mike:
    host_vec = mike.composite_embedding
    cohost_vec = profile.composite_embedding
    if host_vec is not None and cohost_vec is not None:
        # The small epsilon keeps the division defined if either norm is zero.
        norm_product = np.linalg.norm(host_vec) * np.linalg.norm(cohost_vec)
        sim = float(np.dot(host_vec, cohost_vec) / (norm_product + 1e-8))
        console.print(f"Tom vs Mike similarity: {sim:.3f} (lower is better separation)")

profiler.save_profiles()
console.print("[bold green]Profile saved.[/bold green]")

View File

@@ -0,0 +1,102 @@
"""
Index the 6 test episodes into archive.db.
Reads pre-computed transcripts + diarization from test-data/transcripts/.
"""
import os, sys, re
os.environ["PYTHONIOENCODING"] = "utf-8"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8")
from pathlib import Path
from src.indexer import ArchiveIndex
from src.qa_extractor import load_diarized_transcript, extract_qa_pairs
from rich.console import Console
from rich.table import Table
console = Console()
BASE = Path(__file__).parent
TRANS_DIR = BASE / "test-data" / "transcripts"
EP_DIR = BASE / "test-data" / "episodes"
DB_PATH = BASE / "archive.db"
_DATE_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})")
def parse_episode_meta(ep_id: str) -> tuple[str, int | None]:
"""Return (date_str_or_year, hr) from episode directory name."""
m = _DATE_RE.match(ep_id)
if m:
date = m.group(1)
hr = int(ep_id[-1]) if ep_id.endswith(("-hr1", "-hr2")) else None
return date, hr
# season/episode format e.g. 2016-s8e43 — use year only
year = ep_id[:4]
return year, None
import json

console.print(f"\n[bold]Indexing test episodes into {DB_PATH.name}[/bold]")

# One (episode id, date, duration label, segment count, Q&A count) tuple per
# indexed episode; rendered as the summary table once indexing is done.
summary = []

with ArchiveIndex(DB_PATH) as idx:
    for ep_dir in sorted(TRANS_DIR.iterdir()):
        transcript_file = ep_dir / "transcript.json"
        if not transcript_file.exists():
            continue  # nothing to index for this entry

        diarization_file = ep_dir / "diarization.json"
        ep_id = ep_dir.name
        date, hr = parse_episode_meta(ep_id)

        # The episode duration is taken from the transcript (0 when absent).
        with open(transcript_file) as fh:
            duration = json.load(fh).get("duration", 0)

        # Register the episode itself.
        idx.add_episode(
            episode_id=ep_id,
            audio_path=EP_DIR / f"{ep_id}.mp3",
            date=date,
            duration=duration,
            hr=hr,
        )

        # Speaker-labelled segments; diarization is optional, so None is
        # passed when the diarization file is missing.
        diarization_arg = diarization_file if diarization_file.exists() else None
        segs = load_diarized_transcript(transcript_file, diarization_arg)
        idx.add_segments(ep_id, segs)

        # Q&A pairs extracted from those segments.
        pairs = extract_qa_pairs(segs)
        for pair in pairs:
            idx.add_qa_pair(
                episode_id=ep_id,
                q_start=pair.question_start, q_end=pair.question_end,
                a_start=pair.answer_start, a_end=pair.answer_end,
                question=pair.question_text, answer=pair.answer_text,
                topic=pair.topic, tags=pair.topic_tags,
            )

        n_segs, n_pairs = len(segs), len(pairs)
        summary.append((ep_id, date, f"{duration:.0f}s", n_segs, n_pairs))
        console.print(f" [green]{ep_id}[/green]: {n_segs} segs, {n_pairs} Q&A pairs")

    # Totals are read while the index is still open.
    stats = idx.stats()

table = Table(title="Index Summary")
for heading in ("Episode", "Date", "Duration", "Segments", "Q&A"):
    table.add_column(heading)
for name, when, dur, seg_count, qa_count in summary:
    table.add_row(name, when, dur, str(seg_count), str(qa_count))

console.print()
console.print(table)
console.print(f"\n[bold]DB totals:[/bold] {stats['episodes']} episodes, "
              f"{stats['segments']} segments, {stats['qa_pairs']} Q&A pairs")
console.print(f"[dim]DB path: {DB_PATH}[/dim]")

View File

@@ -202,6 +202,8 @@ def diarize(audio_path: str | Path,
label = seg.speaker_label.split(" (")[0] # strip confidence score
if label.startswith("Host:") or label.startswith("Host "):
speaker = "HOST"
elif label.startswith("Cohost:"):
speaker = "CO-HOST"
elif label == "[error]":
speaker = "UNKNOWN"
else:

View File

@@ -53,10 +53,12 @@ _PROMO_SIGS: list[tuple[re.Pattern, int]] = [
(re.compile(r"\bcomputer running slow\b", re.I), 1),
(re.compile(r"\bafter these messages\b", re.I), 1),
(re.compile(r"\b790.?2040\b", re.I), 1),
(re.compile(r"\b751.?1041\b", re.I), 1),
(re.compile(r"\bgurushow\.com\b", re.I), 1),
(re.compile(r"\bcall in now\b", re.I), 1),
(re.compile(r"\bcomputer troubles\?", re.I), 1),
(re.compile(r"\bhardware installation\b", re.I), 1),
(re.compile(r"we.?ll get your problem solved", re.I), 1),
]
@@ -127,10 +129,19 @@ def extract_qa_pairs(diarized_segments: list[dict]) -> list[QAPair]:
if _is_promo_or_bumper(turn["text"]):
i += 1
continue
# Skip the opening 90s — real callers never call before the show starts
if turn["start"] < 90:
i += 1
continue
q_duration = turn["end"] - turn["start"]
if q_duration < MIN_QUESTION_DURATION:
i += 1
continue
# Require caller-intro context: host must have introduced the call, OR
# the caller opens with a phone greeting ("hello", "hi", "hey")
if not _preceded_by_caller_intro(turns, i) and not _PHONE_GREETING.match(turn["text"].strip()):
i += 1
continue
# Look ahead for HOST answer turn(s)
j = i + 1
@@ -329,25 +340,71 @@ def load_diarized_transcript(transcript_path: Path,
with open(diarization_path) as f:
diarization = json.load(f)
turns = diarization.get("turns", [])
raw_turns = diarization.get("turns", [])
def speaker_at(t: float) -> str:
"""Find which diarization turn covers time t."""
# Resolve overlapping boundaries left by the sliding-window diarizer:
# place each transition at the midpoint of the overlap region.
resolved: list[dict] = []
for turn in sorted(raw_turns, key=lambda t: t["start"]):
if not resolved:
resolved.append(dict(turn))
continue
prev = resolved[-1]
if turn["start"] < prev["end"]:
mid = (turn["start"] + prev["end"]) / 2
prev["end"] = mid
resolved.append({**turn, "start": mid})
else:
resolved.append(dict(turn))
turns = resolved
# Minimum CALLER coverage to label a transcript segment as CALLER.
# Batch transcription produces ~25s segments; caller windows are 10s.
# Require 4s of CALLER overlap so brief HOST-edge segments aren't over-claimed.
_CALLER_MIN_S = 4.0
def speaker_for_segment(seg_start: float, seg_end: float) -> str:
caller_cov = 0.0
coverage: dict[str, float] = {}
for turn in turns:
if turn["start"] <= t <= turn["end"]:
return turn["speaker"]
return "UNKNOWN"
overlap = min(seg_end, turn["end"]) - max(seg_start, turn["start"])
if overlap <= 0:
continue
coverage[turn["speaker"]] = coverage.get(turn["speaker"], 0) + overlap
if turn["speaker"] == "CALLER":
caller_cov += overlap
if not coverage:
return "UNKNOWN"
if caller_cov >= _CALLER_MIN_S:
return "CALLER"
return max(coverage, key=coverage.__getitem__)
return [
{"start": s["start"], "end": s["end"],
"text": s["text"],
"speaker": speaker_at((s["start"] + s["end"]) / 2)}
"speaker": speaker_for_segment(s["start"], s["end"])}
for s in segments
]
# ── Helpers ────────────────────────────────────────────────────────────────
_PHONE_GREETING = re.compile(r"^(hello|hi|hey|good (morning|afternoon|evening))\b", re.IGNORECASE)
def _preceded_by_caller_intro(turns: list[dict], idx: int, max_host_turns: int = 2) -> bool:
    """Tell whether a recent HOST turn before ``turns[idx]`` introduced a caller.

    Walks backwards from the turn just before ``idx``. Non-HOST turns are
    skipped without counting. Each HOST turn is tested against
    ``_CALLER_INTRO`` (defined elsewhere in this module); the first match
    returns True. The search gives up after ``max_host_turns`` HOST turns
    have been examined without a match.
    """
    remaining = max_host_turns
    for pos in reversed(range(idx)):
        candidate = turns[pos]
        if candidate["speaker"] != "HOST":
            continue
        if _CALLER_INTRO.search(candidate["text"]):
            return True
        remaining -= 1
        if remaining <= 0:
            # Lookback budget of HOST turns is used up.
            return False
    return False
def _looks_like_question(text: str) -> bool:
    """Return True when ``QUESTION_PATTERN`` matches anywhere in *text*."""
    return QUESTION_PATTERN.search(text) is not None

View File

@@ -113,61 +113,60 @@ def _format_srt_time(seconds: float) -> str:
def transcribe(audio_path: str | Path, model_size: str = "large-v3",
language: str = "en", device: str = "cuda") -> Transcript:
"""Transcribe an audio file using faster-whisper."""
from faster_whisper import WhisperModel
language: str = "en", device: str = "cuda",
batch_size: int = 16) -> Transcript:
"""Transcribe an audio file using faster-whisper.
Uses BatchedInferencePipeline + int8_float16 + VAD for archive/batch work.
Word timestamps are skipped in batch mode (not needed for segment-level search).
Pass batch_size=0 to fall back to sequential WhisperModel with word timestamps.
"""
from faster_whisper import WhisperModel, BatchedInferencePipeline
audio_path = Path(audio_path)
use_batched = batch_size > 0
console.print(f"[bold]Transcribing:[/bold] {audio_path.name}")
console.print(f"[dim]Model: {model_size}, Device: {device}[/dim]")
model = WhisperModel(model_size, device=device, compute_type="float16")
segments_raw, info = model.transcribe(
str(audio_path),
language=language,
word_timestamps=True,
vad_filter=True,
vad_parameters=dict(
min_silence_duration_ms=500,
speech_pad_ms=200,
),
console.print(
f"[dim]Model: {model_size} | "
f"{'batched x' + str(batch_size) + ' int8_float16' if use_batched else 'sequential float16'} | "
f"Device: {device}[/dim]"
)
console.print(f"[dim]Detected language: {info.language} "
f"(probability: {info.language_probability:.2f})[/dim]")
console.print(f"[dim]Duration: {info.duration:.1f}s "
f"({info.duration / 60:.1f} min)[/dim]")
if use_batched:
base_model = WhisperModel(model_size, device=device, compute_type="int8_float16")
model = BatchedInferencePipeline(model=base_model)
segments_raw, info = model.transcribe(
str(audio_path),
language=language,
batch_size=batch_size,
)
else:
model = WhisperModel(model_size, device=device, compute_type="float16")
segments_raw, info = model.transcribe(
str(audio_path),
language=language,
word_timestamps=True,
vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
)
console.print(f"[dim]Duration: {info.duration:.1f}s ({info.duration / 60:.1f} min)[/dim]")
segments = []
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("{task.completed} segments"),
TimeElapsedColumn(),
console=console,
) as progress:
task = progress.add_task("Transcribing...", total=None)
for i, seg in enumerate(segments_raw):
for i, seg in enumerate(segments_raw):
words = []
if not use_batched:
words = [
TranscriptWord(
word=w.word,
start=w.start,
end=w.end,
probability=w.probability,
)
TranscriptWord(word=w.word, start=w.start,
end=w.end, probability=w.probability)
for w in (seg.words or [])
]
segments.append(TranscriptSegment(
id=i,
text=seg.text,
start=seg.start,
end=seg.end,
words=words,
))
progress.update(task, completed=i + 1)
segments.append(TranscriptSegment(
id=i, text=seg.text, start=seg.start, end=seg.end, words=words,
))
if i % 50 == 0:
console.print(f"[dim] {i} segments... ({seg.end:.0f}s)[/dim]")
console.print(f"[green]Transcription complete: {len(segments)} segments[/green]")

View File

@@ -319,8 +319,11 @@ class VoiceProfiler:
best_match = name
if best_score >= threshold:
if best_match and self.profiles[best_match].role == "host":
role = self.profiles[best_match].role if best_match else "unknown"
if role == "host":
label = f"Host: {best_match}"
elif role == "cohost":
label = f"Cohost: {best_match}"
else:
label = best_match
else:

View File

@@ -1,26 +1,34 @@
{
"Mike Swanson": {
"role": "host",
"num_samples": 180,
"source_episodes": [
"2010-10-02-hr1.mp3",
"2011-06-04-hr1.mp3",
"2011-09-10-hr1.mp3",
"2014-s6e05.mp3",
"2015-s7e30.mp3",
"2016-s8e42.mp3",
"2017-s9e26.mp3",
"2018-s10e17.mp3",
"2018-s10e21.mp3",
"2010-10-02-hr1.mp3",
"2011-06-04-hr1.mp3",
"2011-09-10-hr1.mp3",
"2014-s6e05.mp3",
"2015-s7e30.mp3",
"2016-s8e42.mp3",
"2017-s9e26.mp3",
"2018-s10e17.mp3",
"2018-s10e21.mp3"
]
}
{
"Mike Swanson": {
"role": "host",
"num_samples": 180,
"source_episodes": [
"2010-10-02-hr1.mp3",
"2011-06-04-hr1.mp3",
"2011-09-10-hr1.mp3",
"2014-s6e05.mp3",
"2015-s7e30.mp3",
"2016-s8e42.mp3",
"2017-s9e26.mp3",
"2018-s10e17.mp3",
"2018-s10e21.mp3",
"2010-10-02-hr1.mp3",
"2011-06-04-hr1.mp3",
"2011-09-10-hr1.mp3",
"2014-s6e05.mp3",
"2015-s7e30.mp3",
"2016-s8e42.mp3",
"2017-s9e26.mp3",
"2018-s10e17.mp3",
"2018-s10e21.mp3"
]
},
"Tom": {
"role": "cohost",
"num_samples": 44,
"source_episodes": [
"2014-s6e19.mp3",
"2016-s8e43.mp3"
]
}
}