radio show: co-host voice profile, Q&A extraction fixes, archive index

- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike)
- diarizer.py: add CO-HOST speaker label for cohost-role profiles
- voice_profiler.py: emit "Cohost: <name>" label for cohost role
- qa_extractor.py: overlap resolution at load time (midpoint boundary split),
  4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns),
  _preceded_by_caller_intro() helper, _PHONE_GREETING pattern,
  751-1041 + "we'll get your problem solved" promo signatures
- benchmark.py: use src.transcriber.transcribe with batch_size=16
- add index_test_episodes.py and build_cohost_profile.py scripts
- add .gitignore (exclude episodes, transcripts, *.db, .venv)
- session log: 2026-04-27-qa-extraction-cohost-indexing.md

Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs.
archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-27 14:41:04 -07:00
parent 79abef9dc9
commit e9ac607500
55 changed files with 649 additions and 100 deletions

View File

@@ -0,0 +1,25 @@
# Python
__pycache__/
*.pyc
*.pyo
.venv/
*.egg-info/
# Large data files
test-data/episodes/
test-data/transcripts/
episodes/
processed/
# Databases (regenerable)
*.db
*.sqlite
# Model cache
.cache/
*.pt
*.bin
# OS
.DS_Store
Thumbs.db

View File

@@ -57,13 +57,15 @@ trans_results = []
trans_total_audio = 0.0
trans_total_wall = 0.0
import json
from src.transcriber import transcribe as _transcribe
for ep in EPISODES:
trans_ep_dir = TRANS_DIR / ep.stem
trans_ep_dir.mkdir(parents=True, exist_ok=True)
transcript_path = trans_ep_dir / "transcript.json"
if transcript_path.exists():
import json
with open(transcript_path) as f:
td = json.load(f)
dur = td.get("duration", 0)
@@ -74,30 +76,15 @@ for ep in EPISODES:
console.print(f" Transcribing {ep.name}...")
t0 = time.monotonic()
from faster_whisper import WhisperModel
if not hasattr(sys, "_whisper_model"):
console.print(" [dim]Loading Whisper large-v3...[/dim]")
sys._whisper_model = WhisperModel("large-v3", device=device, compute_type="float16")
model = sys._whisper_model
segments_iter, info = model.transcribe(str(ep), language="en", beam_size=5)
import json
segs = []
for seg in segments_iter:
segs.append({"id": seg.id, "start": seg.start, "end": seg.end, "text": seg.text})
duration = info.duration
transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
wall = time.monotonic() - t0
rtf = duration / wall
rtf = transcript.duration / wall
result = {"duration": duration, "language": "en", "segments": segs}
with open(transcript_path, "w") as f:
json.dump(result, f)
transcript.save(trans_ep_dir)
console.print(f" [green]{ep.stem}: {duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
trans_results.append((ep, transcript_path, duration, wall))
trans_total_audio += duration
console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
trans_results.append((ep, transcript_path, transcript.duration, wall))
trans_total_audio += transcript.duration
trans_total_wall += wall
if trans_total_wall > 0:

View File

@@ -0,0 +1,115 @@
"""
Build voice profile for Tom (co-host) from known co-host speech windows.
Uses CALLER-labeled windows from the first 60 min of co-host-era episodes,
before any real callers would have called in.
"""
import os, sys
os.environ["PYTHONIOENCODING"] = "utf-8"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8")
from pathlib import Path
import json
import numpy as np
from src.gpu import ensure_cuda_libs
ensure_cuda_libs()
import torch
from src.voice_profiler import VoiceProfiler, SpeakerProfile
from rich.console import Console
console = Console()

# All paths are resolved relative to this script's directory.
BASE = Path(__file__).parent
PROFILES_DIR = BASE / "voice-profiles"
EPISODES_DIR = BASE / "test-data" / "episodes"
TRANS_DIR = BASE / "test-data" / "transcripts"

device = "cuda" if torch.cuda.is_available() else "cpu"
console.print(f"Device: {device}")
profiler = VoiceProfiler(PROFILES_DIR, device=device)

# Tom's known speech windows per episode, as (start_s, end_s) pairs in seconds.
# CALLER turns from diarization that are in the first 60 min (before real callers)
# Windows at 0-40s excluded (promo/jingle, not Tom's voice)
TOM_WINDOWS = {
    "2014-s6e19.mp3": [
        (195, 260),
        (320, 425),
        (600, 650),
        (675, 710),
    ],
    "2016-s8e43.mp3": [
        (100, 115),
        (135, 160),
        (270, 295),
        (575, 605),
        (1185, 1235),
        (1790, 1870),
        (2020, 2055),
    ],
}
COHOST_NAME = "Tom"

# Chunking constants, hoisted out of the per-episode loop (they never change).
# NOTE(review): slicing assumes _load_full_audio() returns 16 kHz samples — confirm.
SAMPLE_RATE = 16000
CHUNK_SECONDS = 10
CHUNK_SAMPLES = CHUNK_SECONDS * SAMPLE_RATE

# Reuse an existing profile if one is already loaded, otherwise start an empty one.
# NOTE(review): if the loaded profile already carries embeddings, this run appends
# to them — confirm whether a re-run is meant to extend or to rebuild the profile.
if COHOST_NAME not in profiler.profiles:
    profiler.profiles[COHOST_NAME] = SpeakerProfile(
        name=COHOST_NAME,
        role="cohost",
        embeddings=[],
        source_episodes=[],
    )
profile = profiler.profiles[COHOST_NAME]

console.print(f"\n[bold]Building co-host profile for: {COHOST_NAME}[/bold]")
for ep_name, windows in TOM_WINDOWS.items():
    ep_path = EPISODES_DIR / ep_name
    if not ep_path.exists():
        console.print(f"[yellow] Skipping {ep_name} — not found[/yellow]")
        continue
    console.print(f"\n Loading {ep_name}...")
    audio = profiler._load_full_audio(ep_path)
    profiler._get_model()

    added = 0  # embeddings contributed by this episode
    for win_start, win_end in windows:
        # The "+ 1" makes the stop inclusive, so a chunk that ends exactly at
        # win_end is kept (previously the last full chunk of such a window,
        # and any window of exactly CHUNK_SECONDS, was silently dropped).
        for chunk_start in range(win_start, win_end - CHUNK_SECONDS + 1, CHUNK_SECONDS):
            s = chunk_start * SAMPLE_RATE
            e = s + CHUNK_SAMPLES
            if e > len(audio):
                # Window runs past the end of the audio; later chunks would too.
                break
            try:
                emb = profiler._embed_audio_np(audio[s:e])
            except Exception as ex:
                # Best effort: report the failed chunk and keep going.
                console.print(f" [red]Failed @ {chunk_start}s: {ex}[/red]")
            else:
                profile.embeddings.append(emb)
                added += 1
                console.print(f" [dim]+1 embedding @ {chunk_start}s[/dim]")

    # Record the episode only if it actually contributed, and only once, so
    # re-running the script does not pile up duplicate source entries.
    if added and ep_name not in profile.source_episodes:
        profile.source_episodes.append(ep_name)

if not profile.embeddings:
    console.print("[red]No embeddings collected — check episode paths[/red]")
    sys.exit(1)

profile.compute_composite()
console.print(f"\n[green]Tom profile built: {profile.num_samples} embeddings "
              f"from {len(profile.source_episodes)} episodes[/green]")

# Verify: check cosine similarity vs Mike to ensure separation
mike = profiler.profiles.get("Mike Swanson")
if mike and mike.composite_embedding is not None and profile.composite_embedding is not None:
    # Cosine similarity; the 1e-8 term guards against a zero-norm denominator.
    sim = float(np.dot(mike.composite_embedding, profile.composite_embedding) /
                (np.linalg.norm(mike.composite_embedding) * np.linalg.norm(profile.composite_embedding) + 1e-8))
    console.print(f"Tom vs Mike similarity: {sim:.3f} (lower is better separation)")

profiler.save_profiles()
console.print("[bold green]Profile saved.[/bold green]")

View File

@@ -0,0 +1,102 @@
"""
Index the 6 test episodes into archive.db.
Reads pre-computed transcripts + diarization from test-data/transcripts/.
"""
import os, sys, re
os.environ["PYTHONIOENCODING"] = "utf-8"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8")
from pathlib import Path
from src.indexer import ArchiveIndex
from src.qa_extractor import load_diarized_transcript, extract_qa_pairs
from rich.console import Console
from rich.table import Table
console = Console()
BASE = Path(__file__).parent
TRANS_DIR = BASE / "test-data" / "transcripts"
EP_DIR = BASE / "test-data" / "episodes"
DB_PATH = BASE / "archive.db"
_DATE_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})")
def parse_episode_meta(ep_id: str) -> tuple[str, int | None]:
"""Return (date_str_or_year, hr) from episode directory name."""
m = _DATE_RE.match(ep_id)
if m:
date = m.group(1)
hr = int(ep_id[-1]) if ep_id.endswith(("-hr1", "-hr2")) else None
return date, hr
# season/episode format e.g. 2016-s8e43 — use year only
year = ep_id[:4]
return year, None
# Imported once here for the script body (previously re-imported on every
# loop iteration).
import json

console.print(f"\n[bold]Indexing test episodes into {DB_PATH.name}[/bold]")

with ArchiveIndex(DB_PATH) as idx:
    # One summary row per indexed episode: (id, date, duration, #segments, #pairs).
    rows = []
    for ep_dir in sorted(TRANS_DIR.iterdir()):
        t_path = ep_dir / "transcript.json"
        d_path = ep_dir / "diarization.json"
        if not t_path.exists():
            # Not a transcribed episode directory; nothing to index.
            continue

        ep_id = ep_dir.name
        date, hr = parse_episode_meta(ep_id)
        audio_path = EP_DIR / f"{ep_id}.mp3"

        # Episode duration from transcript. Read as UTF-8 explicitly so the
        # result does not depend on the platform's default encoding.
        with open(t_path, encoding="utf-8") as f:
            td = json.load(f)
        duration = td.get("duration", 0)

        # Register episode
        idx.add_episode(
            episode_id=ep_id,
            audio_path=audio_path,
            date=date,
            duration=duration,
            hr=hr,
        )

        # Load diarized segments and index; diarization is optional.
        segs = load_diarized_transcript(t_path, d_path if d_path.exists() else None)
        idx.add_segments(ep_id, segs)

        # Extract and index Q&A pairs
        pairs = extract_qa_pairs(segs)
        for p in pairs:
            idx.add_qa_pair(
                episode_id=ep_id,
                q_start=p.question_start, q_end=p.question_end,
                a_start=p.answer_start, a_end=p.answer_end,
                question=p.question_text, answer=p.answer_text,
                topic=p.topic, tags=p.topic_tags,
            )

        rows.append((ep_id, date, f"{duration:.0f}s", len(segs), len(pairs)))
        console.print(f" [green]{ep_id}[/green]: {len(segs)} segs, {len(pairs)} Q&A pairs")

    # Totals are read while the index is still open.
    stats = idx.stats()

table = Table(title="Index Summary")
table.add_column("Episode")
table.add_column("Date")
table.add_column("Duration")
table.add_column("Segments")
table.add_column("Q&A")
# Distinct row_* names so the per-episode loop variables are not re-bound.
for row_id, row_date, row_dur, row_segs, row_qa in rows:
    table.add_row(row_id, row_date, row_dur, str(row_segs), str(row_qa))

console.print()
console.print(table)
console.print(f"\n[bold]DB totals:[/bold] {stats['episodes']} episodes, "
              f"{stats['segments']} segments, {stats['qa_pairs']} Q&A pairs")
console.print(f"[dim]DB path: {DB_PATH}[/dim]")

View File

@@ -202,6 +202,8 @@ def diarize(audio_path: str | Path,
label = seg.speaker_label.split(" (")[0] # strip confidence score
if label.startswith("Host:") or label.startswith("Host "):
speaker = "HOST"
elif label.startswith("Cohost:"):
speaker = "CO-HOST"
elif label == "[error]":
speaker = "UNKNOWN"
else:

View File

@@ -53,10 +53,12 @@ _PROMO_SIGS: list[tuple[re.Pattern, int]] = [
(re.compile(r"\bcomputer running slow\b", re.I), 1),
(re.compile(r"\bafter these messages\b", re.I), 1),
(re.compile(r"\b790.?2040\b", re.I), 1),
(re.compile(r"\b751.?1041\b", re.I), 1),
(re.compile(r"\bgurushow\.com\b", re.I), 1),
(re.compile(r"\bcall in now\b", re.I), 1),
(re.compile(r"\bcomputer troubles\?", re.I), 1),
(re.compile(r"\bhardware installation\b", re.I), 1),
(re.compile(r"we.?ll get your problem solved", re.I), 1),
]
@@ -127,10 +129,19 @@ def extract_qa_pairs(diarized_segments: list[dict]) -> list[QAPair]:
if _is_promo_or_bumper(turn["text"]):
i += 1
continue
# Skip the opening 90s — real callers never call before the show starts
if turn["start"] < 90:
i += 1
continue
q_duration = turn["end"] - turn["start"]
if q_duration < MIN_QUESTION_DURATION:
i += 1
continue
# Require caller-intro context: host must have introduced the call, OR
# the caller opens with a phone greeting ("hello", "hi", "hey")
if not _preceded_by_caller_intro(turns, i) and not _PHONE_GREETING.match(turn["text"].strip()):
i += 1
continue
# Look ahead for HOST answer turn(s)
j = i + 1
@@ -329,25 +340,71 @@ def load_diarized_transcript(transcript_path: Path,
with open(diarization_path) as f:
diarization = json.load(f)
turns = diarization.get("turns", [])
raw_turns = diarization.get("turns", [])
def speaker_at(t: float) -> str:
"""Find which diarization turn covers time t."""
# Resolve overlapping boundaries left by the sliding-window diarizer:
# place each transition at the midpoint of the overlap region.
resolved: list[dict] = []
for turn in sorted(raw_turns, key=lambda t: t["start"]):
if not resolved:
resolved.append(dict(turn))
continue
prev = resolved[-1]
if turn["start"] < prev["end"]:
mid = (turn["start"] + prev["end"]) / 2
prev["end"] = mid
resolved.append({**turn, "start": mid})
else:
resolved.append(dict(turn))
turns = resolved
# Minimum CALLER coverage to label a transcript segment as CALLER.
# Batch transcription produces ~25s segments; caller windows are 10s.
# Require 4s of CALLER overlap so brief HOST-edge segments aren't over-claimed.
_CALLER_MIN_S = 4.0
def speaker_for_segment(seg_start: float, seg_end: float) -> str:
caller_cov = 0.0
coverage: dict[str, float] = {}
for turn in turns:
if turn["start"] <= t <= turn["end"]:
return turn["speaker"]
overlap = min(seg_end, turn["end"]) - max(seg_start, turn["start"])
if overlap <= 0:
continue
coverage[turn["speaker"]] = coverage.get(turn["speaker"], 0) + overlap
if turn["speaker"] == "CALLER":
caller_cov += overlap
if not coverage:
return "UNKNOWN"
if caller_cov >= _CALLER_MIN_S:
return "CALLER"
return max(coverage, key=coverage.__getitem__)
return [
{"start": s["start"], "end": s["end"],
"text": s["text"],
"speaker": speaker_at((s["start"] + s["end"]) / 2)}
"speaker": speaker_for_segment(s["start"], s["end"])}
for s in segments
]
# ── Helpers ────────────────────────────────────────────────────────────────
_PHONE_GREETING = re.compile(r"^(hello|hi|hey|good (morning|afternoon|evening))\b", re.IGNORECASE)
def _preceded_by_caller_intro(turns: list[dict], idx: int, max_host_turns: int = 2) -> bool:
    """Tell whether a recent HOST turn before ``turns[idx]`` introduced a caller.

    Walks backwards from the turn just before ``idx``, ignoring non-HOST
    turns. Each HOST turn's text is searched with ``_CALLER_INTRO``; the first
    hit returns True. The walk gives up once ``max_host_turns`` HOST turns
    have been examined without a hit, or when the start of ``turns`` is reached.
    """
    hosts_checked = 0
    pos = idx - 1
    while pos >= 0:
        candidate = turns[pos]
        pos -= 1
        if candidate["speaker"] != "HOST":
            # Only HOST turns count toward the lookback budget.
            continue
        if _CALLER_INTRO.search(candidate["text"]):
            return True
        hosts_checked += 1
        if hosts_checked >= max_host_turns:
            return False
    return False
def _looks_like_question(text: str) -> bool:
    """Return True when ``QUESTION_PATTERN`` finds a match anywhere in *text*."""
    found = QUESTION_PATTERN.search(text)
    return True if found else False

View File

@@ -113,61 +113,60 @@ def _format_srt_time(seconds: float) -> str:
def transcribe(audio_path: str | Path, model_size: str = "large-v3",
language: str = "en", device: str = "cuda") -> Transcript:
"""Transcribe an audio file using faster-whisper."""
from faster_whisper import WhisperModel
language: str = "en", device: str = "cuda",
batch_size: int = 16) -> Transcript:
"""Transcribe an audio file using faster-whisper.
Uses BatchedInferencePipeline + int8_float16 + VAD for archive/batch work.
Word timestamps are skipped in batch mode (not needed for segment-level search).
Pass batch_size=0 to fall back to sequential WhisperModel with word timestamps.
"""
from faster_whisper import WhisperModel, BatchedInferencePipeline
audio_path = Path(audio_path)
use_batched = batch_size > 0
console.print(f"[bold]Transcribing:[/bold] {audio_path.name}")
console.print(f"[dim]Model: {model_size}, Device: {device}[/dim]")
console.print(
f"[dim]Model: {model_size} | "
f"{'batched x' + str(batch_size) + ' int8_float16' if use_batched else 'sequential float16'} | "
f"Device: {device}[/dim]"
)
if use_batched:
base_model = WhisperModel(model_size, device=device, compute_type="int8_float16")
model = BatchedInferencePipeline(model=base_model)
segments_raw, info = model.transcribe(
str(audio_path),
language=language,
batch_size=batch_size,
)
else:
model = WhisperModel(model_size, device=device, compute_type="float16")
segments_raw, info = model.transcribe(
str(audio_path),
language=language,
word_timestamps=True,
vad_filter=True,
vad_parameters=dict(
min_silence_duration_ms=500,
speech_pad_ms=200,
),
vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
)
console.print(f"[dim]Detected language: {info.language} "
f"(probability: {info.language_probability:.2f})[/dim]")
console.print(f"[dim]Duration: {info.duration:.1f}s "
f"({info.duration / 60:.1f} min)[/dim]")
console.print(f"[dim]Duration: {info.duration:.1f}s ({info.duration / 60:.1f} min)[/dim]")
segments = []
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("{task.completed} segments"),
TimeElapsedColumn(),
console=console,
) as progress:
task = progress.add_task("Transcribing...", total=None)
for i, seg in enumerate(segments_raw):
words = []
if not use_batched:
words = [
TranscriptWord(
word=w.word,
start=w.start,
end=w.end,
probability=w.probability,
)
TranscriptWord(word=w.word, start=w.start,
end=w.end, probability=w.probability)
for w in (seg.words or [])
]
segments.append(TranscriptSegment(
id=i,
text=seg.text,
start=seg.start,
end=seg.end,
words=words,
id=i, text=seg.text, start=seg.start, end=seg.end, words=words,
))
progress.update(task, completed=i + 1)
if i % 50 == 0:
console.print(f"[dim] {i} segments... ({seg.end:.0f}s)[/dim]")
console.print(f"[green]Transcription complete: {len(segments)} segments[/green]")

View File

@@ -319,8 +319,11 @@ class VoiceProfiler:
best_match = name
if best_score >= threshold:
if best_match and self.profiles[best_match].role == "host":
role = self.profiles[best_match].role if best_match else "unknown"
if role == "host":
label = f"Host: {best_match}"
elif role == "cohost":
label = f"Cohost: {best_match}"
else:
label = best_match
else:

View File

@@ -22,5 +22,13 @@
"2018-s10e17.mp3",
"2018-s10e21.mp3"
]
},
"Tom": {
"role": "cohost",
"num_samples": 44,
"source_episodes": [
"2014-s6e19.mp3",
"2016-s8e43.mp3"
]
}
}

View File

@@ -0,0 +1,251 @@
# Session Log: Q&A Extraction — Co-Host Profile + Archive Indexing
**Date:** 2026-04-27
**Project:** Radio Show Archive Mining — Computer Guru Show
---
## User
- **User:** Mike Swanson (mike)
- **Machine:** DESKTOP-0O8A1RL
- **Role:** admin
---
## Session Summary
The session began with resuming work following a benchmark run that demonstrated a significant performance improvement in Whisper transcription, achieving 63.8x real-time speed with batched inference and int8_float16 settings. Next, the focus shifted to evaluating the quality of Q&A extraction across six test episodes, revealing a critical issue with false positives due to co-host Tom being mislabeled as CALLER based on a voice similarity threshold.
A co-host voice profile for Tom was constructed using 44 embeddings from two specific episodes (2014-s6e19 and 2016-s8e43), producing a cosine similarity of 0.698 against Mike — well below Mike's 0.85 threshold, giving clean separation. Code was updated in `voice_profiler.py` and `diarizer.py` to correctly emit "Cohost: Tom" labels and map them to a new "CO-HOST" speaker tag. Re-diarizing the two co-host-era episodes dramatically cleaned up Q&A results: 2016 went from 12 false positives to 2 real WiFi caller pairs.
Several bugs in `qa_extractor.py` were fixed: overlap resolution for sliding-window diarization boundaries, CALLER-preference threshold for long batch transcript segments, and a turn-based caller-intro lookback to replace an ineffective 120s time window. Phone-greeting detection and new promo signatures were added. The final Q&A count landed at 10 pairs across 6 episodes, with 2014 correctly yielding 0 (gaming co-host episode with no actual callers).
`archive.db` was created with the ArchiveIndex schema (episodes, segments, segments_fts, qa_pairs, qa_fts). All 6 test episodes were indexed: 762 segments, 10 Q&A pairs. FTS5 search verified working for "router", "Windows 10", "Internet Explorer", "antivirus", and "connect" queries.
---
## Key Decisions
- **Co-host threshold uses same 0.85 bar as host**: Tom scores 0.698 vs Mike. Any voice >= 0.85 against Tom's composite gets labeled CO-HOST. Keeps the same single threshold for all profiles rather than per-profile thresholds.
- **Turn-based lookback for caller-intro (2 HOST turns, not 120s)**: Long HOST monologue blocks (8-10 min) in big show segments meant time-based lookback missed the caller introduction. Previous 2 HOST turns always catches it regardless of block length.
- **CALLER-preference at 4s minimum overlap**: Batch transcription produces ~26s segments; diarization CALLER windows are ~10s. Pure majority-vote always gave HOST. 4s minimum CALLER coverage labels the segment CALLER without being overly aggressive for co-host episodes.
- **Midpoint boundary resolution at load time**: Rather than re-diarizing everything, the sliding-window overlap is resolved in `load_diarized_transcript()` so it applies retroactively to all saved diarization files without touching the JSON.
- **751-1041 added as promo signal**: Earlier Tucson show number (vs 790-2040 in later seasons). Weighted 1 (needs a second semi-generic signal to filter).
- **Tom's windows sourced from first 60 min of co-host episodes**: Real callers don't call in during the first hour of a 2-hour show (the only exceptions occur at the very end of the show), so first-hour CALLER windows can safely be treated as Tom's.
---
## Problems Encountered
- **2016-s8e43 had 12 Q&A pairs, 11 false positives**: Root cause was Tom (co-host) labeled CALLER throughout. Fixed by building Tom's voice profile and re-diarizing.
- **2014-s6e19 had 2 Q&A pairs from gaming discussion**: Same co-host issue. After re-diarization: 0 pairs (correct — no actual callers in that gaming special).
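- **2012-03-10 yielded 0 segments labeled CALLER**: The old assignment looked up the speaker at each transcript segment's midpoint and took the first diarization turn covering it, so in overlap regions it landed on HOST (e.g. with HOST 0-20s and CALLER 15-30s, a segment midpoint of 15.1s falls in the HOST turn). Fixed by coverage-based assignment: a segment is labeled CALLER when at least 4s of it overlaps CALLER turns.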
- **Real WiFi caller (2016, ~4794s) was missing after first fix attempt**: Aggressive time-based lookback (120s) combined with short CALLER turns from sliding-window diarization caused the caller question to land in a HOST segment. Fixed by turn-based lookback + co-host profile (eliminated Tom noise, letting real caller windows survive).
- **2012-Jun pair at 1325s was a promo**: "The Computer Guru. We'll get your problem solved. Call 751-1041 today" passed promo filter. Fixed by adding 751-1041 and "we'll get your problem solved" as promo signatures.
---
## Files Created / Modified
### New files
```
projects/radio-show/audio-processor/build_cohost_profile.py
projects/radio-show/audio-processor/index_test_episodes.py
projects/radio-show/audio-processor/archive.db
projects/radio-show/audio-processor/voice-profiles/tom/
projects/radio-show/audio-processor/voice-profiles/profiles.json (updated: Tom added)
projects/radio-show/session-logs/2026-04-27-qa-extraction-cohost-indexing.md (this file)
```
### Modified
```
src/voice_profiler.py — emit "Cohost: <name>" label for cohost role
src/diarizer.py — map "Cohost:" prefix to "CO-HOST" speaker
src/qa_extractor.py — overlap resolution, CALLER-preference, turn-based
caller-intro lookback, _preceded_by_caller_intro(),
_PHONE_GREETING, 751-1041 + promo sig additions
test-data/transcripts/2014-s6e19/diarization.json (re-diarized with Tom profile)
test-data/transcripts/2016-s8e43/diarization.json (re-diarized with Tom profile)
```
---
## Benchmark Results (from previous run — baseline for BEAST comparison)
**Machine:** DESKTOP-0O8A1RL — NVIDIA GeForce RTX 5070 Ti Laptop GPU
| Episode | Audio | Wall (diarize) | RTF |
|---------|-------|----------------|-----|
| 2011-03-12-hr1 | 2509s | 15.1s | 166.1x |
| 2012-03-10-hr1 | 2634s | 12.2s | 215.5x |
| 2012-06-09-hr1 | 2648s | 12.2s | 216.8x |
| 2014-s6e19 | 2914s | 13.4s | 216.9x |
| 2016-s8e43 | 5326s | 24.2s | 219.6x |
| 2017-s9e30 | 5343s | 24.7s | 216.4x |
| **TOTAL** | **21374s** | **101.9s** | **209.7x** |
Transcription (batched Whisper large-v3): 63.8x realtime
Diarization: 209.7x realtime
vs DESKTOP-0O8A1RL baseline (149.5x): **+60.2x (+40.3%)**
---
## Archive DB State
**Path:** `projects/radio-show/audio-processor/archive.db`
```
Episodes : 6
Segments : 762
Q&A pairs: 10
```
**Q&A pairs by episode:**
| Episode | Pairs | Notes |
|---------|-------|-------|
| 2011-03-12-hr1 | 3 | IE lockout call, cloud computing, ghost hunting caller |
| 2012-03-10-hr1 | 1 | iPad 3 discussion |
| 2012-06-09-hr1 | 1 | Windows repair feature call |
| 2014-s6e19 | 0 | Gaming co-host special — no actual callers |
| 2016-s8e43 | 2 | WiFi connectivity caller (2 turns of same call) |
| 2017-s9e30 | 3 | Software control, Cat5 cabling (Charlie), WiFi ports |
---
## Voice Profiles State
**Path:** `projects/radio-show/audio-processor/voice-profiles/`
| Name | Role | Embeddings | Source Episodes |
|------|------|-----------|-----------------|
| Mike Swanson | host | 180 | 9 episodes (2010-2018) |
| Tom | cohost | 44 | 2014-s6e19, 2016-s8e43 |
Tom vs Mike cosine similarity: **0.698** (well-separated at 0.85 threshold)
**Tom's source windows used:**
- 2014-s6e19: 195-260s, 320-425s, 600-650s, 675-710s
- 2016-s8e43: 100-115s, 135-160s, 270-295s, 575-605s, 1185-1235s, 1790-1870s, 2020-2055s
---
## Co-Host Era Notes
Tom was the regular in-studio co-host/board-op roughly 2013-2016. His voice is in episodes from at least 2014 through 2016 (confirmed from test set). The 2011 and 2012 episodes are pure call-in format with no co-host.
If there are occasional guest co-hosts or fill-in hosts in other years, they would still be labeled CALLER until profiled. These would be rare and would likely not form question patterns that survive the caller-intro gate.
---
## Pending Tasks for BEAST (GURU-BEAST-ROG)
### 1. Run benchmark.py to establish RTX 4090 baseline
```bash
cd D:/claudetools/projects/radio-show/audio-processor
.venv/Scripts/python benchmark.py 2>&1 | tee bench-4090.txt
```
BENCH_SETUP.md has all setup steps. The voice profiles are in `voice-profiles/` (already copied or available via Tailscale/robocopy from DESKTOP-0O8A1RL). Test episodes go in `test-data/episodes/`.
Expected: diarization RTF should be ~250-300x on RTX 4090 (vs 209.7x on laptop 5070 Ti). Transcription should be ~70-80x.
Update `benchmark.py` line 27 after measuring:
```python
BASELINE_RTF = 209.7 # current laptop 5070 Ti baseline
```
### 2. Download full archive from IX server (172.16.3.10)
Use paramiko (SSH with key agent disabled):
```python
import paramiko
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect("172.16.3.10", username="gurushow", password="<from vault>",
look_for_keys=False, allow_agent=False)
```
Archive path: `/home/gurushow/public_html/archive/Radio/`
Episode count: 579 MP3s across 2010-2018 (no 2013 season)
Approximate total size: ~30-40 GB
Download script skeleton in prior session log: `2026-04-27-diarization-pipeline.md`
**Tailscale required** — IX server is at 172.16.3.10, requires VPN.
### 3. Full archive processing
Once episodes are downloaded:
```bash
# Transcribe + diarize all episodes
cd D:/claudetools/projects/radio-show/audio-processor
.venv/Scripts/python diarize_training.py # or a new batch_process_all.py
# Index everything into archive.db
.venv/Scripts/python index_test_episodes.py # modify to point at full episodes dir
```
The pipeline is idempotent — `add_segments()` skips episodes already indexed.
### 4. Verify co-host era episodes
2013-2016 era episodes should now correctly separate Tom (CO-HOST) from actual callers. Spot-check a few 2015 episodes after processing to confirm Tom's profile generalizes well.
If any 2015/2016 episodes show too many CALLER turns that are clearly Tom (voice changed slightly over years), re-run `build_cohost_profile.py` with windows from that episode added to TOM_WINDOWS dict.
---
## Technical Reference
### Key thresholds
```python
host_match_threshold = 0.85 # WavLM cosine similarity — applied to ALL profiles
_CALLER_MIN_S = 4.0 # min CALLER coverage in transcript segment to label CALLER (local to load_diarized_transcript)
PROMO_SCORE_THRESHOLD = 2 # weighted promo signature score
MIN_QUESTION_DURATION = 5.0 # seconds
MIN_ANSWER_DURATION = 15.0 # seconds
MAX_GAP_BETWEEN_QA = 30.0 # seconds
```
### Diarization sliding window
```python
window_s = 10.0 # 10s embedding windows
hop_s = 5.0 # 5s hop → overlapping boundaries (resolved at load time)
```
### Transcription (batch mode)
```python
model_size = "large-v3"
compute_type = "int8_float16"
batch_size = 16
# No word timestamps in batch mode (not needed for search/diarization)
```
### DB search examples
```python
from src.indexer import ArchiveIndex
from pathlib import Path
with ArchiveIndex(Path("archive.db")) as idx:
# Segment search
results = idx.search("router", limit=20)
results = idx.search("Windows 10", speaker_filter="HOST", limit=10)
# Q&A search
qa = idx.search_qa("antivirus", limit=10)
qa = idx.search_qa("wifi connect", limit=10)
```
### Archive server
```
Host: 172.16.3.10 (requires Tailscale)
User: gurushow
Archive root: /home/gurushow/public_html/archive/Radio/
SSH: paramiko with look_for_keys=False, allow_agent=False
```