radio show: co-host voice profile, Q&A extraction fixes, archive index

- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike)
- diarizer.py: add CO-HOST speaker label for cohost-role profiles
- voice_profiler.py: emit "Cohost: <name>" label for cohost role
- qa_extractor.py: overlap resolution at load time (midpoint boundary split),
  4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns),
  _preceded_by_caller_intro() helper, _PHONE_GREETING pattern,
  751-1041 + "we'll get your problem solved" promo signatures
- benchmark.py: use src.transcriber.transcribe with batch_size=16
- add index_test_episodes.py and build_cohost_profile.py scripts
- add .gitignore (exclude episodes, transcripts, *.db, .venv)
- session log: 2026-04-27-qa-extraction-cohost-indexing.md

Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs.
archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 14:41:04 -07:00
parent 79abef9dc9
commit e9ac607500
55 changed files with 649 additions and 100 deletions
--- a/projects/radio-show/audio-processor/.gitignore
+++ b/projects/radio-show/audio-processor/.gitignore
@@ -0,0 +1,25 @@
+# Python
+__pycache__/
+*.pyc
+*.pyo
+.venv/
+*.egg-info/
+
+# Large data files
+test-data/episodes/
+test-data/transcripts/
+episodes/
+processed/
+
+# Databases (regenerable)
+*.db
+*.sqlite
+
+# Model cache
+.cache/
+*.pt
+*.bin
+
+# OS
+.DS_Store
+Thumbs.db
--- a/projects/radio-show/audio-processor/benchmark.py
+++ b/projects/radio-show/audio-processor/benchmark.py
@@ -57,13 +57,15 @@ trans_results = []
 trans_total_audio = 0.0
 trans_total_wall  = 0.0

+import json
+from src.transcriber import transcribe as _transcribe
+
 for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
-        import json
        with open(transcript_path) as f:
            td = json.load(f)
        dur = td.get("duration", 0)
@@ -74,30 +76,15 @@ for ep in EPISODES:
    console.print(f"  Transcribing {ep.name}...")
    t0 = time.monotonic()

-    from faster_whisper import WhisperModel
-    if not hasattr(sys, "_whisper_model"):
-        console.print("  [dim]Loading Whisper large-v3...[/dim]")
-        sys._whisper_model = WhisperModel("large-v3", device=device, compute_type="float16")
-
-    model = sys._whisper_model
-    segments_iter, info = model.transcribe(str(ep), language="en", beam_size=5)
-
-    import json
-    segs = []
-    for seg in segments_iter:
-        segs.append({"id": seg.id, "start": seg.start, "end": seg.end, "text": seg.text})
-
-    duration = info.duration
+    transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
    wall = time.monotonic() - t0
-    rtf  = duration / wall
+    rtf  = transcript.duration / wall

-    result = {"duration": duration, "language": "en", "segments": segs}
-    with open(transcript_path, "w") as f:
-        json.dump(result, f)
+    transcript.save(trans_ep_dir)

-    console.print(f"  [green]{ep.stem}: {duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
-    trans_results.append((ep, transcript_path, duration, wall))
-    trans_total_audio += duration
+    console.print(f"  [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
+    trans_results.append((ep, transcript_path, transcript.duration, wall))
+    trans_total_audio += transcript.duration
    trans_total_wall  += wall

 if trans_total_wall > 0:
--- a/projects/radio-show/audio-processor/build_cohost_profile.py
+++ b/projects/radio-show/audio-processor/build_cohost_profile.py
@@ -0,0 +1,115 @@
+"""
+Build voice profile for Tom (co-host) from known co-host speech windows.
+
+Uses CALLER-labeled windows from the first 60 min of co-host-era episodes,
+before any real callers would have called in.
+"""
+import os, sys
+os.environ["PYTHONIOENCODING"] = "utf-8"
+os.environ["TRANSFORMERS_OFFLINE"] = "1"
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")
+
+from pathlib import Path
+import json
+import numpy as np
+from src.gpu import ensure_cuda_libs
+ensure_cuda_libs()
+
+import torch
+from src.voice_profiler import VoiceProfiler, SpeakerProfile
+from rich.console import Console
+
+console = Console()
+
+BASE = Path(__file__).parent
+PROFILES_DIR = BASE / "voice-profiles"
+EPISODES_DIR = BASE / "test-data" / "episodes"
+TRANS_DIR = BASE / "test-data" / "transcripts"
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+console.print(f"Device: {device}")
+
+profiler = VoiceProfiler(PROFILES_DIR, device=device)
+
+# Tom's known speech windows per episode
+# CALLER turns from diarization that are in the first 60 min (before real callers)
+# Windows at 0-40s excluded (promo/jingle, not Tom's voice)
+TOM_WINDOWS = {
+    "2014-s6e19.mp3": [
+        (195, 260),
+        (320, 425),
+        (600, 650),
+        (675, 710),
+    ],
+    "2016-s8e43.mp3": [
+        (100, 115),
+        (135, 160),
+        (270, 295),
+        (575, 605),
+        (1185, 1235),
+        (1790, 1870),
+        (2020, 2055),
+    ],
+}
+
+COHOST_NAME = "Tom"
+
+if COHOST_NAME not in profiler.profiles:
+    profiler.profiles[COHOST_NAME] = SpeakerProfile(
+        name=COHOST_NAME,
+        role="cohost",
+        embeddings=[],
+        source_episodes=[],
+    )
+
+profile = profiler.profiles[COHOST_NAME]
+console.print(f"\n[bold]Building co-host profile for: {COHOST_NAME}[/bold]")
+
+for ep_name, windows in TOM_WINDOWS.items():
+    ep_path = EPISODES_DIR / ep_name
+    if not ep_path.exists():
+        console.print(f"[yellow]  Skipping {ep_name} — not found[/yellow]")
+        continue
+
+    console.print(f"\n  Loading {ep_name}...")
+    audio = profiler._load_full_audio(ep_path)
+    profiler._get_model()
+
+    SAMPLE_RATE = 16000
+    chunk_s = 10.0
+    chunk_samples = int(chunk_s * SAMPLE_RATE)
+
+    for win_start, win_end in windows:
+        for chunk_start in range(win_start, win_end - int(chunk_s), int(chunk_s)):
+            chunk_end = chunk_start + int(chunk_s)
+            s = int(chunk_start * SAMPLE_RATE)
+            e = s + chunk_samples
+            if e > len(audio):
+                break
+            try:
+                emb = profiler._embed_audio_np(audio[s:e])
+                profile.embeddings.append(emb)
+                console.print(f"    [dim]+1 embedding @ {chunk_start}s[/dim]")
+            except Exception as ex:
+                console.print(f"    [red]Failed @ {chunk_start}s: {ex}[/red]")
+
+    profile.source_episodes.append(ep_name)
+
+if not profile.embeddings:
+    console.print("[red]No embeddings collected — check episode paths[/red]")
+    sys.exit(1)
+
+profile.compute_composite()
+console.print(f"\n[green]Tom profile built: {profile.num_samples} embeddings "
+              f"from {len(profile.source_episodes)} episodes[/green]")
+
+# Sanity check: report cosine similarity vs Mike (informational only; nothing is enforced)
+mike = profiler.profiles.get("Mike Swanson")
+if mike and mike.composite_embedding is not None and profile.composite_embedding is not None:
+    sim = float(np.dot(mike.composite_embedding, profile.composite_embedding) /
+                (np.linalg.norm(mike.composite_embedding) * np.linalg.norm(profile.composite_embedding) + 1e-8))
+    console.print(f"Tom vs Mike similarity: {sim:.3f} (lower is better separation)")
+
+profiler.save_profiles()
+console.print("[bold green]Profile saved.[/bold green]")
--- a/projects/radio-show/audio-processor/index_test_episodes.py
+++ b/projects/radio-show/audio-processor/index_test_episodes.py
@@ -0,0 +1,102 @@
+"""
+Index every test episode that has a transcript.json into archive.db (6 at the time of writing).
+Reads pre-computed transcripts + diarization from test-data/transcripts/.
+"""
+import os, sys, re
+os.environ["PYTHONIOENCODING"] = "utf-8"
+os.environ["TRANSFORMERS_OFFLINE"] = "1"
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8")
+
+from pathlib import Path
+from src.indexer import ArchiveIndex
+from src.qa_extractor import load_diarized_transcript, extract_qa_pairs
+from rich.console import Console
+from rich.table import Table
+
+console = Console()
+
+BASE      = Path(__file__).parent
+TRANS_DIR = BASE / "test-data" / "transcripts"
+EP_DIR    = BASE / "test-data" / "episodes"
+DB_PATH   = BASE / "archive.db"
+
+_DATE_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})")
+
+
+def parse_episode_meta(ep_id: str) -> tuple[str, int | None]:
+    """Return (date_str_or_year, hr) from episode directory name."""
+    m = _DATE_RE.match(ep_id)
+    if m:
+        date = m.group(1)
+        hr = int(ep_id[-1]) if ep_id.endswith(("-hr1", "-hr2")) else None
+        return date, hr
+    # season/episode format e.g. 2016-s8e43 — use year only
+    year = ep_id[:4]
+    return year, None
+
+
+console.print(f"\n[bold]Indexing test episodes into {DB_PATH.name}[/bold]")
+
+with ArchiveIndex(DB_PATH) as idx:
+    rows = []
+
+    for ep_dir in sorted(TRANS_DIR.iterdir()):
+        t_path = ep_dir / "transcript.json"
+        d_path = ep_dir / "diarization.json"
+        if not t_path.exists():
+            continue
+
+        ep_id = ep_dir.name
+        date, hr = parse_episode_meta(ep_id)
+        audio_path = EP_DIR / f"{ep_id}.mp3"
+
+        # Episode duration from transcript
+        import json
+        with open(t_path) as f:
+            td = json.load(f)
+        duration = td.get("duration", 0)
+
+        # Register episode
+        idx.add_episode(
+            episode_id=ep_id,
+            audio_path=audio_path,
+            date=date,
+            duration=duration,
+            hr=hr,
+        )
+
+        # Load diarized segments and index
+        segs = load_diarized_transcript(t_path, d_path if d_path.exists() else None)
+        idx.add_segments(ep_id, segs)
+
+        # Extract and index Q&A pairs
+        pairs = extract_qa_pairs(segs)
+        for p in pairs:
+            idx.add_qa_pair(
+                episode_id=ep_id,
+                q_start=p.question_start, q_end=p.question_end,
+                a_start=p.answer_start,  a_end=p.answer_end,
+                question=p.question_text, answer=p.answer_text,
+                topic=p.topic, tags=p.topic_tags,
+            )
+
+        rows.append((ep_id, date, f"{duration:.0f}s", len(segs), len(pairs)))
+        console.print(f"  [green]{ep_id}[/green]: {len(segs)} segs, {len(pairs)} Q&A pairs")
+
+    stats = idx.stats()
+
+table = Table(title="Index Summary")
+table.add_column("Episode")
+table.add_column("Date")
+table.add_column("Duration")
+table.add_column("Segments")
+table.add_column("Q&A")
+for ep_id, date, dur, segs, qa in rows:
+    table.add_row(ep_id, date, dur, str(segs), str(qa))
+
+console.print()
+console.print(table)
+console.print(f"\n[bold]DB totals:[/bold] {stats['episodes']} episodes, "
+              f"{stats['segments']} segments, {stats['qa_pairs']} Q&A pairs")
+console.print(f"[dim]DB path: {DB_PATH}[/dim]")
--- a/projects/radio-show/audio-processor/src/diarizer.py
+++ b/projects/radio-show/audio-processor/src/diarizer.py
@@ -202,6 +202,8 @@ def diarize(audio_path: str | Path,
        label = seg.speaker_label.split(" (")[0]  # strip confidence score
        if label.startswith("Host:") or label.startswith("Host "):
            speaker = "HOST"
+        elif label.startswith("Cohost:"):
+            speaker = "CO-HOST"
        elif label == "[error]":
            speaker = "UNKNOWN"
        else:
--- a/projects/radio-show/audio-processor/src/qa_extractor.py
+++ b/projects/radio-show/audio-processor/src/qa_extractor.py
@@ -53,10 +53,12 @@ _PROMO_SIGS: list[tuple[re.Pattern, int]] = [
    (re.compile(r"\bcomputer running slow\b",        re.I), 1),
    (re.compile(r"\bafter these messages\b",         re.I), 1),
    (re.compile(r"\b790.?2040\b",                   re.I), 1),
+    (re.compile(r"\b751.?1041\b",                   re.I), 1),
    (re.compile(r"\bgurushow\.com\b",               re.I), 1),
    (re.compile(r"\bcall in now\b",                  re.I), 1),
    (re.compile(r"\bcomputer troubles\?",            re.I), 1),
    (re.compile(r"\bhardware installation\b",        re.I), 1),
+    (re.compile(r"we.?ll get your problem solved",  re.I), 1),
 ]


@@ -127,10 +129,19 @@ def extract_qa_pairs(diarized_segments: list[dict]) -> list[QAPair]:
            if _is_promo_or_bumper(turn["text"]):
                i += 1
                continue
+            # Skip the opening 90s — real callers never call before the show starts
+            if turn["start"] < 90:
+                i += 1
+                continue
            q_duration = turn["end"] - turn["start"]
            if q_duration < MIN_QUESTION_DURATION:
                i += 1
                continue
+            # Require caller-intro context: host must have introduced the call, OR the caller
+            # opens with a phone greeting ("hello", "hi", "hey", "good morning/afternoon/evening")
+            if not _preceded_by_caller_intro(turns, i) and not _PHONE_GREETING.match(turn["text"].strip()):
+                i += 1
+                continue

            # Look ahead for HOST answer turn(s)
            j = i + 1
@@ -329,25 +340,71 @@ def load_diarized_transcript(transcript_path: Path,
    with open(diarization_path) as f:
        diarization = json.load(f)

-    turns = diarization.get("turns", [])
+    raw_turns = diarization.get("turns", [])

-    def speaker_at(t: float) -> str:
-        """Find which diarization turn covers time t."""
+    # Resolve overlapping boundaries left by the sliding-window diarizer:
+    # place each transition at the midpoint of the overlap region.
+    resolved: list[dict] = []
+    for turn in sorted(raw_turns, key=lambda t: t["start"]):
+        if not resolved:
+            resolved.append(dict(turn))
+            continue
+        prev = resolved[-1]
+        if turn["start"] < prev["end"]:
+            mid = (turn["start"] + prev["end"]) / 2
+            prev["end"] = mid
+            resolved.append({**turn, "start": mid})
+        else:
+            resolved.append(dict(turn))
+    turns = resolved
+
+    # Minimum CALLER coverage to label a transcript segment as CALLER.
+    # Batch transcription produces ~25s segments; caller windows are 10s.
+    # Require 4s of CALLER overlap so brief HOST-edge segments aren't over-claimed.
+    _CALLER_MIN_S = 4.0
+
+    def speaker_for_segment(seg_start: float, seg_end: float) -> str:
+        caller_cov = 0.0
+        coverage: dict[str, float] = {}
        for turn in turns:
-            if turn["start"] <= t <= turn["end"]:
-                return turn["speaker"]
+            overlap = min(seg_end, turn["end"]) - max(seg_start, turn["start"])
+            if overlap <= 0:
+                continue
+            coverage[turn["speaker"]] = coverage.get(turn["speaker"], 0) + overlap
+            if turn["speaker"] == "CALLER":
+                caller_cov += overlap
+        if not coverage:
            return "UNKNOWN"
+        if caller_cov >= _CALLER_MIN_S:
+            return "CALLER"
+        return max(coverage, key=coverage.__getitem__)

    return [
        {"start": s["start"], "end": s["end"],
         "text": s["text"],
-         "speaker": speaker_at((s["start"] + s["end"]) / 2)}
+         "speaker": speaker_for_segment(s["start"], s["end"])}
        for s in segments
    ]


 # ── Helpers ────────────────────────────────────────────────────────────────

+_PHONE_GREETING = re.compile(r"^(hello|hi|hey|good (morning|afternoon|evening))\b", re.IGNORECASE)
+
+
+def _preceded_by_caller_intro(turns: list[dict], idx: int, max_host_turns: int = 2) -> bool:
+    """Return True if a preceding HOST turn (within max_host_turns HOST turns) contains a caller-intro phrase."""
+    host_count = 0
+    for j in range(idx - 1, -1, -1):
+        if turns[j]["speaker"] == "HOST":
+            if _CALLER_INTRO.search(turns[j]["text"]):
+                return True
+            host_count += 1
+            if host_count >= max_host_turns:
+                break
+    return False
+
+
 def _looks_like_question(text: str) -> bool:
    return bool(QUESTION_PATTERN.search(text))

--- a/projects/radio-show/audio-processor/src/transcriber.py
+++ b/projects/radio-show/audio-processor/src/transcriber.py
@@ -113,61 +113,60 @@ def _format_srt_time(seconds: float) -> str:


 def transcribe(audio_path: str | Path, model_size: str = "large-v3",
-               language: str = "en", device: str = "cuda") -> Transcript:
-    """Transcribe an audio file using faster-whisper."""
-    from faster_whisper import WhisperModel
+               language: str = "en", device: str = "cuda",
+               batch_size: int = 16) -> Transcript:
+    """Transcribe an audio file using faster-whisper.
+
+    Uses BatchedInferencePipeline + int8_float16 + VAD for archive/batch work.
+    Word timestamps are skipped in batch mode (not needed for segment-level search).
+    Pass batch_size=0 to fall back to sequential WhisperModel with word timestamps.
+    """
+    from faster_whisper import WhisperModel, BatchedInferencePipeline

    audio_path = Path(audio_path)
+    use_batched = batch_size > 0
+
    console.print(f"[bold]Transcribing:[/bold] {audio_path.name}")
-    console.print(f"[dim]Model: {model_size}, Device: {device}[/dim]")
+    console.print(
+        f"[dim]Model: {model_size} | "
+        f"{'batched x' + str(batch_size) + ' int8_float16' if use_batched else 'sequential float16'} | "
+        f"Device: {device}[/dim]"
+    )

+    if use_batched:
+        base_model = WhisperModel(model_size, device=device, compute_type="int8_float16")
+        model = BatchedInferencePipeline(model=base_model)
+        segments_raw, info = model.transcribe(
+            str(audio_path),
+            language=language,
+            batch_size=batch_size,
+        )
+    else:
        model = WhisperModel(model_size, device=device, compute_type="float16")
-
        segments_raw, info = model.transcribe(
            str(audio_path),
            language=language,
            word_timestamps=True,
            vad_filter=True,
-        vad_parameters=dict(
-            min_silence_duration_ms=500,
-            speech_pad_ms=200,
-        ),
+            vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
        )

-    console.print(f"[dim]Detected language: {info.language} "
-                  f"(probability: {info.language_probability:.2f})[/dim]")
-    console.print(f"[dim]Duration: {info.duration:.1f}s "
-                  f"({info.duration / 60:.1f} min)[/dim]")
+    console.print(f"[dim]Duration: {info.duration:.1f}s ({info.duration / 60:.1f} min)[/dim]")

    segments = []
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        BarColumn(),
-        TextColumn("{task.completed} segments"),
-        TimeElapsedColumn(),
-        console=console,
-    ) as progress:
-        task = progress.add_task("Transcribing...", total=None)
-
    for i, seg in enumerate(segments_raw):
+        words = []
+        if not use_batched:
            words = [
-                TranscriptWord(
-                    word=w.word,
-                    start=w.start,
-                    end=w.end,
-                    probability=w.probability,
-                )
+                TranscriptWord(word=w.word, start=w.start,
+                               end=w.end, probability=w.probability)
                for w in (seg.words or [])
            ]
        segments.append(TranscriptSegment(
-                id=i,
-                text=seg.text,
-                start=seg.start,
-                end=seg.end,
-                words=words,
+            id=i, text=seg.text, start=seg.start, end=seg.end, words=words,
        ))
-            progress.update(task, completed=i + 1)
+        if i % 50 == 0:
+            console.print(f"[dim]  {i} segments... ({seg.end:.0f}s)[/dim]")

    console.print(f"[green]Transcription complete: {len(segments)} segments[/green]")

--- a/projects/radio-show/audio-processor/src/voice_profiler.py
+++ b/projects/radio-show/audio-processor/src/voice_profiler.py
@@ -319,8 +319,11 @@ class VoiceProfiler:
                        best_match = name

                if best_score >= threshold:
-                    if best_match and self.profiles[best_match].role == "host":
+                    role = self.profiles[best_match].role if best_match else "unknown"
+                    if role == "host":
                        label = f"Host: {best_match}"
+                    elif role == "cohost":
+                        label = f"Cohost: {best_match}"
                    else:
                        label = best_match
                else:
--- a/projects/radio-show/audio-processor/voice-profiles/profiles.json
+++ b/projects/radio-show/audio-processor/voice-profiles/profiles.json
@@ -22,5 +22,13 @@
      "2018-s10e17.mp3",
      "2018-s10e21.mp3"
    ]
+  },
+  "Tom": {
+    "role": "cohost",
+    "num_samples": 44,
+    "source_episodes": [
+      "2014-s6e19.mp3",
+      "2016-s8e43.mp3"
+    ]
  }
 }
--- a/projects/radio-show/audio-processor/voice-profiles/tom/composite.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/composite.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0000.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0000.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0001.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0001.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0002.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0002.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0003.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0003.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0004.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0004.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0005.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0005.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0006.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0006.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0007.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0007.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0008.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0008.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0009.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0009.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0010.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0010.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0011.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0011.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0012.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0012.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0013.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0013.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0014.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0014.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0015.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0015.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0016.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0016.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0017.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0017.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0018.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0018.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0019.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0019.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0020.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0020.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0021.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0021.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0022.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0022.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0023.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0023.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0024.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0024.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0025.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0025.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0026.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0026.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0027.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0027.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0028.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0028.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0029.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0029.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0030.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0030.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0031.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0031.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0032.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0032.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0033.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0033.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0034.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0034.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0035.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0035.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0036.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0036.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0037.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0037.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0038.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0038.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0039.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0039.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0040.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0040.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0041.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0041.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0042.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0042.npy
--- a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0043.npy
+++ b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0043.npy
--- a/projects/radio-show/session-logs/2026-04-27-qa-extraction-cohost-indexing.md
+++ b/projects/radio-show/session-logs/2026-04-27-qa-extraction-cohost-indexing.md
@@ -0,0 +1,251 @@
+# Session Log: Q&A Extraction — Co-Host Profile + Archive Indexing
+**Date:** 2026-04-27
+**Project:** Radio Show Archive Mining — Computer Guru Show
+
+---
+
+## User
+- **User:** Mike Swanson (mike)
+- **Machine:** DESKTOP-0O8A1RL
+- **Role:** admin
+
+---
+
+## Session Summary
+
+The session resumed after a benchmark run that showed a significant speedup in Whisper transcription: 63.8x real-time with batched inference and int8_float16 compute. The focus then shifted to evaluating Q&A extraction quality across the six test episodes, which revealed a critical false-positive problem: co-host Tom was being labeled CALLER because his voice did not meet the host voice-similarity threshold and no profile existed for him.
+
+A co-host voice profile for Tom was constructed using 44 embeddings from two specific episodes (2014-s6e19 and 2016-s8e43), producing a cosine similarity of 0.698 against Mike — well below Mike's 0.85 threshold, giving clean separation. Code was updated in `voice_profiler.py` and `diarizer.py` to correctly emit "Cohost: Tom" labels and map them to a new "CO-HOST" speaker tag. Re-diarizing the two co-host-era episodes dramatically cleaned up Q&A results: 2016-s8e43 went from 12 pairs (11 of them false positives) to 2 real WiFi caller pairs.
+
+Several bugs in `qa_extractor.py` were fixed: overlap resolution for sliding-window diarization boundaries, CALLER-preference threshold for long batch transcript segments, and a turn-based caller-intro lookback to replace an ineffective 120s time window. Phone-greeting detection and new promo signatures were added. The final Q&A count landed at 10 pairs across 6 episodes, with 2014 correctly yielding 0 (gaming co-host episode with no actual callers).
+
+`archive.db` was created with the ArchiveIndex schema (episodes, segments, segments_fts, qa_pairs, qa_fts). All 6 test episodes were indexed: 762 segments, 10 Q&A pairs. FTS5 search verified working for "router", "Windows 10", "Internet Explorer", "antivirus", and "connect" queries.
+
+---
+
+## Key Decisions
+
+- **Co-host threshold uses same 0.85 bar as host**: Tom scores 0.698 vs Mike. Any voice >= 0.85 against Tom's composite gets labeled CO-HOST. Keeps the same single threshold for all profiles rather than per-profile thresholds.
+- **Turn-based lookback for caller-intro (2 HOST turns, not 120s)**: Long HOST monologue blocks (8-10 min) in big show segments meant the time-based lookback missed the caller introduction. Looking back over the previous 2 HOST turns always catches it, regardless of block length.
+- **CALLER-preference at 4s minimum overlap**: Batch transcription produces ~26s segments; diarization CALLER windows are ~10s. Pure majority-vote always gave HOST. 4s minimum CALLER coverage labels the segment CALLER without being overly aggressive for co-host episodes.
+- **Midpoint boundary resolution at load time**: Rather than re-diarizing everything, the sliding-window overlap is resolved in `load_diarized_transcript()` so it applies retroactively to all saved diarization files without touching the JSON.
+- **751-1041 added as promo signal**: Earlier Tucson show number (vs 790-2040 in later seasons). Weighted 1 (needs a second semi-generic signal to filter).
+- **Tom's windows sourced from first 60 min of co-host episodes**: Real callers don't call in during the first hour of a 2-hour show (only exceptions: very end of show). First-hour CALLER windows are safely all Tom.
+
+---
+
+## Problems Encountered
+
+- **2016-s8e43 had 12 Q&A pairs, 11 false positives**: Root cause was Tom (co-host) labeled CALLER throughout. Fixed by building Tom's voice profile and re-diarizing.
+- **2014-s6e19 had 2 Q&A pairs from gaming discussion**: Same co-host issue. After re-diarization: 0 pairs (correct — no actual callers in that gaming special).
+- **2012-03-10 yielded 0 segments labeled CALLER**: Midpoint-based speaker assignment kept landing on HOST turns where diarization turns overlap (e.g. with HOST 0-20s and CALLER 15-30s, a segment midpoint of 15.1s falls in HOST). Fixed by overlap-preference assignment with a 4s CALLER minimum.
+- **Real WiFi caller (2016, ~4794s) was missing after first fix attempt**: Aggressive time-based lookback (120s) combined with short CALLER turns from sliding-window diarization caused the caller question to land in a HOST segment. Fixed by turn-based lookback + co-host profile (eliminated Tom noise, letting real caller windows survive).
+- **2012-Jun pair at 1325s was a promo**: "The Computer Guru. We'll get your problem solved. Call 751-1041 today" passed promo filter. Fixed by adding 751-1041 and "we'll get your problem solved" as promo signatures.
+
+---
+
+## Files Created / Modified
+
+### New files
+```
+projects/radio-show/audio-processor/build_cohost_profile.py
+projects/radio-show/audio-processor/index_test_episodes.py
+projects/radio-show/audio-processor/archive.db
+projects/radio-show/audio-processor/voice-profiles/tom/
+projects/radio-show/audio-processor/voice-profiles/profiles.json  (updated: Tom added)
+projects/radio-show/session-logs/2026-04-27-qa-extraction-cohost-indexing.md  (this file)
+```
+
+### Modified
+```
+src/voice_profiler.py       — emit "Cohost: <name>" label for cohost role
+src/diarizer.py             — map "Cohost:" prefix to "CO-HOST" speaker
+src/qa_extractor.py         — overlap resolution, CALLER-preference, turn-based
+                              caller-intro lookback, _preceded_by_caller_intro(),
+                              _PHONE_GREETING, 751-1041 + promo sig additions
+test-data/transcripts/2014-s6e19/diarization.json   (re-diarized with Tom profile)
+test-data/transcripts/2016-s8e43/diarization.json   (re-diarized with Tom profile)
+```
+
+---
+
+## Benchmark Results (from previous run — baseline for BEAST comparison)
+
+**Machine:** DESKTOP-0O8A1RL — NVIDIA GeForce RTX 5070 Ti Laptop GPU
+
+| Episode | Audio | Wall (diarize) | RTF |
+|---------|-------|----------------|-----|
+| 2011-03-12-hr1 | 2509s | 15.1s | 166.1x |
+| 2012-03-10-hr1 | 2634s | 12.2s | 215.5x |
+| 2012-06-09-hr1 | 2648s | 12.2s | 216.8x |
+| 2014-s6e19 | 2914s | 13.4s | 216.9x |
+| 2016-s8e43 | 5326s | 24.2s | 219.6x |
+| 2017-s9e30 | 5343s | 24.7s | 216.4x |
+| **TOTAL** | **21374s** | **101.9s** | **209.7x** |
+
+Transcription (batched Whisper large-v3): 63.8x realtime  
+Diarization: 209.7x realtime  
+vs DESKTOP-0O8A1RL baseline (149.5x): **+60.2x (+40.3%)**
+
+---
+
+## Archive DB State
+
+**Path:** `projects/radio-show/audio-processor/archive.db`
+
+```
+Episodes : 6
+Segments : 762
+Q&A pairs: 10
+```
+
+**Q&A pairs by episode:**
+| Episode | Pairs | Notes |
+|---------|-------|-------|
+| 2011-03-12-hr1 | 3 | IE lockout call, cloud computing, ghost hunting caller |
+| 2012-03-10-hr1 | 1 | iPad 3 discussion |
+| 2012-06-09-hr1 | 1 | Windows repair feature call |
+| 2014-s6e19 | 0 | Gaming co-host special — no actual callers |
+| 2016-s8e43 | 2 | WiFi connectivity caller (2 turns of same call) |
+| 2017-s9e30 | 3 | Software control, Cat5 cabling (Charlie), WiFi ports |
+
+---
+
+## Voice Profiles State
+
+**Path:** `projects/radio-show/audio-processor/voice-profiles/`
+
+| Name | Role | Embeddings | Source Episodes |
+|------|------|-----------|-----------------|
+| Mike Swanson | host | 180 | 9 episodes (2010-2018) |
+| Tom | cohost | 44 | 2014-s6e19, 2016-s8e43 |
+
+Tom vs Mike cosine similarity: **0.698** (well-separated at 0.85 threshold)
+
+**Tom's source windows used:**
+- 2014-s6e19: 195-260s, 320-425s, 600-650s, 675-710s
+- 2016-s8e43: 100-115s, 135-160s, 270-295s, 575-605s, 1185-1235s, 1790-1870s, 2020-2055s
+
+---
+
+## Co-Host Era Notes
+
+Tom was the regular in-studio co-host/board-op roughly 2013-2016. His voice is in episodes from at least 2014 through 2016 (confirmed from test set). The 2011 and 2012 episodes are pure call-in format with no co-host.
+
+If there are occasional guest co-hosts or fill-in hosts in other years, they would still be labeled CALLER until profiled. These would be rare and would likely not form question patterns that survive the caller-intro gate.
+
+---
+
+## Pending Tasks for BEAST (GURU-BEAST-ROG)
+
+### 1. Run benchmark.py to establish RTX 4090 baseline
+
+```bash
+cd D:/claudetools/projects/radio-show/audio-processor
+.venv/Scripts/python benchmark.py 2>&1 | tee bench-4090.txt
+```
+
+BENCH_SETUP.md has all setup steps. The voice profiles are in `voice-profiles/` (already copied or available via Tailscale/robocopy from DESKTOP-0O8A1RL). Test episodes go in `test-data/episodes/`.
+
+Expected: diarization RTF should be ~250-300x on RTX 4090 (vs 209.7x on laptop 5070 Ti). Transcription should be ~70-80x.
+
+Update `benchmark.py` line 27 after measuring:
+```python
+BASELINE_RTF  = 209.7  # current laptop 5070 Ti baseline
+```
+
+### 2. Download full archive from IX server (172.16.3.10)
+
+Use paramiko (SSH with key agent disabled):
+```python
+import paramiko
+ssh = paramiko.SSHClient()
+ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ssh.connect("172.16.3.10", username="gurushow", password="<from vault>",
+            look_for_keys=False, allow_agent=False)
+```
+
+Archive path: `/home/gurushow/public_html/archive/Radio/`
+Episode count: 579 MP3s across 2010-2018 (no 2013 season)
+Approximate total size: ~30-40 GB
+
+Download script skeleton in prior session log: `2026-04-27-diarization-pipeline.md`
+
+**Tailscale required** — IX server is at 172.16.3.10, requires VPN.
+
+### 3. Full archive processing
+
+Once episodes are downloaded:
+
+```bash
+# Transcribe + diarize all episodes
+cd D:/claudetools/projects/radio-show/audio-processor
+.venv/Scripts/python diarize_training.py  # or a new batch_process_all.py
+
+# Index everything into archive.db
+.venv/Scripts/python index_test_episodes.py  # modify to point at full episodes dir
+```
+
+The pipeline is idempotent — `add_segments()` skips episodes already indexed.
+
+### 4. Verify co-host era episodes
+
+2013-2016 era episodes should now correctly separate Tom (CO-HOST) from actual callers. Spot-check a few 2015 episodes after processing to confirm Tom's profile generalizes well.
+
+If any 2015/2016 episodes show too many CALLER turns that are clearly Tom (e.g. because his voice changed slightly over the years), re-run `build_cohost_profile.py` with windows from that episode added to the `TOM_WINDOWS` dict.
+
+---
+
+## Technical Reference
+
+### Key thresholds
+
+```python
+host_match_threshold = 0.85    # WavLM cosine similarity — applied to ALL profiles
+CALLER_MIN_S = 4.0             # min CALLER coverage in transcript segment to label CALLER
+PROMO_SCORE_THRESHOLD = 2      # weighted promo signature score
+MIN_QUESTION_DURATION = 5.0    # seconds
+MIN_ANSWER_DURATION = 15.0     # seconds
+MAX_GAP_BETWEEN_QA = 30.0      # seconds
+```
+
+### Diarization sliding window
+
+```python
+window_s = 10.0   # 10s embedding windows
+hop_s = 5.0       # 5s hop → overlapping boundaries (resolved at load time)
+```
+
+### Transcription (batch mode)
+
+```python
+model_size = "large-v3"
+compute_type = "int8_float16"
+batch_size = 16
+# No word timestamps in batch mode (not needed for search/diarization)
+```
+
+### DB search examples
+
+```python
+from src.indexer import ArchiveIndex
+from pathlib import Path
+
+with ArchiveIndex(Path("archive.db")) as idx:
+    # Segment search
+    results = idx.search("router", limit=20)
+    results = idx.search("Windows 10", speaker_filter="HOST", limit=10)
+
+    # Q&A search
+    qa = idx.search_qa("antivirus", limit=10)
+    qa = idx.search_qa("wifi connect", limit=10)
+```
+
+### Archive server
+
+```
+Host: 172.16.3.10 (requires Tailscale)
+User: gurushow
+Archive root: /home/gurushow/public_html/archive/Radio/
+SSH: paramiko with look_for_keys=False, allow_agent=False
+```