radio show: co-host voice profile, Q&A extraction fixes, archive index

- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike)
- diarizer.py: add CO-HOST speaker label for cohost-role profiles
- voice_profiler.py: emit "Cohost: <name>" label for cohost role
- qa_extractor.py: overlap resolution at load time (midpoint boundary split), 4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns), _preceded_by_caller_intro() helper, _PHONE_GREETING pattern, 751-1041 + "we'll get your problem solved" promo signatures
- benchmark.py: use src.transcriber.transcribe with batch_size=16
- add index_test_episodes.py and build_cohost_profile.py scripts
- add .gitignore (exclude episodes, transcripts, *.db, .venv)
- session log: 2026-04-27-qa-extraction-cohost-indexing.md

Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs.
archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 14:41:04 -07:00
parent 79abef9dc9
commit e9ac607500
55 changed files with 649 additions and 100 deletions
--- a/projects/radio-show/audio-processor/src/qa_extractor.py
+++ b/projects/radio-show/audio-processor/src/qa_extractor.py
@@ -53,10 +53,12 @@ _PROMO_SIGS: list[tuple[re.Pattern, int]] = [
    (re.compile(r"\bcomputer running slow\b",        re.I), 1),
    (re.compile(r"\bafter these messages\b",         re.I), 1),
    (re.compile(r"\b790.?2040\b",                   re.I), 1),
+    (re.compile(r"\b751.?1041\b",                   re.I), 1),
    (re.compile(r"\bgurushow\.com\b",               re.I), 1),
    (re.compile(r"\bcall in now\b",                  re.I), 1),
    (re.compile(r"\bcomputer troubles\?",            re.I), 1),
    (re.compile(r"\bhardware installation\b",        re.I), 1),
+    (re.compile(r"we.?ll get your problem solved",  re.I), 1),
 ]


@@ -127,10 +129,19 @@ def extract_qa_pairs(diarized_segments: list[dict]) -> list[QAPair]:
            if _is_promo_or_bumper(turn["text"]):
                i += 1
                continue
+            # Skip the opening 90s — real callers never call before the show starts
+            if turn["start"] < 90:
+                i += 1
+                continue
            q_duration = turn["end"] - turn["start"]
            if q_duration < MIN_QUESTION_DURATION:
                i += 1
                continue
+            # Require caller-intro context: host must have introduced the call, OR the caller
+            # opens with a phone greeting ("hello", "hi", "hey", "good morning/afternoon/evening")
+            if not _preceded_by_caller_intro(turns, i) and not _PHONE_GREETING.match(turn["text"].strip()):
+                i += 1
+                continue

            # Look ahead for HOST answer turn(s)
            j = i + 1
@@ -329,25 +340,71 @@ def load_diarized_transcript(transcript_path: Path,
    with open(diarization_path) as f:
        diarization = json.load(f)

-    turns = diarization.get("turns", [])
+    raw_turns = diarization.get("turns", [])

-    def speaker_at(t: float) -> str:
-        """Find which diarization turn covers time t."""
+    # Resolve overlapping boundaries left by the sliding-window diarizer:
+    # place each transition at the midpoint of the overlap region.
+    resolved: list[dict] = []
+    for turn in sorted(raw_turns, key=lambda t: t["start"]):
+        if not resolved:
+            resolved.append(dict(turn))
+            continue
+        prev = resolved[-1]
+        if turn["start"] < prev["end"]:
+            mid = (turn["start"] + prev["end"]) / 2
+            prev["end"] = mid
+            resolved.append({**turn, "start": mid})
+        else:
+            resolved.append(dict(turn))
+    turns = resolved
+
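+    # CALLER coverage at which a transcript segment is labeled CALLER outright, even when another
+    # speaker covers more of it (below this threshold the speaker with the most coverage wins).
+    # Batch transcription produces ~25s segments and caller windows are 10s, so require 4s of CALLER overlap so brief HOST-edge segments aren't over-claimed.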
+    _CALLER_MIN_S = 4.0
+
+    def speaker_for_segment(seg_start: float, seg_end: float) -> str:
+        caller_cov = 0.0
+        coverage: dict[str, float] = {}
        for turn in turns:
-            if turn["start"] <= t <= turn["end"]:
-                return turn["speaker"]
-        return "UNKNOWN"
+            overlap = min(seg_end, turn["end"]) - max(seg_start, turn["start"])
+            if overlap <= 0:
+                continue
+            coverage[turn["speaker"]] = coverage.get(turn["speaker"], 0) + overlap
+            if turn["speaker"] == "CALLER":
+                caller_cov += overlap
+        if not coverage:
+            return "UNKNOWN"
+        if caller_cov >= _CALLER_MIN_S:
+            return "CALLER"
+        return max(coverage, key=coverage.__getitem__)

    return [
        {"start": s["start"], "end": s["end"],
         "text": s["text"],
-         "speaker": speaker_at((s["start"] + s["end"]) / 2)}
+         "speaker": speaker_for_segment(s["start"], s["end"])}
        for s in segments
    ]


 # ── Helpers ────────────────────────────────────────────────────────────────

+_PHONE_GREETING = re.compile(r"^(hello|hi|hey|good (morning|afternoon|evening))\b", re.IGNORECASE)
+
+
+def _preceded_by_caller_intro(turns: list[dict], idx: int, max_host_turns: int = 2) -> bool:
+    """Return True if a preceding HOST turn (within max_host_turns HOST turns) contains a caller-intro phrase."""
+    host_count = 0
+    for j in range(idx - 1, -1, -1):
+        if turns[j]["speaker"] == "HOST":
+            if _CALLER_INTRO.search(turns[j]["text"]):
+                return True
+            host_count += 1
+            if host_count >= max_host_turns:
+                break
+    return False
+
+
 def _looks_like_question(text: str) -> bool:
    return bool(QUESTION_PATTERN.search(text))