radio show: co-host voice profile, Q&A extraction fixes, archive index

- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike)
- diarizer.py: add CO-HOST speaker label for cohost-role profiles
- voice_profiler.py: emit "Cohost: <name>" label for cohost role
- qa_extractor.py: overlap resolution at load time (midpoint boundary split),
  4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns),
  _preceded_by_caller_intro() helper, _PHONE_GREETING pattern,
  751-1041 + "we'll get your problem solved" promo signatures
- benchmark.py: use src.transcriber.transcribe with batch_size=16
- add index_test_episodes.py and build_cohost_profile.py scripts
- add .gitignore (exclude episodes, transcripts, *.db, .venv)
- session log: 2026-04-27-qa-extraction-cohost-indexing.md

Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs.
archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-27 14:41:04 -07:00
parent 79abef9dc9
commit e9ac607500
55 changed files with 649 additions and 100 deletions

View File

@@ -53,10 +53,12 @@ _PROMO_SIGS: list[tuple[re.Pattern, int]] = [
(re.compile(r"\bcomputer running slow\b", re.I), 1),
(re.compile(r"\bafter these messages\b", re.I), 1),
(re.compile(r"\b790.?2040\b", re.I), 1),
(re.compile(r"\b751.?1041\b", re.I), 1),
(re.compile(r"\bgurushow\.com\b", re.I), 1),
(re.compile(r"\bcall in now\b", re.I), 1),
(re.compile(r"\bcomputer troubles\?", re.I), 1),
(re.compile(r"\bhardware installation\b", re.I), 1),
(re.compile(r"we.?ll get your problem solved", re.I), 1),
]
@@ -127,10 +129,19 @@ def extract_qa_pairs(diarized_segments: list[dict]) -> list[QAPair]:
if _is_promo_or_bumper(turn["text"]):
i += 1
continue
# Skip the opening 90s — real callers never call before the show starts
if turn["start"] < 90:
i += 1
continue
q_duration = turn["end"] - turn["start"]
if q_duration < MIN_QUESTION_DURATION:
i += 1
continue
# Require caller-intro context: host must have introduced the call, OR
# the caller opens with a phone greeting ("hello", "hi", "hey")
if not _preceded_by_caller_intro(turns, i) and not _PHONE_GREETING.match(turn["text"].strip()):
i += 1
continue
# Look ahead for HOST answer turn(s)
j = i + 1
@@ -329,25 +340,71 @@ def load_diarized_transcript(transcript_path: Path,
with open(diarization_path) as f:
diarization = json.load(f)
turns = diarization.get("turns", [])
raw_turns = diarization.get("turns", [])
def speaker_at(t: float) -> str:
"""Find which diarization turn covers time t."""
# Resolve overlapping boundaries left by the sliding-window diarizer:
# place each transition at the midpoint of the overlap region.
resolved: list[dict] = []
for turn in sorted(raw_turns, key=lambda t: t["start"]):
if not resolved:
resolved.append(dict(turn))
continue
prev = resolved[-1]
if turn["start"] < prev["end"]:
mid = (turn["start"] + prev["end"]) / 2
prev["end"] = mid
resolved.append({**turn, "start": mid})
else:
resolved.append(dict(turn))
turns = resolved
# Minimum CALLER coverage to label a transcript segment as CALLER.
# Batch transcription produces ~25s segments; caller windows are 10s.
# Require 4s of CALLER overlap so brief HOST-edge segments aren't over-claimed.
_CALLER_MIN_S = 4.0
def speaker_for_segment(seg_start: float, seg_end: float) -> str:
caller_cov = 0.0
coverage: dict[str, float] = {}
for turn in turns:
if turn["start"] <= t <= turn["end"]:
return turn["speaker"]
return "UNKNOWN"
overlap = min(seg_end, turn["end"]) - max(seg_start, turn["start"])
if overlap <= 0:
continue
coverage[turn["speaker"]] = coverage.get(turn["speaker"], 0) + overlap
if turn["speaker"] == "CALLER":
caller_cov += overlap
if not coverage:
return "UNKNOWN"
if caller_cov >= _CALLER_MIN_S:
return "CALLER"
return max(coverage, key=coverage.__getitem__)
return [
{"start": s["start"], "end": s["end"],
"text": s["text"],
"speaker": speaker_at((s["start"] + s["end"]) / 2)}
"speaker": speaker_for_segment(s["start"], s["end"])}
for s in segments
]
# ── Helpers ────────────────────────────────────────────────────────────────
# Case-insensitive pattern for text that opens with a phone-style greeting:
# "hello", "hi", "hey", or "good morning/afternoon/evening". The pattern is
# anchored with ^ and ends on a word boundary, so "history" does not count
# as "hi".
_PHONE_GREETING = re.compile(
    r"^(hello|hi|hey|good (morning|afternoon|evening))\b",
    re.I,
)
def _preceded_by_caller_intro(turns: list[dict], idx: int, max_host_turns: int = 2) -> bool:
    """Tell whether a nearby earlier HOST turn introduced a caller.

    Walks backwards from the turn just before ``idx``. Turns whose
    ``"speaker"`` is not ``"HOST"`` are passed over without counting.
    Each HOST turn's ``"text"`` is searched with ``_CALLER_INTRO``; the
    first hit returns True. The walk gives up once ``max_host_turns``
    HOST turns have been examined without a hit, or when the start of
    ``turns`` is reached.
    """
    hosts_seen = 0
    pos = idx - 1
    while pos >= 0:
        candidate = turns[pos]
        pos -= 1
        # Only HOST turns are inspected and counted toward the lookback limit.
        if candidate["speaker"] != "HOST":
            continue
        if _CALLER_INTRO.search(candidate["text"]):
            return True
        hosts_seen += 1
        if hosts_seen >= max_host_turns:
            return False
    return False
def _looks_like_question(text: str) -> bool:
    """Return True when ``QUESTION_PATTERN`` finds a match anywhere in ``text``."""
    found = QUESTION_PATTERN.search(text)
    return True if found else False