From 1b574caba42097bd3dad90c86b8210824372ad0d Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Mon, 27 Apr 2026 16:48:16 -0700 Subject: [PATCH] radio: transcript-driven speaker name resolution (oracle) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New module src/speaker_oracle.py extracts speaker introductions from transcripts ("let's talk to William", "we have Clay from the Nerd Junkies", "in Tara's place, we have Clay", "thanks for the call ") and binds them to non-HOST diarization turns. Pure post-pass on diarization JSONs, no audio processing — corrects audio-only cosine errors using Mike's deterministic on-air announcements. Algorithm: - Extract intros: regex patterns for caller pickups, guest intros, fill-in announcements, caller closes. Case-strict (rejects mid-sentence lowercase matches), with a blacklist of common false-positive words. Deduplicates same-name intros within 5s. - Resolve speakers: for each non-HOST turn, find the LATEST opening intro at or before turn.start (with 8s forward tolerance for boundary slop). Later intros implicitly close earlier callers, so the most recent intro wins. No artificial lookback limit (callers can talk for 10+ min). - Falls back to caller_close patterns within 30s after a turn ends. Validation on 9-episode test set: 2018-s10e18: Christopher 190s correctly named (was mislabeled "Tara") 2012-06-09 : Kay 160s correctly named (was mislabeled "Tara") 2015-s7e19 : Clay 45s as fillin for Tara, William 40s as caller 2016-s8e43 : Charles 630s, Bruce 210s, John 205s — most callers named 2017-s9e30 : Denise 295s, Tom 115s, Elaine 85s, Jeff 10s Many other callers across all episodes correctly named. Remaining unnamed CO-HOST/CALLER (~5-10% of non-HOST time) are real co-host banter or callers without explicit Mike-introductions. benchmark.py: adds Phase 2.5 "Name Resolution" between diarization and Q&A extraction. Prints named-speaker breakdown per episode. Doesn't modify diarization JSONs (resolution is computed on demand). Next step: feed named turns into qa_extractor so Q&A pairs get caller name attached for searchability. Also: bootstrap recurring-speaker profiles (Tara, Tony, Rob, Randall, producers) by accumulating intro-tagged windows across the full archive once download completes. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../radio-show/audio-processor/benchmark.py | 32 +++ .../audio-processor/src/speaker_oracle.py | 268 ++++++++++++++++++ 2 files changed, 300 insertions(+) create mode 100644 projects/radio-show/audio-processor/src/speaker_oracle.py diff --git a/projects/radio-show/audio-processor/benchmark.py b/projects/radio-show/audio-processor/benchmark.py index 4ad30c3..2d8b06e 100644 --- a/projects/radio-show/audio-processor/benchmark.py +++ b/projects/radio-show/audio-processor/benchmark.py @@ -138,6 +138,38 @@ for ep, transcript_path, audio_dur, _ in trans_results: total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0 +# ── Phase 2.5: Speaker name resolution from transcript intros ─────────────── + +console.print("\n[bold]Phase 2.5: Name Resolution[/bold]") + +from src.speaker_oracle import resolve_from_files, named_speaker_summary +import json as _json + +for ep, transcript_path, audio_dur, _ in trans_results: + trans_ep_dir = TRANS_DIR / ep.stem + diarization_path = trans_ep_dir / "diarization.json" + if not diarization_path.exists(): + continue + + with open(transcript_path) as f: + td = _json.load(f) + duration = td.get("duration", audio_dur or 0) + + named = resolve_from_files(transcript_path, diarization_path) + summary = named_speaker_summary(named, duration) + + # Show only resolved names (caller/guest/fillin) — drop HOST/BUMPER/UNKNOWN + resolved = {k: v for k, v in summary.items() + if k.startswith(("caller:", "guest:", "fillin:"))} + unresolved_caller = summary.get("CALLER", 0) + summary.get("CO-HOST", 0) + + if resolved or unresolved_caller: + names_str = ", ".join(f"{k.split(': ')[1]} ({v:.0f}s)" for k, v in resolved.items()) + console.print( + f" {ep.stem}: {len(resolved)} named ({names_str or 'none'})" + + (f" [unresolved: {unresolved_caller:.0f}s]" if unresolved_caller else "") + ) + # ── Phase 3: Q&A extraction ──────────────────────────────────────────────── console.print("\n[bold]Phase 3: Q&A Extraction[/bold]") diff --git a/projects/radio-show/audio-processor/src/speaker_oracle.py b/projects/radio-show/audio-processor/src/speaker_oracle.py new file mode 100644 index 0000000..e88ab69 --- /dev/null +++ b/projects/radio-show/audio-processor/src/speaker_oracle.py @@ -0,0 +1,268 @@ +""" +Transcript-driven speaker name resolution. + +Mike Swanson almost always announces who's about to speak — caller pickups +("let's talk to William"), guest intros ("we have Clay from the Nerd Junkies"), +co-host substitutions ("in Tara's place, we have Clay"), thank-yous on +caller close. These are deterministic ground-truth signals the audio-only +WavLM diarizer cannot use. + +This module: + 1. Extracts speaker introductions from a transcript. + 2. Binds each intro to the next non-HOST diarization turn. + 3. Returns named speaker turns, overriding incorrect cosine matches. + +Pipeline order: run AFTER diarization, the resolved names override the +HOST/CO-HOST/CALLER/BUMPER labels with concrete people. +""" +from __future__ import annotations +import re +import json +from dataclasses import dataclass, field +from pathlib import Path + +# ── Introduction patterns ──────────────────────────────────────────────────── +# Each: (regex, role_hint, name_group_index, optional affiliation_group) +# Patterns ordered most-specific first. + +_INTRO_PATTERNS: list[tuple[re.Pattern, str, int, int | None]] = [ + # "in Tara's place, we have Clay" — fill-in for absent co-host + (re.compile( + r"in\s+([A-Z][a-z]+).?s\s+place,?\s+we\s+have\s+([A-Z][a-z]+)", + re.I), "fillin", 2, None), + + # "we have from " — guest intro + (re.compile( + r"\bwe\s+have\s+([A-Z][a-z]+)\s+(?:from|with)\s+(?:the\s+)?([A-Z][\w\s]{2,30}?)(?:\.|,|;|\s+(?:on|here|joining|today|tonight|with))", + re.I | re.MULTILINE), "guest", 1, 2), + + # "let's talk to " / "let's go ahead and talk to " — caller pickup + (re.compile( + r"\blet.s\s+(?:go\s+ahead\s+and\s+)?(?:talk|go)\s+to\s+([A-Z][a-z]+)", + re.I), "caller", 1, None), + + # "Hello, . How are you?" — caller pickup + (re.compile( + r"\bhello,?\s+([A-Z][a-z]+)\.?\s+(?:how\s+are\s+you|thanks\s+for|are\s+you\s+there|welcome)", + re.I), "caller", 1, None), + + # "Hi , welcome/how are you" — caller pickup variant + (re.compile( + r"\bhi\s+([A-Z][a-z]+),?\s+(?:welcome|how\s+are\s+you|thanks\s+for)", + re.I), "caller", 1, None), + + # "thanks (so much) for the call, " / "thanks for calling, " — caller close + # Require explicit "for the call/calling" — otherwise greedy matching captures + # any capitalized word after "thanks" (Continue, And, For, You, Man, etc.) + (re.compile( + r"\bthanks?(?:\s+so\s+much)?\s+for\s+(?:the\s+)?(?:call(?:ing)?|patience),?\s+([A-Z][a-z]+)\b", + re.I), "caller_close", 1, None), + + # "joining us today/tonight (is|we have) " + (re.compile( + r"\bjoining\s+(?:us|me)\s+(?:today|tonight|this\s+morning)?\s*(?:is|we\s+have)\s+([A-Z][a-z]+)", + re.I), "guest", 1, None), +] + +# Words that look like names but aren't real people we'd track. +# Includes show callouts, generic words, host self-references, common +# capitalized non-name words that survive mid-sentence in transcripts. +_NAME_BLACKLIST = { + "mike", "swanson", "computer", "guru", "windows", "office", "google", + "patreon", "tucson", "arizona", "facebook", "twitter", "youtube", + "what", "how", "now", "well", "okay", "right", "sure", "yeah", "yes", + "no", "thanks", "today", "tonight", "monday", "tuesday", "wednesday", + "thursday", "friday", "saturday", "sunday", "january", "february", + "march", "april", "may", "june", "july", "august", "september", + "october", "november", "december", "internet", "service", "feature", + "back", "show", "phone", "voice", "call", "kvoi", "kby", "ces", + "ipad", "iphone", "android", "samsung", "amazon", "intel", "amd", + "nvidia", "the", "and", "there", "but", "or", "so", "well", + "france", "germany", "japan", "china", "russia", "europe", "asia", + "africa", "elon", "musk", "trump", "biden", "obama", # public figures + "you", "for", "continue", "very", "buddy", "guys", "man", "god", + "everyone", "everybody", "anyone", +} + + +@dataclass +class SpeakerIntro: + name: str + intro_time: float # transcript segment end time — speaker turn likely starts after this + role_hint: str # "caller", "guest", "fillin", "caller_close" + affiliation: str | None = None + fillin_for: str | None = None # for "in X's place, we have Y" — X + source_text: str = "" + + +def _is_blacklisted(name: str) -> bool: + return name.lower() in _NAME_BLACKLIST or len(name) < 3 + + +def extract_intros(transcript_segments: list[dict]) -> list[SpeakerIntro]: + """Walk transcript segments and extract speaker introduction events. + + Deduplicates intros within 5 seconds with the same (name, role_hint). + """ + raw: list[SpeakerIntro] = [] + for seg in transcript_segments: + text = seg.get("text", "") + end = seg.get("end") or seg.get("start", 0) + for pat, role, name_idx, affil_idx in _INTRO_PATTERNS: + for m in pat.finditer(text): + captured = m.group(name_idx) + # Reject names that aren't capitalized in the source text — + # eliminates mid-sentence lowercase matches like "and", "there" + # that re.IGNORECASE picks up. + if not captured or not captured[0].isupper(): + continue + name = captured[0].upper() + captured[1:].lower() + if _is_blacklisted(name): + continue + affiliation = None + fillin_for = None + if role == "guest" and affil_idx: + try: + affiliation = m.group(affil_idx).strip().rstrip(".,;").title() + except (IndexError, AttributeError): + affiliation = None + if role == "fillin": + fillin_for = m.group(1).capitalize() + raw.append(SpeakerIntro( + name=name, + intro_time=float(end), + role_hint=role, + affiliation=affiliation, + fillin_for=fillin_for, + source_text=text.strip(), + )) + break # one intro per segment is plenty + + # Deduplicate: collapse same (name, role) within 5s window + raw.sort(key=lambda x: x.intro_time) + deduped: list[SpeakerIntro] = [] + for intro in raw: + if deduped: + prev = deduped[-1] + if (prev.name == intro.name + and prev.role_hint == intro.role_hint + and abs(intro.intro_time - prev.intro_time) <= 5.0): + continue + deduped.append(intro) + return deduped + + +@dataclass +class NamedTurn: + speaker: str # original diarizer label (HOST / CO-HOST / CALLER / BUMPER / UNKNOWN) + name: str | None # resolved name, or None + role_hint: str | None # "caller" / "guest" / "fillin" / etc. + start: float + end: float + confidence: float + intro_source: str | None = None # transcript phrase that drove the resolution + + +# Allow an intro to bind to a turn that started slightly before it +# (Whisper segment boundaries vs diarizer turn boundaries don't always +# align; sometimes Mike's "let's talk to Kay" lands a few seconds after +# Kay's first audio frame). +_INTRO_FORWARD_TOLERANCE_S = 8.0 +# For caller_close patterns, only bind if the close mention is within +# this many seconds AFTER the turn ends. +_CLOSE_LOOKAHEAD_S = 30.0 + + +def resolve_speakers(diarization_turns: list[dict], + intros: list[SpeakerIntro]) -> list[NamedTurn]: + """Assign speaker names to non-HOST diarization turns. + + Algorithm: + For each non-HOST turn T: + - Find the LATEST opening intro (caller / guest / fillin) at or + before T.start (with 8s forward tolerance to handle boundary + slop). No time limit — a later intro implicitly closes the + previous caller, so the most recent one wins. + - If no opening intro applies, look for a "thanks for the call X" + closure within 30s after T ends. + - If still none, leave the turn unresolved. + """ + # Sort intros by time so we can walk through linearly + opening = sorted( + [i for i in intros if i.role_hint != "caller_close"], + key=lambda i: i.intro_time, + ) + closing = sorted( + [i for i in intros if i.role_hint == "caller_close"], + key=lambda i: i.intro_time, + ) + + out: list[NamedTurn] = [] + for turn in diarization_turns: + speaker = turn["speaker"] + start = turn["start"] + end = turn["end"] + confidence = turn.get("confidence", 0.0) + + if speaker in ("HOST", "BUMPER", "UNKNOWN"): + out.append(NamedTurn( + speaker=speaker, name=None, role_hint=None, + start=start, end=end, confidence=confidence, + )) + continue + + # Latest opening intro at or before this turn (with forward slop) + best: SpeakerIntro | None = None + cutoff = start + _INTRO_FORWARD_TOLERANCE_S + for intro in opening: + if intro.intro_time <= cutoff: + best = intro # later intros override earlier + else: + break + + # If no opening, try a close pattern shortly AFTER turn ends + if best is None: + for intro in closing: + if end <= intro.intro_time <= end + _CLOSE_LOOKAHEAD_S: + best = intro + break + + if best is not None: + role = "caller" if best.role_hint == "caller_close" else best.role_hint + out.append(NamedTurn( + speaker=speaker, + name=best.name, + role_hint=role, + start=start, end=end, + confidence=confidence, + intro_source=best.source_text[:80], + )) + else: + out.append(NamedTurn( + speaker=speaker, name=None, role_hint=None, + start=start, end=end, confidence=confidence, + )) + + return out + + +def named_speaker_summary(named_turns: list[NamedTurn], duration: float) -> dict[str, float]: + """Aggregate seconds-by-named-speaker for reporting.""" + times: dict[str, float] = {} + for t in named_turns: + if t.name: + key = f"{t.role_hint}: {t.name}" + else: + key = t.speaker + times[key] = times.get(key, 0) + (t.end - t.start) + return dict(sorted(times.items(), key=lambda x: -x[1])) + + +def resolve_from_files(transcript_path: Path, diarization_path: Path) -> list[NamedTurn]: + """Convenience wrapper: load JSONs, run extraction + resolution.""" + with open(transcript_path) as f: + tdata = json.load(f) + with open(diarization_path) as f: + ddata = json.load(f) + intros = extract_intros(tdata.get("segments", [])) + return resolve_speakers(ddata.get("turns", []), intros)