# diff --git a/projects/radio-show/audio-processor/benchmark.py b/projects/radio-show/audio-processor/benchmark.py
# index 4ad30c3..2d8b06e 100644
# --- a/projects/radio-show/audio-processor/benchmark.py
# +++ b/projects/radio-show/audio-processor/benchmark.py
# @@ -138,6 +138,38 @@ for ep, transcript_path, audio_dur, _ in trans_results:
total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0

# ── Phase 2.5: Speaker name resolution from transcript intros ───────────────
#
# For every transcribed episode that also has a diarization.json, bind the
# names announced in the transcript to diarization turns and print a
# one-line summary of how many seconds each named speaker talked.

console.print("\n[bold]Phase 2.5: Name Resolution[/bold]")

# NOTE(review): these imports sit mid-script because this phase was added as
# a hunk; the top-of-file import block is not visible from here.
from src.speaker_oracle import resolve_from_files, named_speaker_summary
import json as _json

for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem
    diarization_path = trans_ep_dir / "diarization.json"
    if not diarization_path.exists():
        # Nothing to resolve against for this episode.
        continue

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(transcript_path, encoding="utf-8") as f:
        td = _json.load(f)
    # `or` chain (rather than a .get default) so an explicit null duration
    # in the transcript also falls back to the measured audio duration.
    duration = td.get("duration") or audio_dur or 0

    named = resolve_from_files(transcript_path, diarization_path)
    summary = named_speaker_summary(named, duration)

    # Show only resolved names (caller/guest/fillin) — drop HOST/BUMPER/UNKNOWN
    resolved = {k: v for k, v in summary.items()
                if k.startswith(("caller:", "guest:", "fillin:"))}
    unresolved_caller = summary.get("CALLER", 0) + summary.get("CO-HOST", 0)

    if resolved or unresolved_caller:
        # Summary keys look like "role: Name"; partition keeps everything
        # after the first separator instead of truncating at a second one.
        names_str = ", ".join(
            f"{k.partition(': ')[2]} ({v:.0f}s)" for k, v in resolved.items()
        )
        # The leading backslash escapes the bracket: this console is fed
        # "[bold]…[/bold]" markup above, and an unescaped "[unresolved: …]"
        # would be parsed as a style tag and vanish from the output.
        console.print(
            f" {ep.stem}: {len(resolved)} named ({names_str or 'none'})"
            + (f" \\[unresolved: {unresolved_caller:.0f}s]" if unresolved_caller else "")
        )

# ── Phase 3: Q&A extraction ────────────────────────────────────────────────

console.print("\n[bold]Phase 3: Q&A Extraction[/bold]")

# diff --git a/projects/radio-show/audio-processor/src/speaker_oracle.py b/projects/radio-show/audio-processor/src/speaker_oracle.py
# new file mode 100644
# index 0000000..e88ab69
# --- /dev/null
# +++ b/projects/radio-show/audio-processor/src/speaker_oracle.py
"""
Transcript-driven speaker name resolution.

Mike Swanson almost always announces who's about to speak — caller pickups
("let's talk to William"), guest intros ("we have Clay from the Nerd Junkies"),
co-host substitutions ("in Tara's place, we have Clay"), thank-yous on
caller close. These are deterministic ground-truth signals the audio-only
WavLM diarizer cannot use.

This module:
  1. Extracts speaker introductions from a transcript.
  2. Binds each intro to the next non-HOST diarization turn.
  3. Returns named speaker turns, overriding incorrect cosine matches.

Pipeline order: run AFTER diarization, the resolved names override the
HOST/CO-HOST/CALLER/BUMPER labels with concrete people.
"""
from __future__ import annotations

import json
import re
from bisect import bisect_left, bisect_right
from dataclasses import dataclass, field
from pathlib import Path

# ── Introduction patterns ────────────────────────────────────────────────────
# Each: (regex, role_hint, name_group_index, optional affiliation_group)
# Patterns ordered most-specific first.
#
# All patterns are compiled with re.I, which also makes "[A-Z]" match
# lowercase letters; extract_intros() re-checks that the captured name is
# capitalized in the source text.

_INTRO_PATTERNS: list[tuple[re.Pattern, str, int, int | None]] = [
    # "in NAME's place, we have NAME" — fill-in for absent co-host
    (re.compile(
        r"in\s+([A-Z][a-z]+).?s\s+place,?\s+we\s+have\s+([A-Z][a-z]+)",
        re.I), "fillin", 2, None),

    # "we have NAME from ORG" — guest intro
    (re.compile(
        r"\bwe\s+have\s+([A-Z][a-z]+)\s+(?:from|with)\s+(?:the\s+)?([A-Z][\w\s]{2,30}?)(?:\.|,|;|\s+(?:on|here|joining|today|tonight|with))",
        re.I | re.MULTILINE), "guest", 1, 2),

    # "let's talk to NAME" / "let's go ahead and talk to NAME" — caller pickup
    (re.compile(
        r"\blet.s\s+(?:go\s+ahead\s+and\s+)?(?:talk|go)\s+to\s+([A-Z][a-z]+)",
        re.I), "caller", 1, None),

    # "Hello, NAME. How are you?" — caller pickup
    (re.compile(
        r"\bhello,?\s+([A-Z][a-z]+)\.?\s+(?:how\s+are\s+you|thanks\s+for|are\s+you\s+there|welcome)",
        re.I), "caller", 1, None),

    # "Hi NAME, welcome/how are you" — caller pickup variant
    (re.compile(
        r"\bhi\s+([A-Z][a-z]+),?\s+(?:welcome|how\s+are\s+you|thanks\s+for)",
        re.I), "caller", 1, None),

    # "thanks (so much) for the call, NAME" / "thanks for calling, NAME" — caller close
    # Require explicit "for the call/calling" — otherwise greedy matching captures
    # any capitalized word after "thanks" (Continue, And, For, You, Man, etc.)
    (re.compile(
        r"\bthanks?(?:\s+so\s+much)?\s+for\s+(?:the\s+)?(?:call(?:ing)?|patience),?\s+([A-Z][a-z]+)\b",
        re.I), "caller_close", 1, None),

    # "joining us today/tonight (is|we have) NAME"
    (re.compile(
        r"\bjoining\s+(?:us|me)\s+(?:today|tonight|this\s+morning)?\s*(?:is|we\s+have)\s+([A-Z][a-z]+)",
        re.I), "guest", 1, None),
]

# Words that look like names but aren't real people we'd track.
# Includes show callouts, generic words, host self-references, common
# capitalized non-name words that survive mid-sentence in transcripts.
# Entries are lowercase; lookups lowercase the candidate first.
_NAME_BLACKLIST = frozenset({
    "mike", "swanson", "computer", "guru", "windows", "office", "google",
    "patreon", "tucson", "arizona", "facebook", "twitter", "youtube",
    "what", "how", "now", "well", "okay", "right", "sure", "yeah", "yes",
    "no", "thanks", "today", "tonight", "monday", "tuesday", "wednesday",
    "thursday", "friday", "saturday", "sunday", "january", "february",
    "march", "april", "may", "june", "july", "august", "september",
    "october", "november", "december", "internet", "service", "feature",
    "back", "show", "phone", "voice", "call", "kvoi", "kby", "ces",
    "ipad", "iphone", "android", "samsung", "amazon", "intel", "amd",
    "nvidia", "the", "and", "there", "but", "or", "so",
    "france", "germany", "japan", "china", "russia", "europe", "asia",
    "africa", "elon", "musk", "trump", "biden", "obama",  # public figures
    "you", "for", "continue", "very", "buddy", "guys", "man", "god",
    "everyone", "everybody", "anyone",
})

# Two intros with the same (name, role_hint) this close together are
# treated as one event.
_DEDUP_WINDOW_S = 5.0


@dataclass
class SpeakerIntro:
    """One spoken introduction (or caller sign-off) found in the transcript."""

    name: str
    intro_time: float           # transcript segment end time — speaker turn likely starts after this
    role_hint: str              # "caller", "guest", "fillin", "caller_close"
    affiliation: str | None = None
    fillin_for: str | None = None   # for "in X's place, we have Y" — X
    source_text: str = ""


def _is_blacklisted(name: str) -> bool:
    """Return True for blacklisted words and for names shorter than 3 chars."""
    return name.lower() in _NAME_BLACKLIST or len(name) < 3


def extract_intros(transcript_segments: list[dict]) -> list[SpeakerIntro]:
    """Walk transcript segments and extract speaker introduction events.

    Each segment is a dict read via ``text``, ``end`` and ``start``; missing
    or null values are tolerated (empty text, time 0).

    Within one segment each pattern contributes at most one intro, and a
    name already captured by an earlier (more specific) pattern in that
    segment is not captured again — so "in Tara's place, we have Clay from
    the Nerd Junkies" yields a single fill-in intro, not a fill-in plus a
    guest intro for the same person.

    Deduplicates intros within 5 seconds with the same (name, role_hint),
    even when intros for other people fall in between.

    Returns:
        Intros sorted by ``intro_time``.
    """
    raw: list[SpeakerIntro] = []
    for seg in transcript_segments:
        # `or` chains so JSON nulls behave like missing keys.
        text = seg.get("text") or ""
        end = seg.get("end") or seg.get("start") or 0
        seen_names: set[str] = set()
        for pat, role, name_idx, affil_idx in _INTRO_PATTERNS:
            for m in pat.finditer(text):
                captured = m.group(name_idx)
                # Reject names that aren't capitalized in the source text —
                # eliminates mid-sentence lowercase matches like "and", "there"
                # that re.IGNORECASE picks up.
                if not captured or not captured[0].isupper():
                    continue
                name = captured[0].upper() + captured[1:].lower()
                if _is_blacklisted(name):
                    continue
                if name in seen_names:
                    # A more specific pattern already claimed this person
                    # in this segment; keep looking for someone else.
                    continue

                affiliation = None
                if role == "guest" and affil_idx and affil_idx <= pat.groups:
                    raw_affil = m.group(affil_idx)
                    if raw_affil:
                        affiliation = raw_affil.strip().rstrip(".,;").title()
                fillin_for = m.group(1).capitalize() if role == "fillin" else None

                raw.append(SpeakerIntro(
                    name=name,
                    intro_time=float(end),
                    role_hint=role,
                    affiliation=affiliation,
                    fillin_for=fillin_for,
                    source_text=text.strip(),
                ))
                seen_names.add(name)
                break  # one intro per pattern per segment is plenty

    # Deduplicate: collapse same (name, role) within the window, measured
    # from the last intro that was KEPT for that key. sorted() is stable, so
    # same-time intros keep their pattern-priority order.
    last_kept: dict[tuple[str, str], float] = {}
    deduped: list[SpeakerIntro] = []
    for intro in sorted(raw, key=lambda x: x.intro_time):
        key = (intro.name, intro.role_hint)
        prev_time = last_kept.get(key)
        if prev_time is not None and intro.intro_time - prev_time <= _DEDUP_WINDOW_S:
            continue
        last_kept[key] = intro.intro_time
        deduped.append(intro)
    return deduped


@dataclass
class NamedTurn:
    """A diarization turn, optionally annotated with a resolved name."""

    speaker: str                # original diarizer label (HOST / CO-HOST / CALLER / BUMPER / UNKNOWN)
    name: str | None            # resolved name, or None
    role_hint: str | None       # "caller" / "guest" / "fillin" / etc.
    start: float
    end: float
    confidence: float
    intro_source: str | None = None  # transcript phrase that drove the resolution


# Allow an intro to bind to a turn that started slightly before it
# (Whisper segment boundaries vs diarizer turn boundaries don't always
# align; sometimes Mike's "let's talk to Kay" lands a few seconds after
# Kay's first audio frame).
_INTRO_FORWARD_TOLERANCE_S = 8.0
# For caller_close patterns, only bind if the close mention is within
# this many seconds AFTER the turn ends.
_CLOSE_LOOKAHEAD_S = 30.0
# Diarizer labels that are never given a name.
_PASSTHROUGH_LABELS = ("HOST", "BUMPER", "UNKNOWN")


def resolve_speakers(diarization_turns: list[dict],
                     intros: list[SpeakerIntro]) -> list[NamedTurn]:
    """Assign speaker names to nameable diarization turns.

    Each turn is a dict with ``speaker``, ``start`` and ``end`` (required)
    and ``confidence`` (optional, defaults to 0.0). One NamedTurn is
    returned per input turn, in input order.

    Algorithm:
      Turns labelled HOST, BUMPER or UNKNOWN are passed through unnamed.
      For every other turn T:
        - Find the LATEST opening intro (caller / guest / fillin) at or
          before T.start (with 8s forward tolerance to handle boundary
          slop). No time limit — a later intro implicitly closes the
          previous caller, so the most recent one wins.
        - If no opening intro applies, look for a "thanks for the call X"
          closure within 30s after T ends; a match is reported with
          role "caller".
        - If still none, leave the turn unresolved.
    """
    # Sort once and keep parallel time lists so each turn is a binary
    # search instead of a rescan of every intro.
    opening = sorted(
        (i for i in intros if i.role_hint != "caller_close"),
        key=lambda i: i.intro_time,
    )
    closing = sorted(
        (i for i in intros if i.role_hint == "caller_close"),
        key=lambda i: i.intro_time,
    )
    opening_times = [i.intro_time for i in opening]
    closing_times = [i.intro_time for i in closing]

    out: list[NamedTurn] = []
    for turn in diarization_turns:
        speaker = turn["speaker"]
        start = turn["start"]
        end = turn["end"]
        confidence = turn.get("confidence", 0.0)

        best: SpeakerIntro | None = None
        if speaker not in _PASSTHROUGH_LABELS:
            # Latest opening intro at or before this turn (with forward
            # slop). bisect_right lands just past the last intro whose time
            # is <= cutoff, so among equal times the last-sorted one wins.
            pos = bisect_right(opening_times, start + _INTRO_FORWARD_TOLERANCE_S)
            if pos:
                best = opening[pos - 1]
            else:
                # No opening: first close mention at or after the turn's
                # end, provided it is still inside the look-ahead window.
                pos = bisect_left(closing_times, end)
                if pos < len(closing) and closing_times[pos] <= end + _CLOSE_LOOKAHEAD_S:
                    best = closing[pos]

        if best is None:
            out.append(NamedTurn(
                speaker=speaker, name=None, role_hint=None,
                start=start, end=end, confidence=confidence,
            ))
            continue

        role = "caller" if best.role_hint == "caller_close" else best.role_hint
        out.append(NamedTurn(
            speaker=speaker,
            name=best.name,
            role_hint=role,
            start=start, end=end,
            confidence=confidence,
            intro_source=best.source_text[:80],
        ))

    return out


def named_speaker_summary(named_turns: list[NamedTurn], duration: float) -> dict[str, float]:
    """Aggregate seconds-by-named-speaker for reporting.

    Resolved turns are keyed ``"<role_hint>: <name>"``; unresolved turns are
    keyed by their diarizer label. The result is ordered by descending
    total seconds.

    ``duration`` is accepted for interface compatibility and is not used in
    the computation.
    """
    times: dict[str, float] = {}
    for t in named_turns:
        key = f"{t.role_hint}: {t.name}" if t.name else t.speaker
        times[key] = times.get(key, 0) + (t.end - t.start)
    return dict(sorted(times.items(), key=lambda x: -x[1]))


def resolve_from_files(transcript_path: Path, diarization_path: Path) -> list[NamedTurn]:
    """Convenience wrapper: load JSONs, run extraction + resolution.

    Reads ``segments`` from the transcript JSON and ``turns`` from the
    diarization JSON; either key may be absent (treated as empty).
    Files are read as UTF-8 regardless of the platform's locale encoding.
    """
    with open(transcript_path, encoding="utf-8") as f:
        tdata = json.load(f)
    with open(diarization_path, encoding="utf-8") as f:
        ddata = json.load(f)
    intros = extract_intros(tdata.get("segments", []))
    return resolve_speakers(ddata.get("turns", []), intros)