""" Transcript-driven speaker name resolution. Mike Swanson almost always announces who's about to speak — caller pickups ("let's talk to William"), guest intros ("we have Clay from the Nerd Junkies"), co-host substitutions ("in Tara's place, we have Clay"), thank-yous on caller close. These are deterministic ground-truth signals the audio-only WavLM diarizer cannot use. This module: 1. Extracts speaker introductions from a transcript. 2. Binds each intro to the next non-HOST diarization turn. 3. Returns named speaker turns, overriding incorrect cosine matches. Pipeline order: run AFTER diarization, the resolved names override the HOST/CO-HOST/CALLER/BUMPER labels with concrete people. """ from __future__ import annotations import re import json from dataclasses import dataclass, field from pathlib import Path # ── Introduction patterns ──────────────────────────────────────────────────── # Each: (regex, role_hint, name_group_index, optional affiliation_group) # Patterns ordered most-specific first. _INTRO_PATTERNS: list[tuple[re.Pattern, str, int, int | None]] = [ # "in Tara's place, we have Clay" — fill-in for absent co-host (re.compile( r"in\s+([A-Z][a-z]+).?s\s+place,?\s+we\s+have\s+([A-Z][a-z]+)", re.I), "fillin", 2, None), # "we have from " — guest intro (re.compile( r"\bwe\s+have\s+([A-Z][a-z]+)\s+(?:from|with)\s+(?:the\s+)?([A-Z][\w\s]{2,30}?)(?:\.|,|;|\s+(?:on|here|joining|today|tonight|with))", re.I | re.MULTILINE), "guest", 1, 2), # "let's talk to " / "let's go ahead and talk to " — caller pickup (re.compile( r"\blet.s\s+(?:go\s+ahead\s+and\s+)?(?:talk|go)\s+to\s+([A-Z][a-z]+)", re.I), "caller", 1, None), # "let's fit in" / "let's bring on" — caller pickup variants (re.compile( r"\blet.s\s+(?:go\s+ahead\s+and\s+)?(?:fit|bring|put)\s+([A-Z][a-z]+)\s+(?:in|on)\b", re.I), "caller", 1, None), # "Hello, . How are you?" 
— caller pickup (re.compile( r"\bhello,?\s+([A-Z][a-z]+)\.?\s+(?:how\s+are\s+you|thanks\s+for|are\s+you\s+there|welcome)", re.I), "caller", 1, None), # "Hi , welcome/how are you" — caller pickup variant (re.compile( r"\bhi\s+([A-Z][a-z]+),?\s+(?:welcome|how\s+are\s+you|thanks\s+for)", re.I), "caller", 1, None), # "thanks (so much) for the call, " / "thanks for calling, " — caller close # Require explicit "for the call/calling" — otherwise greedy matching captures # any capitalized word after "thanks" (Continue, And, For, You, Man, etc.) (re.compile( r"\bthanks?(?:\s+so\s+much)?\s+for\s+(?:the\s+)?(?:call(?:ing)?|patience),?\s+([A-Z][a-z]+)\b", re.I), "caller_close", 1, None), # "joining us today/tonight (is|we have) " (re.compile( r"\bjoining\s+(?:us|me)\s+(?:today|tonight|this\s+morning)?\s*(?:is|we\s+have)\s+([A-Z][a-z]+)", re.I), "guest", 1, None), ] # Words that look like names but aren't real people we'd track. # Includes show callouts, generic words, host self-references, common # capitalized non-name words that survive mid-sentence in transcripts. 
_NAME_BLACKLIST = {
    "mike", "swanson", "computer", "guru", "windows", "office", "google",
    "patreon", "tucson", "arizona", "facebook", "twitter", "youtube",
    "what", "how", "now", "well", "okay", "right", "sure", "yeah", "yes", "no",
    "thanks", "today", "tonight",
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    "january", "february", "march", "april", "may", "june", "july", "august",
    "september", "october", "november", "december",
    "internet", "service", "feature", "back", "show", "phone", "voice", "call",
    "kvoi", "kby", "ces", "ipad", "iphone", "android", "samsung", "amazon",
    "intel", "amd", "nvidia",
    "the", "and", "there", "but", "or", "so",
    "france", "germany", "japan", "china", "russia", "europe", "asia", "africa",
    "elon", "musk", "trump", "biden", "obama",  # public figures
    "you", "for", "continue", "very", "buddy", "guys", "man", "god",
    "everyone", "everybody", "anyone",
}


@dataclass
class SpeakerIntro:
    """One spoken introduction (or sign-off) of a named person in the transcript."""

    name: str
    intro_time: float  # transcript segment end time — speaker turn likely starts after this
    role_hint: str  # "caller", "guest", "fillin", "caller_close"
    affiliation: str | None = None
    fillin_for: str | None = None  # for "in X's place, we have Y" — X
    source_text: str = ""


def _is_blacklisted(name: str) -> bool:
    """Return True for blacklisted words (case-insensitive) and names under 3 chars."""
    return name.lower() in _NAME_BLACKLIST or len(name) < 3


# Consecutive intros with the same (name, role_hint) closer together than this
# are treated as one event.
_DEDUP_WINDOW_S = 5.0


def extract_intros(transcript_segments: list[dict]) -> list[SpeakerIntro]:
    """Walk transcript segments and extract speaker introduction events.

    Each segment is a dict read through the keys "text", "end" and "start";
    the intro is stamped with the segment's end time (falling back to its
    start time, then to 0.0, when a value is missing or null).

    Within one segment every pattern contributes at most one intro, and the
    same person is recorded at most once as an opening intro and at most once
    as a caller close: the first pattern in ``_INTRO_PATTERNS`` order wins, so
    the most specific role is kept instead of being shadowed by a more
    generic pattern that matched the same words. An affiliation found by a
    later guest pattern is copied onto the kept intro.

    Returns:
        Intros sorted by time, with consecutive repeats of the same
        (name, role_hint) within 5 seconds collapsed into the first one.
    """
    raw: list[SpeakerIntro] = []
    for seg in transcript_segments:
        # Tolerate segments whose fields are present but null.
        text = seg.get("text") or ""
        end = float(seg.get("end") or seg.get("start") or 0.0)

        # (name, is_close) -> intro already recorded for this segment.
        recorded: dict[tuple[str, bool], SpeakerIntro] = {}

        for pat, role, name_idx, affil_idx in _INTRO_PATTERNS:
            for m in pat.finditer(text):
                captured = m.group(name_idx)
                # Reject names that aren't capitalized in the source text —
                # eliminates mid-sentence lowercase matches like "and", "there"
                # that re.IGNORECASE picks up.
                if not captured or not captured[0].isupper():
                    continue
                name = captured[0].upper() + captured[1:].lower()
                if _is_blacklisted(name):
                    continue

                affiliation = None
                if role == "guest" and affil_idx:
                    try:
                        affil_text = m.group(affil_idx)
                    except IndexError:
                        affil_text = None
                    if affil_text is not None:
                        affiliation = affil_text.strip().rstrip(".,;").title()

                key = (name, role == "caller_close")
                earlier = recorded.get(key)
                if earlier is not None:
                    # Same person already introduced by a more specific
                    # pattern in this segment: keep that role, but don't lose
                    # the affiliation this match supplies.
                    if earlier.affiliation is None and affiliation:
                        earlier.affiliation = affiliation
                    break

                intro = SpeakerIntro(
                    name=name,
                    intro_time=end,
                    role_hint=role,
                    affiliation=affiliation,
                    fillin_for=m.group(1).capitalize() if role == "fillin" else None,
                    source_text=text.strip(),
                )
                recorded[key] = intro
                raw.append(intro)
                break  # one intro per pattern per segment is plenty

    # Deduplicate: collapse consecutive same (name, role) within the window.
    # Only the immediately preceding kept intro is compared, so a repeat that
    # follows a *different* person's intro is kept and stays the most recent.
    raw.sort(key=lambda x: x.intro_time)
    deduped: list[SpeakerIntro] = []
    for intro in raw:
        if deduped:
            prev = deduped[-1]
            if (prev.name == intro.name
                    and prev.role_hint == intro.role_hint
                    and abs(intro.intro_time - prev.intro_time) <= _DEDUP_WINDOW_S):
                continue
        deduped.append(intro)
    return deduped


@dataclass
class NamedTurn:
    """A diarization turn, optionally annotated with a resolved speaker name."""

    speaker: str  # original diarizer label (HOST / CO-HOST / CALLER / BUMPER / UNKNOWN)
    name: str | None  # resolved name, or None
    role_hint: str | None  # "caller" / "guest" / "fillin" / etc.
    start: float
    end: float
    confidence: float
    intro_source: str | None = None  # transcript phrase that drove the resolution


# Allow an intro to bind to a turn that started slightly before it
# (Whisper segment boundaries vs diarizer turn boundaries don't always
# align; sometimes Mike's "let's talk to Kay" lands a few seconds after
# Kay's first audio frame).
_INTRO_FORWARD_TOLERANCE_S = 8.0

# For caller_close patterns, only bind if the close mention is within
# this many seconds AFTER the turn ends.
_CLOSE_LOOKAHEAD_S = 30.0


def _opening_intros(intros: list[SpeakerIntro]) -> list[SpeakerIntro]:
    """Return every intro that is not a caller close, sorted by time."""
    return sorted(
        (i for i in intros if i.role_hint != "caller_close"),
        key=lambda i: i.intro_time,
    )


def _latest_opening(opening: list[SpeakerIntro], cutoff: float) -> SpeakerIntro | None:
    """Return the last intro in time-sorted ``opening`` at or before ``cutoff``."""
    best: SpeakerIntro | None = None
    for intro in opening:
        if intro.intro_time > cutoff:
            break
        best = intro  # later intros override earlier
    return best


def resolve_speakers(diarization_turns: list[dict],
                     intros: list[SpeakerIntro]) -> list[NamedTurn]:
    """Assign speaker names to non-HOST diarization turns.

    Each turn is a dict with "speaker", "start" and "end" keys and an
    optional "confidence" (default 0.0). Turns labelled HOST, BUMPER or
    UNKNOWN are passed through unnamed; every other label is resolved.

    Algorithm:
      For each resolvable turn T:
        - Find the LATEST opening intro (caller / guest / fillin) at or before
          T.start (with 8s forward tolerance to handle boundary slop).
          No time limit — a later intro implicitly closes the previous
          caller, so the most recent one wins.
        - If no opening intro applies, look for a "thanks for the call X"
          closure within 30s after T ends.
        - If still none, leave the turn unresolved.

    Returns:
        One NamedTurn per input turn, in input order.
    """
    opening = _opening_intros(intros)
    closing = sorted(
        (i for i in intros if i.role_hint == "caller_close"),
        key=lambda i: i.intro_time,
    )

    out: list[NamedTurn] = []
    for turn in diarization_turns:
        speaker = turn["speaker"]
        start = turn["start"]
        end = turn["end"]
        confidence = turn.get("confidence", 0.0)

        best: SpeakerIntro | None = None
        if speaker not in ("HOST", "BUMPER", "UNKNOWN"):
            best = _latest_opening(opening, start + _INTRO_FORWARD_TOLERANCE_S)
            if best is None:
                # No opening: take the first close mention shortly AFTER the turn.
                best = next(
                    (c for c in closing
                     if end <= c.intro_time <= end + _CLOSE_LOOKAHEAD_S),
                    None,
                )

        if best is None:
            out.append(NamedTurn(
                speaker=speaker, name=None, role_hint=None,
                start=start, end=end, confidence=confidence,
            ))
            continue

        # A close mention names the caller who just finished speaking.
        role = "caller" if best.role_hint == "caller_close" else best.role_hint
        out.append(NamedTurn(
            speaker=speaker, name=best.name, role_hint=role,
            start=start, end=end, confidence=confidence,
            intro_source=best.source_text[:80],
        ))
    return out


def speaker_at(time: float, intros: list[SpeakerIntro]) -> SpeakerIntro | None:
    """Return the active opening intro at the given time, or None.

    Same lookup logic as resolve_speakers but for a single timestamp, used to
    attach caller names to Q&A pairs by their question_start time. Caller-close
    intros are ignored.
    """
    return _latest_opening(_opening_intros(intros), time + _INTRO_FORWARD_TOLERANCE_S)


def named_speaker_summary(named_turns: list[NamedTurn], duration: float) -> dict[str, float]:
    """Aggregate seconds-by-named-speaker for reporting.

    Named turns are keyed "<role_hint>: <name>", unnamed turns by their
    diarizer label. The result is ordered by descending total seconds.
    ``duration`` is accepted for interface compatibility and is not used.
    """
    times: dict[str, float] = {}
    for t in named_turns:
        key = f"{t.role_hint}: {t.name}" if t.name else t.speaker
        times[key] = times.get(key, 0) + (t.end - t.start)
    return dict(sorted(times.items(), key=lambda x: -x[1]))


def resolve_from_files(transcript_path: Path, diarization_path: Path) -> list[NamedTurn]:
    """Convenience wrapper: load JSONs, run extraction + resolution.

    Reads the "segments" list from the transcript JSON and the "turns" list
    from the diarization JSON (each defaulting to empty). Files are decoded
    as UTF-8 so the result doesn't depend on the platform's locale encoding.
    """
    with open(transcript_path, encoding="utf-8") as f:
        tdata = json.load(f)
    with open(diarization_path, encoding="utf-8") as f:
        ddata = json.load(f)
    intros = extract_intros(tdata.get("segments", []))
    return resolve_speakers(ddata.get("turns", []), intros)