Files
claudetools/projects/radio-show/audio-processor/src/speaker_oracle.py
Mike Swanson 488bf5849e radio: attach caller names to Q&A pairs from transcript intros
QAPair gets caller_name and caller_role fields populated by a new
attach_caller_names(pairs, transcript_segments) helper. For each pair,
finds the active opening intro at the question_start time (8s forward
tolerance, no backward limit — a caller's call can run for 10+ minutes
and the intro happens once at the start) and attaches the speaker name.

Validation on 9-episode test set:
  19/19 Q&A pairs (100%) now have caller names attached.

Examples of corrections from oracle attribution:
  2018-s10e18 @ 73:36  Christopher (was misattributed to "Tara")
  2015-s7e19 @ 35:45   William     (was misattributed to "Tara")
  2010-05-08-hr1       Jackie x3, Bruce
  2012-03-10-hr1       Adam x2
  2016-s8e43           John, Doug
  2017-s9e30           Tom, Denise x3, Charlie

speaker_oracle.py: adds speaker_at(time, intros) helper used both by the
existing resolve_speakers() and the new caller-name attachment. Also
adds the "let's fit/bring/put X in/on" intro pattern variant (caught
Charlie at 70:21 in 2017-s9e30 that "talk to X" missed).

download_full_archive.py: SSH keepalive every 30s + per-file retry-on-
failure (up to 3 attempts with reconnect). Earlier run hung on a dead
connection at file 109 of 589 with no recovery; restarted run is now
running at ~10 MB/s vs ~2-3 MB/s before.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 16:55:31 -07:00

294 lines
12 KiB
Python

"""
Transcript-driven speaker name resolution.
Mike Swanson almost always announces who's about to speak — caller pickups
("let's talk to William"), guest intros ("we have Clay from the Nerd Junkies"),
co-host substitutions ("in Tara's place, we have Clay"), thank-yous on
caller close. These are deterministic ground-truth signals the audio-only
WavLM diarizer cannot use.
This module:
1. Extracts speaker introductions from a transcript.
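2. Binds each diarization turn not labelled HOST/BUMPER/UNKNOWN to the latest
   opening intro at or before it (falling back to a caller-close mention
   shortly after the turn ends).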
3. Returns named speaker turns, overriding incorrect cosine matches.
Pipeline order: run AFTER diarization. The resolved names are attached to the
diarizer's CO-HOST/CALLER turns as concrete people; HOST, BUMPER and UNKNOWN
turns are passed through without a name.
"""
from __future__ import annotations
import re
import json
from dataclasses import dataclass, field
from pathlib import Path
# ── Introduction patterns ────────────────────────────────────────────────────
# Each entry is a 4-tuple:
#   (compiled regex, role_hint, name_group_index, affiliation_group_index or None)
# Patterns ordered most-specific first.
#
# Every pattern is compiled with re.I, so "[A-Z][a-z]+" matches a word in ANY
# letter case; extract_intros() re-checks that the captured name really starts
# with an uppercase letter in the source text before accepting it.
_INTRO_PATTERNS: list[tuple[re.Pattern, str, int, int | None]] = [
    # "in Tara's place, we have Clay" — fill-in for absent co-host.
    # Group 1 is the absent person, group 2 the stand-in (the name we track).
    # ".?s" tolerates any single character (or none) where the apostrophe goes.
    # NOTE(review): there is no leading \b, so "in" may be the tail of a longer word.
    (re.compile(
        r"in\s+([A-Z][a-z]+).?s\s+place,?\s+we\s+have\s+([A-Z][a-z]+)",
        re.I), "fillin", 2, None),
    # "we have <name> from <affiliation>" — guest intro.
    # Group 2 (the affiliation) is a lazy 3-31 character run that must be
    # followed by punctuation or one of the listed trailing words.
    # NOTE: re.MULTILINE has no effect here — the pattern contains no ^ or $.
    (re.compile(
        r"\bwe\s+have\s+([A-Z][a-z]+)\s+(?:from|with)\s+(?:the\s+)?([A-Z][\w\s]{2,30}?)(?:\.|,|;|\s+(?:on|here|joining|today|tonight|with))",
        re.I | re.MULTILINE), "guest", 1, 2),
    # "let's talk to <name>" / "let's go ahead and talk to <name>" /
    # "let's go to <name>" — caller pickup
    (re.compile(
        r"\blet.s\s+(?:go\s+ahead\s+and\s+)?(?:talk|go)\s+to\s+([A-Z][a-z]+)",
        re.I), "caller", 1, None),
    # "let's fit <name> in" / "let's bring <name> on" / "let's put <name> on"
    # — caller pickup variants
    (re.compile(
        r"\blet.s\s+(?:go\s+ahead\s+and\s+)?(?:fit|bring|put)\s+([A-Z][a-z]+)\s+(?:in|on)\b",
        re.I), "caller", 1, None),
    # "Hello, <name>. How are you?" — caller pickup
    (re.compile(
        r"\bhello,?\s+([A-Z][a-z]+)\.?\s+(?:how\s+are\s+you|thanks\s+for|are\s+you\s+there|welcome)",
        re.I), "caller", 1, None),
    # "Hi <name>, welcome/how are you" — caller pickup variant
    (re.compile(
        r"\bhi\s+([A-Z][a-z]+),?\s+(?:welcome|how\s+are\s+you|thanks\s+for)",
        re.I), "caller", 1, None),
    # "thanks (so much) for the call, <name>" / "thanks for calling, <name>" /
    # "thanks for (the) patience, <name>" — caller close.
    # Require an explicit "for (the) call/calling/patience" — otherwise greedy
    # matching captures any capitalized word after "thanks" (Continue, And,
    # For, You, Man, etc.)
    (re.compile(
        r"\bthanks?(?:\s+so\s+much)?\s+for\s+(?:the\s+)?(?:call(?:ing)?|patience),?\s+([A-Z][a-z]+)\b",
        re.I), "caller_close", 1, None),
    # "joining us/me (today|tonight|this morning) (is|we have) <name>" — guest
    # intro without an affiliation group
    (re.compile(
        r"\bjoining\s+(?:us|me)\s+(?:today|tonight|this\s+morning)?\s*(?:is|we\s+have)\s+([A-Z][a-z]+)",
        re.I), "guest", 1, None),
]
# Words that look like names but aren't real people we'd track.
# Includes show callouts, generic words, host self-references, common
# capitalized non-name words that survive mid-sentence in transcripts.
#
# Entries are lowercase because _is_blacklisted() lowercases the candidate
# before the membership test. Each word appears exactly once ("well" used to be
# listed twice). The group headings below are for readability only.
# NOTE(review): month/day words such as "april", "may" and "june" are also
# real first names; callers with those names will be rejected.
_NAME_BLACKLIST = {
    # Host / show callouts
    "mike", "swanson", "computer", "guru", "show", "kvoi", "kby", "patreon",
    # Places
    "tucson", "arizona", "france", "germany", "japan", "china", "russia",
    "europe", "asia", "africa",
    # Tech / brand / event names
    "windows", "office", "google", "facebook", "twitter", "youtube", "ces",
    "ipad", "iphone", "android", "samsung", "amazon", "intel", "amd",
    "nvidia",
    # Public figures
    "elon", "musk", "trump", "biden", "obama",
    # Days, months and other time words
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
    "sunday",
    "january", "february", "march", "april", "may", "june", "july",
    "august", "september", "october", "november", "december",
    "today", "tonight",
    # Generic nouns
    "internet", "service", "feature", "back", "phone", "voice", "call",
    # Fillers and function words
    "what", "how", "now", "well", "okay", "right", "sure", "yeah", "yes",
    "no", "thanks", "the", "and", "there", "but", "or", "so", "you", "for",
    "continue", "very",
    # Generic forms of address
    "buddy", "guys", "man", "god", "everyone", "everybody", "anyone",
}
@dataclass
class SpeakerIntro:
    """One spoken introduction of a person, extracted from a transcript segment.

    Attributes:
        name: The introduced person's first name, normalized to "Xxxx" casing.
        intro_time: End time of the transcript segment that contained the
            intro — the introduced speaker's turn likely starts after this.
        role_hint: One of "caller", "guest", "fillin" or "caller_close".
        affiliation: For guest intros, the "from <affiliation>" part, if any.
        fillin_for: For "in X's place, we have Y" intros, the absent person X.
        source_text: The stripped text of the segment the intro was found in.
    """

    name: str
    intro_time: float
    role_hint: str
    affiliation: str | None = None
    fillin_for: str | None = None
    source_text: str = ""
def _is_blacklisted(name: str) -> bool:
    """Return True when *name* should not be treated as a person's name.

    A candidate is rejected when it is shorter than three characters or when
    its lowercase form is listed in ``_NAME_BLACKLIST``.
    """
    if len(name) < 3:
        return True
    return name.lower() in _NAME_BLACKLIST
def extract_intros(transcript_segments: list[dict]) -> list[SpeakerIntro]:
    """Walk transcript segments and extract speaker introduction events.

    Each segment is a dict read through the keys ``"text"``, ``"end"`` and
    ``"start"``. Missing or null values are tolerated: a segment without text
    is skipped, and the timestamp falls back from ``end`` to ``start`` to 0.0.

    For every segment each pattern in ``_INTRO_PATTERNS`` is tried in order and
    contributes at most one intro (its first acceptable match). A candidate is
    accepted only if the captured name starts with an uppercase letter in the
    source text and is not blacklisted.

    The result is sorted by ``intro_time``. An intro is dropped as a duplicate
    when the intro kept immediately before it has the same (name, role_hint)
    and lies within 5 seconds.

    Returns:
        The de-duplicated intros in ascending ``intro_time`` order.
    """
    raw: list[SpeakerIntro] = []
    for seg in transcript_segments:
        # `or` fallbacks cover both a missing key and an explicit null value.
        text = seg.get("text") or ""
        if not text:
            continue
        end = seg.get("end") or seg.get("start") or 0.0
        for pat, role, name_idx, affil_idx in _INTRO_PATTERNS:
            for m in pat.finditer(text):
                captured = m.group(name_idx)
                # Reject names that aren't capitalized in the source text —
                # eliminates mid-sentence lowercase matches like "and", "there"
                # that re.IGNORECASE picks up.
                if not captured or not captured[0].isupper():
                    continue
                name = captured[0].upper() + captured[1:].lower()
                if _is_blacklisted(name):
                    continue

                affiliation = None
                if role == "guest" and affil_idx is not None:
                    try:
                        affil_raw = m.group(affil_idx)
                    except IndexError:
                        # Pattern table names a group the regex doesn't have.
                        affil_raw = None
                    if affil_raw is not None:
                        affiliation = affil_raw.strip().rstrip(".,;").title()

                # In the fill-in pattern group 1 is the absent person.
                fillin_for = m.group(1).capitalize() if role == "fillin" else None

                raw.append(SpeakerIntro(
                    name=name,
                    intro_time=float(end),
                    role_hint=role,
                    affiliation=affiliation,
                    fillin_for=fillin_for,
                    source_text=text.strip(),
                ))
                # Stop after the first accepted match of THIS pattern. Later
                # patterns are still tried, so one segment can yield several
                # intros (e.g. a fill-in phrase that also reads as a guest
                # intro produces both).
                break

    # Deduplicate: collapse same (name, role) within a 5s window, comparing
    # each intro against the one kept immediately before it.
    raw.sort(key=lambda intro: intro.intro_time)
    deduped: list[SpeakerIntro] = []
    for intro in raw:
        prev = deduped[-1] if deduped else None
        is_repeat = (
            prev is not None
            and prev.name == intro.name
            and prev.role_hint == intro.role_hint
            and abs(intro.intro_time - prev.intro_time) <= 5.0
        )
        if not is_repeat:
            deduped.append(intro)
    return deduped
@dataclass
class NamedTurn:
    """A diarization turn, optionally annotated with a resolved person name.

    Attributes:
        speaker: Original diarizer label
            (HOST / CO-HOST / CALLER / BUMPER / UNKNOWN).
        name: Resolved name, or None when the turn could not be resolved.
        role_hint: "caller" / "guest" / "fillin" / etc., or None.
        start: Turn start time.
        end: Turn end time.
        confidence: Confidence value carried over from the diarization turn.
        intro_source: Transcript phrase that drove the resolution, if any.
    """

    speaker: str
    name: str | None
    role_hint: str | None
    start: float
    end: float
    confidence: float
    intro_source: str | None = None
# Forward slop, in seconds, added to a turn's start (or to a query timestamp)
# when looking up the latest opening intro. It lets an intro bind to a turn
# that started slightly before it: Whisper segment boundaries and diarizer
# turn boundaries don't always align, and sometimes Mike's "let's talk to Kay"
# lands a few seconds after Kay's first audio frame.
_INTRO_FORWARD_TOLERANCE_S = 8.0
# For caller_close patterns, only bind if the close mention falls between the
# turn's end and this many seconds AFTER it (both bounds inclusive).
_CLOSE_LOOKAHEAD_S = 30.0
def resolve_speakers(diarization_turns: list[dict],
                     intros: list[SpeakerIntro]) -> list[NamedTurn]:
    """Attach person names to diarization turns using transcript intros.

    Turns labelled HOST, BUMPER or UNKNOWN are copied through unnamed. For
    every other turn:

      1. Pick the LATEST opening intro (any role except "caller_close")
         whose time is at or before ``turn start + 8s``. There is no backward
         limit — a newer intro implicitly replaces the previous speaker.
      2. If no opening intro qualifies, pick the first "caller_close" intro
         that falls within 30s after the turn ends; its role is reported as
         "caller".
      3. Otherwise the turn stays unresolved (``name`` is None).

    Each turn dict must provide "speaker", "start" and "end"; "confidence"
    defaults to 0.0. One NamedTurn is returned per input turn, in input order.
    """
    passthrough_labels = ("HOST", "BUMPER", "UNKNOWN")

    # Split intros into openers and caller-close mentions, each in time order.
    openers: list[SpeakerIntro] = []
    closers: list[SpeakerIntro] = []
    for item in intros:
        (closers if item.role_hint == "caller_close" else openers).append(item)
    openers.sort(key=lambda item: item.intro_time)
    closers.sort(key=lambda item: item.intro_time)

    resolved: list[NamedTurn] = []
    for turn in diarization_turns:
        label = turn["speaker"]
        t_start = turn["start"]
        t_end = turn["end"]
        conf = turn.get("confidence", 0.0)

        match: SpeakerIntro | None = None
        if label not in passthrough_labels:
            latest_allowed = t_start + _INTRO_FORWARD_TOLERANCE_S
            # openers is time-sorted, so the first hit when scanning from the
            # end is the most recent intro inside the tolerance window.
            match = next(
                (o for o in reversed(openers) if o.intro_time <= latest_allowed),
                None,
            )
            if match is None:
                # Fall back to a "thanks for the call, X" shortly after the turn.
                window_end = t_end + _CLOSE_LOOKAHEAD_S
                match = next(
                    (c for c in closers if t_end <= c.intro_time <= window_end),
                    None,
                )

        if match is None:
            resolved.append(NamedTurn(
                speaker=label, name=None, role_hint=None,
                start=t_start, end=t_end, confidence=conf,
            ))
            continue

        resolved.append(NamedTurn(
            speaker=label,
            name=match.name,
            role_hint="caller" if match.role_hint == "caller_close" else match.role_hint,
            start=t_start, end=t_end,
            confidence=conf,
            intro_source=match.source_text[:80],
        ))
    return resolved
def speaker_at(time: float, intros: list[SpeakerIntro]) -> SpeakerIntro | None:
    """Return the opening intro that is active at *time*, or None.

    "Active" means the latest intro with a role other than "caller_close"
    whose ``intro_time`` is no later than ``time`` plus the forward tolerance.
    There is no backward limit. Intended for single-timestamp lookups such as
    attaching a caller name to a Q&A pair by its question_start time.
    """
    candidates = [i for i in intros if i.role_hint != "caller_close"]
    candidates.sort(key=lambda i: i.intro_time)
    latest_allowed = time + _INTRO_FORWARD_TOLERANCE_S
    # Time-sorted list: walking backwards, the first intro inside the window
    # is the most recent one.
    for candidate in reversed(candidates):
        if candidate.intro_time <= latest_allowed:
            return candidate
    return None
def named_speaker_summary(named_turns: list[NamedTurn], duration: float) -> dict[str, float]:
    """Total the seconds spoken per speaker, largest first, for reporting.

    Turns with a resolved name are keyed as "<role_hint>: <name>"; all others
    are keyed by their diarizer label. ``duration`` is accepted but not used.
    """
    totals: dict[str, float] = {}
    for turn in named_turns:
        label = f"{turn.role_hint}: {turn.name}" if turn.name else turn.speaker
        span = turn.end - turn.start
        totals[label] = totals.get(label, 0) + span
    # Negated key keeps the sort stable for ties while ordering descending.
    ranked = sorted(totals.items(), key=lambda item: -item[1])
    return dict(ranked)
def resolve_from_files(transcript_path: Path, diarization_path: Path) -> list[NamedTurn]:
    """Convenience wrapper: load JSONs, run extraction + resolution.

    Args:
        transcript_path: JSON file whose top-level "segments" list is passed
            to ``extract_intros``.
        diarization_path: JSON file whose top-level "turns" list is passed to
            ``resolve_speakers``.

    Returns:
        The named turns produced by ``resolve_speakers``. A missing or null
        "segments" / "turns" entry is treated as an empty list.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if either file is not valid JSON.
    """
    # Decode explicitly as UTF-8: the default text encoding is locale-dependent
    # and mis-decodes non-ASCII transcript text on non-UTF-8 locales.
    with open(transcript_path, encoding="utf-8") as f:
        tdata = json.load(f)
    with open(diarization_path, encoding="utf-8") as f:
        ddata = json.load(f)
    intros = extract_intros(tdata.get("segments") or [])
    return resolve_speakers(ddata.get("turns") or [], intros)