QAPair gets caller_name and caller_role fields populated by a new
attach_caller_names(pairs, transcript_segments) helper. For each pair, it
finds the active opening intro at the question_start time (8s forward
tolerance, no backward limit — a caller's call can run for 10+ minutes and
the intro happens once at the start) and attaches the speaker name.

Validation on the 9-episode test set: 19/19 Q&A pairs (100%) now have caller
names attached. Examples of corrections from oracle attribution:
  2018-s10e18 @ 73:36  Christopher (was misattributed to "Tara")
  2015-s7e19 @ 35:45   William (was misattributed to "Tara")
  2010-05-08-hr1       Jackie x3, Bruce
  2012-03-10-hr1       Adam x2
  2016-s8e43           John, Doug
  2017-s9e30           Tom, Denise x3, Charlie

speaker_oracle.py: adds a speaker_at(time, intros) helper used both by the
existing resolve_speakers() and the new caller-name attachment. Also adds the
"let's fit/bring/put X in/on" intro pattern variant (caught Charlie at 70:21
in 2017-s9e30 that "talk to X" missed).

download_full_archive.py: SSH keepalive every 30s + per-file
retry-on-failure (up to 3 attempts with reconnect). An earlier run hung on a
dead connection at file 109 of 589 with no recovery; the restarted run is now
running at ~10 MB/s vs ~2-3 MB/s before.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
294 lines
12 KiB
Python
294 lines
12 KiB
Python
"""
|
|
Transcript-driven speaker name resolution.
|
|
|
|
Mike Swanson almost always announces who's about to speak — caller pickups
|
|
("let's talk to William"), guest intros ("we have Clay from the Nerd Junkies"),
|
|
co-host substitutions ("in Tara's place, we have Clay"), thank-yous on
|
|
caller close. These are deterministic ground-truth signals the audio-only
|
|
WavLM diarizer cannot use.
|
|
|
|
This module:
|
|
1. Extracts speaker introductions from a transcript.
|
|
2. Binds each intro to the next non-HOST diarization turn.
|
|
3. Returns named speaker turns, overriding incorrect cosine matches.
|
|
|
|
Pipeline order: run AFTER diarization, the resolved names override the
|
|
HOST/CO-HOST/CALLER/BUMPER labels with concrete people.
|
|
"""
|
|
from __future__ import annotations
|
|
import re
|
|
import json
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
# ── Introduction patterns ────────────────────────────────────────────────────
|
|
# Each: (regex, role_hint, name_group_index, optional affiliation_group)
|
|
# Patterns ordered most-specific first.
|
|
|
|
# Ordered table of introduction patterns. Each entry is a 4-tuple:
#   (compiled regex, role_hint, index of the capture group holding the
#    speaker's name, index of the capture group holding the affiliation
#    or None when the pattern captures no affiliation).
# Every pattern is compiled case-insensitively, so ``[A-Z][a-z]+`` matches a
# word in any casing; extract_intros() re-checks that the captured word
# actually starts with an uppercase letter.
_INTRO_PATTERNS: list[tuple[re.Pattern, str, int, int | None]] = [
    # "in Tara's place, we have Clay" — fill-in for absent co-host
    # Group 1 is the person being replaced, group 2 (the name group) is the
    # stand-in. ".?s" accepts any single character, or none, where the
    # apostrophe would be.
    (re.compile(
        r"in\s+([A-Z][a-z]+).?s\s+place,?\s+we\s+have\s+([A-Z][a-z]+)",
        re.I), "fillin", 2, None),

    # "we have <name> from <affiliation>" — guest intro
    # Group 2 is the affiliation: 3-31 word/space characters, lazily matched
    # up to punctuation or one of the listed trailing words.
    (re.compile(
        r"\bwe\s+have\s+([A-Z][a-z]+)\s+(?:from|with)\s+(?:the\s+)?([A-Z][\w\s]{2,30}?)(?:\.|,|;|\s+(?:on|here|joining|today|tonight|with))",
        re.I | re.MULTILINE), "guest", 1, 2),

    # "let's talk to <name>" / "let's go ahead and talk to <name>" — caller pickup
    # "let.s" tolerates any character in the apostrophe position.
    (re.compile(
        r"\blet.s\s+(?:go\s+ahead\s+and\s+)?(?:talk|go)\s+to\s+([A-Z][a-z]+)",
        re.I), "caller", 1, None),

    # "let's fit <name> in" / "let's bring <name> on" — caller pickup variants
    (re.compile(
        r"\blet.s\s+(?:go\s+ahead\s+and\s+)?(?:fit|bring|put)\s+([A-Z][a-z]+)\s+(?:in|on)\b",
        re.I), "caller", 1, None),

    # "Hello, <name>. How are you?" — caller pickup
    (re.compile(
        r"\bhello,?\s+([A-Z][a-z]+)\.?\s+(?:how\s+are\s+you|thanks\s+for|are\s+you\s+there|welcome)",
        re.I), "caller", 1, None),

    # "Hi <name>, welcome/how are you" — caller pickup variant
    (re.compile(
        r"\bhi\s+([A-Z][a-z]+),?\s+(?:welcome|how\s+are\s+you|thanks\s+for)",
        re.I), "caller", 1, None),

    # "thanks (so much) for the call, <name>" / "thanks for calling, <name>" — caller close
    # Require explicit "for the call/calling" (or "patience") — otherwise
    # greedy matching captures any capitalized word after "thanks"
    # (Continue, And, For, You, Man, etc.)
    (re.compile(
        r"\bthanks?(?:\s+so\s+much)?\s+for\s+(?:the\s+)?(?:call(?:ing)?|patience),?\s+([A-Z][a-z]+)\b",
        re.I), "caller_close", 1, None),

    # "joining us today/tonight (is|we have) <name>"
    (re.compile(
        r"\bjoining\s+(?:us|me)\s+(?:today|tonight|this\s+morning)?\s*(?:is|we\s+have)\s+([A-Z][a-z]+)",
        re.I), "guest", 1, None),
]
|
|
|
|
# Words that look like names but aren't real people we'd track.
|
|
# Includes show callouts, generic words, host self-references, common
|
|
# capitalized non-name words that survive mid-sentence in transcripts.
|
|
_NAME_BLACKLIST = {
|
|
"mike", "swanson", "computer", "guru", "windows", "office", "google",
|
|
"patreon", "tucson", "arizona", "facebook", "twitter", "youtube",
|
|
"what", "how", "now", "well", "okay", "right", "sure", "yeah", "yes",
|
|
"no", "thanks", "today", "tonight", "monday", "tuesday", "wednesday",
|
|
"thursday", "friday", "saturday", "sunday", "january", "february",
|
|
"march", "april", "may", "june", "july", "august", "september",
|
|
"october", "november", "december", "internet", "service", "feature",
|
|
"back", "show", "phone", "voice", "call", "kvoi", "kby", "ces",
|
|
"ipad", "iphone", "android", "samsung", "amazon", "intel", "amd",
|
|
"nvidia", "the", "and", "there", "but", "or", "so", "well",
|
|
"france", "germany", "japan", "china", "russia", "europe", "asia",
|
|
"africa", "elon", "musk", "trump", "biden", "obama", # public figures
|
|
"you", "for", "continue", "very", "buddy", "guys", "man", "god",
|
|
"everyone", "everybody", "anyone",
|
|
}
|
|
|
|
|
|
@dataclass
class SpeakerIntro:
    """A single spoken introduction found in the transcript.

    Fields:
        name: The introduced person's name.
        intro_time: End time of the transcript segment containing the
            intro; the introduced speaker's turn likely starts after it.
        role_hint: One of "caller", "guest", "fillin" or "caller_close".
        affiliation: Organisation mentioned with a guest intro, if any.
        fillin_for: For "in X's place, we have Y" intros, the name X of
            the person being replaced.
        source_text: The transcript text the intro was extracted from.
    """

    name: str
    intro_time: float
    role_hint: str
    affiliation: str | None = None
    fillin_for: str | None = None
    source_text: str = ""
|
|
|
|
|
|
def _is_blacklisted(name: str) -> bool:
    """Return True when *name* must not be treated as a speaker name.

    A candidate is rejected if its lowercase form is listed in
    ``_NAME_BLACKLIST`` or if it is shorter than three characters.
    """
    if name.lower() in _NAME_BLACKLIST:
        return True
    return len(name) < 3
|
|
|
|
|
|
def extract_intros(transcript_segments: list[dict]) -> list[SpeakerIntro]:
    """Walk transcript segments and extract speaker introduction events.

    Each segment is a dict read through the keys "text", "end" and "start".
    Every pattern in ``_INTRO_PATTERNS`` is tried against the segment text in
    table order, and each pattern contributes at most one intro per segment.
    A given name mention is claimed by the first pattern that accepts it, so
    a more specific pattern listed earlier is not shadowed by a more general
    one matching the same words (e.g. "in Tara's place, we have Clay from
    ..." yields only the fill-in intro, not an additional guest intro).

    Args:
        transcript_segments: Transcript segments in any order.

    Returns:
        Intros sorted by ``intro_time``. An intro is dropped when the
        previously kept intro has the same (name, role_hint) and lies within
        5 seconds; only consecutive duplicates are collapsed, so a
        re-introduction after a different speaker's intro is preserved.
    """
    raw: list[SpeakerIntro] = []
    for seg in transcript_segments:
        # Tolerate a missing or null "text" instead of crashing in finditer.
        text = seg.get("text") or ""
        # Prefer the segment end; fall back to start, then 0, so float()
        # below never receives None.
        end = seg.get("end") or seg.get("start") or 0
        # Character spans of name mentions already emitted for this segment.
        claimed_spans: list[tuple[int, int]] = []
        for pat, role, name_idx, affil_idx in _INTRO_PATTERNS:
            for m in pat.finditer(text):
                captured = m.group(name_idx)
                # Reject names that aren't capitalized in the source text —
                # eliminates mid-sentence lowercase matches like "and",
                # "there" that re.IGNORECASE picks up.
                if not captured or not captured[0].isupper():
                    continue
                name = captured[0].upper() + captured[1:].lower()
                if _is_blacklisted(name):
                    continue
                # Skip a mention an earlier (more specific) pattern already
                # produced an intro for; otherwise both intros share the same
                # time and the later, less specific one wins lookups that
                # take the last intro at a given time.
                name_start, name_end = m.span(name_idx)
                if any(name_start < taken_end and taken_start < name_end
                       for taken_start, taken_end in claimed_spans):
                    continue
                affiliation = None
                fillin_for = None
                if role == "guest" and affil_idx:
                    try:
                        affiliation = m.group(affil_idx).strip().rstrip(".,;").title()
                    except (IndexError, AttributeError):
                        # Group missing from the pattern or unmatched.
                        affiliation = None
                if role == "fillin":
                    # Group 1 of the fill-in pattern is the replaced person.
                    fillin_for = m.group(1).capitalize()
                claimed_spans.append((name_start, name_end))
                raw.append(SpeakerIntro(
                    name=name,
                    intro_time=float(end),
                    role_hint=role,
                    affiliation=affiliation,
                    fillin_for=fillin_for,
                    source_text=text.strip(),
                ))
                break  # at most one intro per pattern per segment

    # Deduplicate: collapse consecutive same (name, role) within a 5s window.
    raw.sort(key=lambda x: x.intro_time)
    deduped: list[SpeakerIntro] = []
    for intro in raw:
        if deduped:
            prev = deduped[-1]
            if (prev.name == intro.name
                    and prev.role_hint == intro.role_hint
                    and abs(intro.intro_time - prev.intro_time) <= 5.0):
                continue
        deduped.append(intro)
    return deduped
|
|
|
|
|
|
@dataclass
class NamedTurn:
    """A diarization turn, optionally annotated with a resolved name.

    Fields:
        speaker: Original diarizer label (HOST / CO-HOST / CALLER / BUMPER /
            UNKNOWN).
        name: Resolved person name, or None when unresolved.
        role_hint: Role of the intro that resolved the name ("caller",
            "guest", "fillin", ...), or None.
        start: Turn start time.
        end: Turn end time.
        confidence: Confidence value carried over from the diarization turn.
        intro_source: Transcript phrase that drove the resolution, if any.
    """

    speaker: str
    name: str | None
    role_hint: str | None
    start: float
    end: float
    confidence: float
    intro_source: str | None = None
|
|
|
|
|
|
# Allow an intro to bind to a turn that started slightly before it
# (Whisper segment boundaries vs diarizer turn boundaries don't always
# align; sometimes Mike's "let's talk to Kay" lands a few seconds after
# Kay's first audio frame). An opening intro is eligible for a lookup time
# when its intro_time is at most this many seconds later than that time.
_INTRO_FORWARD_TOLERANCE_S = 8.0
# For caller_close patterns, only bind if the close mention falls between
# the turn's end and this many seconds AFTER the turn ends (inclusive).
_CLOSE_LOOKAHEAD_S = 30.0
|
|
|
|
|
|
def resolve_speakers(diarization_turns: list[dict],
                     intros: list[SpeakerIntro]) -> list[NamedTurn]:
    """Attach speaker names to diarization turns using transcript intros.

    Turns labelled HOST, BUMPER or UNKNOWN are passed through unnamed. For
    every other turn:

      1. Pick the most recent opening intro (any role except
         "caller_close") whose time is no later than the turn start plus
         ``_INTRO_FORWARD_TOLERANCE_S``. There is no backward limit, so the
         latest intro stays active until a newer one replaces it.
      2. If no opening intro qualifies, pick the first "caller_close" intro
         that falls between the turn end and ``_CLOSE_LOOKAHEAD_S`` seconds
         after it; its role is reported as "caller".
      3. Otherwise the turn stays unresolved (name and role_hint are None).

    Each turn dict must provide "speaker", "start" and "end"; "confidence"
    defaults to 0.0. One NamedTurn is returned per input turn, in input
    order.
    """
    # Split intros into time-ordered openers and closers up front.
    openers = [item for item in intros if item.role_hint != "caller_close"]
    openers.sort(key=lambda item: item.intro_time)
    closers = [item for item in intros if item.role_hint == "caller_close"]
    closers.sort(key=lambda item: item.intro_time)

    resolved: list[NamedTurn] = []
    for turn in diarization_turns:
        label = turn["speaker"]
        t_start = turn["start"]
        t_end = turn["end"]
        score = turn.get("confidence", 0.0)

        match: SpeakerIntro | None = None
        if label not in ("HOST", "BUMPER", "UNKNOWN"):
            # Walk the openers in time order; the last one inside the
            # window is the active speaker.
            latest_allowed = t_start + _INTRO_FORWARD_TOLERANCE_S
            for candidate in openers:
                if not candidate.intro_time <= latest_allowed:
                    break
                match = candidate

            # Fallback: a closing "thanks for the call" shortly after the turn.
            if match is None:
                match = next(
                    (closer for closer in closers
                     if t_end <= closer.intro_time <= t_end + _CLOSE_LOOKAHEAD_S),
                    None,
                )

        if match is None:
            resolved.append(NamedTurn(
                speaker=label, name=None, role_hint=None,
                start=t_start, end=t_end, confidence=score,
            ))
            continue

        if match.role_hint == "caller_close":
            resolved_role = "caller"
        else:
            resolved_role = match.role_hint
        resolved.append(NamedTurn(
            speaker=label,
            name=match.name,
            role_hint=resolved_role,
            start=t_start, end=t_end,
            confidence=score,
            intro_source=match.source_text[:80],
        ))

    return resolved
|
|
|
|
|
|
def speaker_at(time: float, intros: list[SpeakerIntro]) -> SpeakerIntro | None:
    """Look up which opening intro is in effect at a single timestamp.

    "caller_close" intros are ignored. Among the remaining intros, the
    result is the latest one whose ``intro_time`` is no later than
    ``time + _INTRO_FORWARD_TOLERANCE_S``; None is returned when no intro
    qualifies. This mirrors the opening-intro lookup in resolve_speakers
    and is meant for attaching caller names to Q&A pairs by their
    question_start time.
    """
    timeline = [item for item in intros if item.role_hint != "caller_close"]
    timeline.sort(key=lambda item: item.intro_time)

    latest_allowed = time + _INTRO_FORWARD_TOLERANCE_S
    # Count the leading intros that fall inside the window; the last of
    # them is the active one.
    eligible = 0
    while eligible < len(timeline) and timeline[eligible].intro_time <= latest_allowed:
        eligible += 1
    if eligible == 0:
        return None
    return timeline[eligible - 1]
|
|
|
|
|
|
def named_speaker_summary(named_turns: list[NamedTurn], duration: float) -> dict[str, float]:
    """Total the speaking time per speaker for reporting.

    Named turns are keyed as "<role_hint>: <name>"; unnamed turns are keyed
    by their diarizer label. The result maps each key to the summed
    ``end - start`` of its turns, ordered from largest to smallest total.
    ``duration`` is accepted but not used by this function.
    """
    totals: dict[str, float] = {}
    for turn in named_turns:
        label = f"{turn.role_hint}: {turn.name}" if turn.name else turn.speaker
        elapsed = turn.end - turn.start
        totals[label] = totals.get(label, 0) + elapsed
    # Stable descending sort: equal totals keep first-seen order.
    ranked = sorted(totals.items(), key=lambda entry: entry[1], reverse=True)
    return dict(ranked)
|
|
|
|
|
|
def resolve_from_files(transcript_path: Path, diarization_path: Path) -> list[NamedTurn]:
    """Convenience wrapper: load JSONs, run extraction + resolution.

    Args:
        transcript_path: JSON file whose top-level "segments" list is passed
            to extract_intros (missing key is treated as an empty list).
        diarization_path: JSON file whose top-level "turns" list is passed
            to resolve_speakers (missing key is treated as an empty list).

    Returns:
        The NamedTurn list produced by resolve_speakers.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    # Read explicitly as UTF-8: without it open() uses the platform default
    # encoding, which can fail or mis-decode non-ASCII transcript text such
    # as typographic apostrophes.
    with open(transcript_path, encoding="utf-8") as f:
        tdata = json.load(f)
    with open(diarization_path, encoding="utf-8") as f:
        ddata = json.load(f)
    intros = extract_intros(tdata.get("segments", []))
    return resolve_speakers(ddata.get("turns", []), intros)
|