claudetools/projects/radio-show/audio-processor/src/speaker_oracle.py

"""
Transcript-driven speaker name resolution.

Mike Swanson almost always announces who's about to speak — caller pickups
("let's talk to William"), guest intros ("we have Clay from the Nerd Junkies"),
co-host substitutions ("in Tara's place, we have Clay"), thank-yous on
caller close. These are deterministic ground-truth signals the audio-only
WavLM diarizer cannot use.

This module:
  1. Extracts speaker introductions from a transcript.
  2. Binds each diarization turn not labelled HOST, BUMPER or UNKNOWN to
     the most recent intro at or before it.
  3. Returns named speaker turns, overriding incorrect cosine matches.

Pipeline order: run AFTER diarization; the resolved names are attached
alongside the diarizer's labels (e.g. CO-HOST, CALLER) to identify concrete
people.
"""
from __future__ import annotations
import re
import json
from dataclasses import dataclass, field
from pathlib import Path

# ── Introduction patterns ────────────────────────────────────────────────────
# Each entry: (regex, role_hint, name_group_index, optional affiliation_group).
# Patterns ordered most-specific first.
#
# Every pattern is compiled with re.I, so `[A-Z][a-z]+` also matches words
# that are entirely lowercase; extract_intros() re-checks that the captured
# name starts with an uppercase letter in the source text.
# `.` is used where an apostrophe is expected ("let.s", ".?s"); it matches
# any single character, so straight and curly apostrophes both work.

_INTRO_PATTERNS: list[tuple[re.Pattern, str, int, int | None]] = [
    # "in Tara's place, we have Clay" — fill-in for absent co-host.
    # Group 1 is the absent person, group 2 the fill-in (the tracked name).
    # NOTE(review): no leading \b, so "in" can also match the tail of a
    # longer word.
    (re.compile(
        r"in\s+([A-Z][a-z]+).?s\s+place,?\s+we\s+have\s+([A-Z][a-z]+)",
        re.I), "fillin", 2, None),

    # "we have <name> from/with (the) <affiliation>" — guest intro.
    # The affiliation is captured lazily (3-31 chars) up to punctuation or a
    # trailing word such as "on", "here", "joining", "today", "tonight", "with".
    # NOTE(review): re.MULTILINE has no effect here — the pattern has no
    # ^ or $ anchors.
    (re.compile(
        r"\bwe\s+have\s+([A-Z][a-z]+)\s+(?:from|with)\s+(?:the\s+)?([A-Z][\w\s]{2,30}?)(?:\.|,|;|\s+(?:on|here|joining|today|tonight|with))",
        re.I | re.MULTILINE), "guest", 1, 2),

    # "let's talk to <name>" / "let's go to <name>" /
    # "let's go ahead and talk to <name>" — caller pickup
    (re.compile(
        r"\blet.s\s+(?:go\s+ahead\s+and\s+)?(?:talk|go)\s+to\s+([A-Z][a-z]+)",
        re.I), "caller", 1, None),

    # "let's fit <name> in" / "let's bring <name> on" / "let's put <name> on"
    # — caller pickup variants
    (re.compile(
        r"\blet.s\s+(?:go\s+ahead\s+and\s+)?(?:fit|bring|put)\s+([A-Z][a-z]+)\s+(?:in|on)\b",
        re.I), "caller", 1, None),

    # "Hello, <name>. How are you?" (also "thanks for", "are you there",
    # "welcome" as the follow-up) — caller pickup
    (re.compile(
        r"\bhello,?\s+([A-Z][a-z]+)\.?\s+(?:how\s+are\s+you|thanks\s+for|are\s+you\s+there|welcome)",
        re.I), "caller", 1, None),

    # "Hi <name>, welcome/how are you/thanks for" — caller pickup variant
    (re.compile(
        r"\bhi\s+([A-Z][a-z]+),?\s+(?:welcome|how\s+are\s+you|thanks\s+for)",
        re.I), "caller", 1, None),

    # "thanks (so much) for the call, <name>" / "thanks for calling, <name>" /
    # "thanks for the patience, <name>" — caller close.
    # Require an explicit "for the call/calling/patience" — otherwise greedy
    # matching captures any capitalized word after "thanks" (Continue, And,
    # For, You, Man, etc.)
    (re.compile(
        r"\bthanks?(?:\s+so\s+much)?\s+for\s+(?:the\s+)?(?:call(?:ing)?|patience),?\s+([A-Z][a-z]+)\b",
        re.I), "caller_close", 1, None),

    # "joining us/me (today|tonight|this morning) is <name>" /
    # "joining us ... we have <name>" — guest intro without an affiliation
    (re.compile(
        r"\bjoining\s+(?:us|me)\s+(?:today|tonight|this\s+morning)?\s*(?:is|we\s+have)\s+([A-Z][a-z]+)",
        re.I), "guest", 1, None),
]

# Words that look like names but aren't real people we'd track.
# Includes show callouts, generic words, host self-references, common
# capitalized non-name words that survive mid-sentence in transcripts.
_NAME_BLACKLIST = {
    "mike", "swanson", "computer", "guru", "windows", "office", "google",
    "patreon", "tucson", "arizona", "facebook", "twitter", "youtube",
    "what", "how", "now", "well", "okay", "right", "sure", "yeah", "yes",
    "no", "thanks", "today", "tonight", "monday", "tuesday", "wednesday",
    "thursday", "friday", "saturday", "sunday", "january", "february",
    "march", "april", "may", "june", "july", "august", "september",
    "october", "november", "december", "internet", "service", "feature",
    "back", "show", "phone", "voice", "call", "kvoi", "kby", "ces",
    "ipad", "iphone", "android", "samsung", "amazon", "intel", "amd",
    "nvidia", "the", "and", "there", "but", "or", "so", "well",
    "france", "germany", "japan", "china", "russia", "europe", "asia",
    "africa", "elon", "musk", "trump", "biden", "obama",  # public figures
    "you", "for", "continue", "very", "buddy", "guys", "man", "god",
    "everyone", "everybody", "anyone",
}


@dataclass
class SpeakerIntro:
    """A single spoken introduction of a person, detected in the transcript."""
    name: str  # captured name, normalized to leading-capital form
    intro_time: float  # transcript segment end time — speaker turn likely starts after this
    role_hint: str  # "caller", "guest", "fillin", "caller_close"
    affiliation: str | None = None  # guest intros only: title-cased text captured after "from"/"with"
    fillin_for: str | None = None  # for "in X's place, we have Y" — X
    source_text: str = ""  # stripped text of the transcript segment that matched


def _is_blacklisted(name: str) -> bool:
    """Return True when *name* should not be treated as a person's name.

    A candidate is rejected if it is shorter than three characters or if its
    lowercase form appears in ``_NAME_BLACKLIST``.
    """
    if len(name) < 3:
        return True
    return name.lower() in _NAME_BLACKLIST


def _first_intro_in_segment(seg: dict) -> SpeakerIntro | None:
    """Return the highest-priority acceptable intro in one segment, or None.

    Patterns are tried in ``_INTRO_PATTERNS`` order (most specific first) and,
    within a pattern, matches are tried left to right. The first match whose
    captured name is capitalized in the source text and not blacklisted wins.
    """
    text = seg.get("text") or ""
    # Intro time is the segment end; fall back to its start, then to 0.
    end = seg.get("end") or seg.get("start") or 0
    for pat, role, name_idx, affil_idx in _INTRO_PATTERNS:
        for m in pat.finditer(text):
            captured = m.group(name_idx)
            # The patterns are case-insensitive, so reject names that are not
            # capitalized in the source text — eliminates mid-sentence
            # lowercase matches like "and", "there".
            if not captured or not captured[0].isupper():
                continue
            name = captured[0].upper() + captured[1:].lower()
            if _is_blacklisted(name):
                continue
            affiliation = None
            if role == "guest" and affil_idx:
                try:
                    affiliation = m.group(affil_idx).strip().rstrip(".,;").title()
                except (IndexError, AttributeError):
                    # Pattern has no such group, or the group did not take
                    # part in the match: leave the affiliation unset.
                    affiliation = None
            fillin_for = m.group(1).capitalize() if role == "fillin" else None
            return SpeakerIntro(
                name=name,
                intro_time=float(end),
                role_hint=role,
                affiliation=affiliation,
                fillin_for=fillin_for,
                source_text=text.strip(),
            )
    return None


def extract_intros(transcript_segments: list[dict], *,
                   dedup_window_s: float = 5.0) -> list[SpeakerIntro]:
    """Walk transcript segments and extract speaker introduction events.

    At most one intro is produced per segment: the first acceptable match in
    pattern-priority order. Previously a phrase that matched several patterns
    (e.g. a fill-in intro that also reads as a guest intro) produced one intro
    per pattern, all with the same timestamp.

    Args:
        transcript_segments: dicts read via the keys ``"text"``, ``"end"``
            and ``"start"``; missing or ``None`` values are tolerated.
        dedup_window_s: consecutive intros (after sorting by time) with the
            same ``(name, role_hint)`` that lie within this many seconds of
            the last kept one are dropped.

    Returns:
        Intros sorted by ``intro_time``. Only *adjacent* duplicates are
        collapsed; a repeat that follows a different intro is kept, because
        resolve_speakers() treats the latest intro as the active speaker.
    """
    raw: list[SpeakerIntro] = []
    for seg in transcript_segments:
        intro = _first_intro_in_segment(seg)
        if intro is not None:
            raw.append(intro)

    # Deduplicate: collapse same (name, role) runs within the window.
    raw.sort(key=lambda x: x.intro_time)
    deduped: list[SpeakerIntro] = []
    for intro in raw:
        if deduped:
            prev = deduped[-1]
            if (prev.name == intro.name
                    and prev.role_hint == intro.role_hint
                    and intro.intro_time - prev.intro_time <= dedup_window_s):
                continue
        deduped.append(intro)
    return deduped


@dataclass
class NamedTurn:
    """A diarization turn, plus the speaker name resolved for it (if any)."""
    speaker: str   # original diarizer label (HOST / CO-HOST / CALLER / BUMPER / UNKNOWN)
    name: str | None  # resolved name, or None
    role_hint: str | None  # "caller" / "guest" / "fillin" / etc.
    start: float  # turn start, copied from the diarization turn
    end: float  # turn end, copied from the diarization turn
    confidence: float  # diarizer confidence for the turn (0.0 when the turn has none)
    intro_source: str | None = None  # transcript phrase that drove the resolution


# Allow an intro to bind to a turn that started slightly before it
# (Whisper segment boundaries vs diarizer turn boundaries don't always
# align; sometimes Mike's "let's talk to Kay" lands a few seconds after
# Kay's first audio frame). In seconds; added to the turn start (or query
# time) when selecting the latest opening intro.
_INTRO_FORWARD_TOLERANCE_S = 8.0
# For caller_close patterns, only bind if the close mention is within
# this many seconds AFTER the turn ends. Only consulted when no opening
# intro applies to the turn.
_CLOSE_LOOKAHEAD_S = 30.0


def resolve_speakers(diarization_turns: list[dict],
                     intros: list[SpeakerIntro]) -> list[NamedTurn]:
    """Attach speaker names to diarization turns.

    Turns labelled HOST, BUMPER or UNKNOWN are passed through unnamed. For
    every other turn:
      1. Pick the LATEST opening intro (any role except "caller_close")
         whose time is no later than the turn start plus
         ``_INTRO_FORWARD_TOLERANCE_S``. There is no lower time bound — the
         most recent intro stays active until another one replaces it.
      2. If there is none, pick the first "caller_close" intro that falls
         between the turn end and ``_CLOSE_LOOKAHEAD_S`` seconds after it;
         the turn is then reported with role "caller".
      3. Otherwise the turn is left unnamed.

    Args:
        diarization_turns: dicts with "speaker", "start", "end" and an
            optional "confidence" (defaults to 0.0).
        intros: intros as produced by extract_intros(); order is irrelevant.

    Returns:
        One NamedTurn per input turn, in input order.
    """
    # Split into opening / closing intros, each ordered by time.
    openers: list[SpeakerIntro] = []
    closers: list[SpeakerIntro] = []
    for item in intros:
        bucket = closers if item.role_hint == "caller_close" else openers
        bucket.append(item)
    openers.sort(key=lambda i: i.intro_time)
    closers.sort(key=lambda i: i.intro_time)

    resolved: list[NamedTurn] = []
    for turn in diarization_turns:
        label = turn["speaker"]
        t_start = turn["start"]
        t_end = turn["end"]
        conf = turn.get("confidence", 0.0)

        chosen: SpeakerIntro | None = None
        if label not in ("HOST", "BUMPER", "UNKNOWN"):
            # Scan from the newest opener backwards; the first one that is
            # early enough is the latest applicable intro.
            latest_allowed = t_start + _INTRO_FORWARD_TOLERANCE_S
            chosen = next(
                (o for o in reversed(openers)
                 if o.intro_time <= latest_allowed),
                None,
            )
            if chosen is None:
                # Fall back to a close mention shortly AFTER the turn ends.
                chosen = next(
                    (c for c in closers
                     if t_end <= c.intro_time <= t_end + _CLOSE_LOOKAHEAD_S),
                    None,
                )

        if chosen is None:
            resolved.append(NamedTurn(
                speaker=label, name=None, role_hint=None,
                start=t_start, end=t_end, confidence=conf,
            ))
            continue

        if chosen.role_hint == "caller_close":
            role = "caller"
        else:
            role = chosen.role_hint
        resolved.append(NamedTurn(
            speaker=label,
            name=chosen.name,
            role_hint=role,
            start=t_start, end=t_end,
            confidence=conf,
            intro_source=chosen.source_text[:80],
        ))

    return resolved


def speaker_at(time: float, intros: list[SpeakerIntro]) -> SpeakerIntro | None:
    """Return the opening intro that is active at *time*, or None.

    "Active" means the latest intro (any role except "caller_close") whose
    time is no later than ``time + _INTRO_FORWARD_TOLERANCE_S``. This is the
    same selection resolve_speakers() makes for a turn start, exposed for a
    single timestamp so caller names can be attached to Q&A pairs by their
    question_start time.
    """
    openers = [i for i in intros if i.role_hint != "caller_close"]
    openers.sort(key=lambda i: i.intro_time)
    latest_allowed = time + _INTRO_FORWARD_TOLERANCE_S
    # Newest first: the first intro that is early enough is the answer.
    for candidate in reversed(openers):
        if candidate.intro_time <= latest_allowed:
            return candidate
    return None


def named_speaker_summary(named_turns: list[NamedTurn], duration: float) -> dict[str, float]:
    """Total the speaking time of each speaker for reporting.

    Named turns are keyed as ``"<role_hint>: <name>"``; unnamed turns are
    keyed by their diarizer label. The result is ordered by descending total
    time, with ties kept in first-seen order.

    Note: ``duration`` is accepted but not used by the computation.
    """
    totals: dict[str, float] = {}
    for turn in named_turns:
        label = f"{turn.role_hint}: {turn.name}" if turn.name else turn.speaker
        totals.setdefault(label, 0)
        totals[label] += turn.end - turn.start
    ranked = sorted(totals.items(), key=lambda kv: kv[1], reverse=True)
    return dict(ranked)


def resolve_from_files(transcript_path: Path, diarization_path: Path) -> list[NamedTurn]:
    """Convenience wrapper: load both JSON files, then extract and resolve.

    Args:
        transcript_path: JSON file whose top-level "segments" list is passed
            to extract_intros() (an empty list is used when the key is absent).
        diarization_path: JSON file whose top-level "turns" list is passed to
            resolve_speakers() (an empty list is used when the key is absent).

    Returns:
        The NamedTurn list produced by resolve_speakers().

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if either file is not valid JSON.
    """
    # Read as UTF-8 explicitly: open() otherwise uses the locale's encoding,
    # which mis-decodes non-ASCII transcript text on non-UTF-8 locales.
    with open(transcript_path, encoding="utf-8") as f:
        transcript = json.load(f)
    with open(diarization_path, encoding="utf-8") as f:
        diarization = json.load(f)
    intros = extract_intros(transcript.get("segments", []))
    return resolve_speakers(diarization.get("turns", []), intros)