radio show: co-host voice profile, Q&A extraction fixes, archive index
- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike)
- diarizer.py: add CO-HOST speaker label for cohost-role profiles
- voice_profiler.py: emit "Cohost: <name>" label for cohost role
- qa_extractor.py: overlap resolution at load time (midpoint boundary split), 4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns), _preceded_by_caller_intro() helper, _PHONE_GREETING pattern, 751-1041 + "we'll get your problem solved" promo signatures
- benchmark.py: use src.transcriber.transcribe with batch_size=16
- add index_test_episodes.py and build_cohost_profile.py scripts
- add .gitignore (exclude episodes, transcripts, *.db, .venv)
- session log: 2026-04-27-qa-extraction-cohost-indexing.md

Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs. archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -53,10 +53,12 @@ _PROMO_SIGS: list[tuple[re.Pattern, int]] = [
|
||||
(re.compile(r"\bcomputer running slow\b", re.I), 1),
|
||||
(re.compile(r"\bafter these messages\b", re.I), 1),
|
||||
(re.compile(r"\b790.?2040\b", re.I), 1),
|
||||
(re.compile(r"\b751.?1041\b", re.I), 1),
|
||||
(re.compile(r"\bgurushow\.com\b", re.I), 1),
|
||||
(re.compile(r"\bcall in now\b", re.I), 1),
|
||||
(re.compile(r"\bcomputer troubles\?", re.I), 1),
|
||||
(re.compile(r"\bhardware installation\b", re.I), 1),
|
||||
(re.compile(r"we.?ll get your problem solved", re.I), 1),
|
||||
]
|
||||
|
||||
|
||||
@@ -127,10 +129,19 @@ def extract_qa_pairs(diarized_segments: list[dict]) -> list[QAPair]:
|
||||
if _is_promo_or_bumper(turn["text"]):
|
||||
i += 1
|
||||
continue
|
||||
# Skip the opening 90s — real callers never call before the show starts
|
||||
if turn["start"] < 90:
|
||||
i += 1
|
||||
continue
|
||||
q_duration = turn["end"] - turn["start"]
|
||||
if q_duration < MIN_QUESTION_DURATION:
|
||||
i += 1
|
||||
continue
|
||||
# Require caller-intro context: host must have introduced the call, OR
|
||||
# the caller opens with a phone greeting ("hello", "hi", "hey")
|
||||
if not _preceded_by_caller_intro(turns, i) and not _PHONE_GREETING.match(turn["text"].strip()):
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Look ahead for HOST answer turn(s)
|
||||
j = i + 1
|
||||
@@ -329,25 +340,71 @@ def load_diarized_transcript(transcript_path: Path,
|
||||
with open(diarization_path) as f:
|
||||
diarization = json.load(f)
|
||||
|
||||
turns = diarization.get("turns", [])
|
||||
raw_turns = diarization.get("turns", [])
|
||||
|
||||
def speaker_at(t: float) -> str:
|
||||
"""Find which diarization turn covers time t."""
|
||||
# Resolve overlapping boundaries left by the sliding-window diarizer:
|
||||
# place each transition at the midpoint of the overlap region.
|
||||
resolved: list[dict] = []
|
||||
for turn in sorted(raw_turns, key=lambda t: t["start"]):
|
||||
if not resolved:
|
||||
resolved.append(dict(turn))
|
||||
continue
|
||||
prev = resolved[-1]
|
||||
if turn["start"] < prev["end"]:
|
||||
mid = (turn["start"] + prev["end"]) / 2
|
||||
prev["end"] = mid
|
||||
resolved.append({**turn, "start": mid})
|
||||
else:
|
||||
resolved.append(dict(turn))
|
||||
turns = resolved
|
||||
|
||||
# Minimum CALLER coverage to label a transcript segment as CALLER.
|
||||
# Batch transcription produces ~25s segments; caller windows are 10s.
|
||||
# Require 4s of CALLER overlap so brief HOST-edge segments aren't over-claimed.
|
||||
_CALLER_MIN_S = 4.0
|
||||
|
||||
def speaker_for_segment(seg_start: float, seg_end: float) -> str:
|
||||
caller_cov = 0.0
|
||||
coverage: dict[str, float] = {}
|
||||
for turn in turns:
|
||||
if turn["start"] <= t <= turn["end"]:
|
||||
return turn["speaker"]
|
||||
return "UNKNOWN"
|
||||
overlap = min(seg_end, turn["end"]) - max(seg_start, turn["start"])
|
||||
if overlap <= 0:
|
||||
continue
|
||||
coverage[turn["speaker"]] = coverage.get(turn["speaker"], 0) + overlap
|
||||
if turn["speaker"] == "CALLER":
|
||||
caller_cov += overlap
|
||||
if not coverage:
|
||||
return "UNKNOWN"
|
||||
if caller_cov >= _CALLER_MIN_S:
|
||||
return "CALLER"
|
||||
return max(coverage, key=coverage.__getitem__)
|
||||
|
||||
return [
|
||||
{"start": s["start"], "end": s["end"],
|
||||
"text": s["text"],
|
||||
"speaker": speaker_at((s["start"] + s["end"]) / 2)}
|
||||
"speaker": speaker_for_segment(s["start"], s["end"])}
|
||||
for s in segments
|
||||
]
|
||||
|
||||
|
||||
# ── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Matches text that *begins* with a greeting word ("hello", "hi", "hey",
# "good morning/afternoon/evening"), case-insensitively. The pattern is
# anchored with ^ and ends with \b, so leading whitespace must be stripped
# by the caller and longer words such as "history" do not match.
_PHONE_GREETING = re.compile(r"^(hello|hi|hey|good (morning|afternoon|evening))\b", re.IGNORECASE)
|
||||
|
||||
|
||||
def _preceded_by_caller_intro(turns: list[dict], idx: int, max_host_turns: int = 2) -> bool:
|
||||
"""Return True if a preceding HOST turn (within max_host_turns HOST turns) contains a caller-intro phrase."""
|
||||
host_count = 0
|
||||
for j in range(idx - 1, -1, -1):
|
||||
if turns[j]["speaker"] == "HOST":
|
||||
if _CALLER_INTRO.search(turns[j]["text"]):
|
||||
return True
|
||||
host_count += 1
|
||||
if host_count >= max_host_turns:
|
||||
break
|
||||
return False
|
||||
|
||||
|
||||
def _looks_like_question(text: str) -> bool:
    """Return whether ``QUESTION_PATTERN.search`` finds a match anywhere in *text*."""
    hit = QUESTION_PATTERN.search(text)
    return hit is not None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user