radio show: co-host voice profile, Q&A extraction fixes, archive index
- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike) - diarizer.py: add CO-HOST speaker label for cohost-role profiles - voice_profiler.py: emit "Cohost: <name>" label for cohost role - qa_extractor.py: overlap resolution at load time (midpoint boundary split), 4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns), _preceded_by_caller_intro() helper, _PHONE_GREETING pattern, 751-1041 + "we'll get your problem solved" promo signatures - benchmark.py: use src.transcriber.transcribe with batch_size=16 - add index_test_episodes.py and build_cohost_profile.py scripts - add .gitignore (exclude episodes, transcripts, *.db, .venv) - session log: 2026-04-27-qa-extraction-cohost-indexing.md Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs. archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
25
projects/radio-show/audio-processor/.gitignore
vendored
Normal file
25
projects/radio-show/audio-processor/.gitignore
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
.venv/
|
||||
*.egg-info/
|
||||
|
||||
# Large data files
|
||||
test-data/episodes/
|
||||
test-data/transcripts/
|
||||
episodes/
|
||||
processed/
|
||||
|
||||
# Databases (regenerable)
|
||||
*.db
|
||||
*.sqlite
|
||||
|
||||
# Model cache
|
||||
.cache/
|
||||
*.pt
|
||||
*.bin
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
@@ -57,13 +57,15 @@ trans_results = []
|
||||
trans_total_audio = 0.0
|
||||
trans_total_wall = 0.0
|
||||
|
||||
import json
|
||||
from src.transcriber import transcribe as _transcribe
|
||||
|
||||
for ep in EPISODES:
|
||||
trans_ep_dir = TRANS_DIR / ep.stem
|
||||
trans_ep_dir.mkdir(parents=True, exist_ok=True)
|
||||
transcript_path = trans_ep_dir / "transcript.json"
|
||||
|
||||
if transcript_path.exists():
|
||||
import json
|
||||
with open(transcript_path) as f:
|
||||
td = json.load(f)
|
||||
dur = td.get("duration", 0)
|
||||
@@ -74,30 +76,15 @@ for ep in EPISODES:
|
||||
console.print(f" Transcribing {ep.name}...")
|
||||
t0 = time.monotonic()
|
||||
|
||||
from faster_whisper import WhisperModel
|
||||
if not hasattr(sys, "_whisper_model"):
|
||||
console.print(" [dim]Loading Whisper large-v3...[/dim]")
|
||||
sys._whisper_model = WhisperModel("large-v3", device=device, compute_type="float16")
|
||||
|
||||
model = sys._whisper_model
|
||||
segments_iter, info = model.transcribe(str(ep), language="en", beam_size=5)
|
||||
|
||||
import json
|
||||
segs = []
|
||||
for seg in segments_iter:
|
||||
segs.append({"id": seg.id, "start": seg.start, "end": seg.end, "text": seg.text})
|
||||
|
||||
duration = info.duration
|
||||
transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
|
||||
wall = time.monotonic() - t0
|
||||
rtf = duration / wall
|
||||
rtf = transcript.duration / wall
|
||||
|
||||
result = {"duration": duration, "language": "en", "segments": segs}
|
||||
with open(transcript_path, "w") as f:
|
||||
json.dump(result, f)
|
||||
transcript.save(trans_ep_dir)
|
||||
|
||||
console.print(f" [green]{ep.stem}: {duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
|
||||
trans_results.append((ep, transcript_path, duration, wall))
|
||||
trans_total_audio += duration
|
||||
console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
|
||||
trans_results.append((ep, transcript_path, transcript.duration, wall))
|
||||
trans_total_audio += transcript.duration
|
||||
trans_total_wall += wall
|
||||
|
||||
if trans_total_wall > 0:
|
||||
|
||||
115
projects/radio-show/audio-processor/build_cohost_profile.py
Normal file
115
projects/radio-show/audio-processor/build_cohost_profile.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""
|
||||
Build voice profile for Tom (co-host) from known co-host speech windows.
|
||||
|
||||
Uses CALLER-labeled windows from the first 60 min of co-host-era episodes,
|
||||
before any real callers would have called in.
|
||||
"""
|
||||
import os, sys
|
||||
os.environ["PYTHONIOENCODING"] = "utf-8"
|
||||
os.environ["TRANSFORMERS_OFFLINE"] = "1"
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
|
||||
from pathlib import Path
|
||||
import json
|
||||
import numpy as np
|
||||
from src.gpu import ensure_cuda_libs
|
||||
ensure_cuda_libs()
|
||||
|
||||
import torch
|
||||
from src.voice_profiler import VoiceProfiler, SpeakerProfile
|
||||
from rich.console import Console
|
||||
|
||||
console = Console()
|
||||
|
||||
BASE = Path(__file__).parent
|
||||
PROFILES_DIR = BASE / "voice-profiles"
|
||||
EPISODES_DIR = BASE / "test-data" / "episodes"
|
||||
TRANS_DIR = BASE / "test-data" / "transcripts"
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
console.print(f"Device: {device}")
|
||||
|
||||
profiler = VoiceProfiler(PROFILES_DIR, device=device)
|
||||
|
||||
# Tom's known speech windows per episode
|
||||
# CALLER turns from diarization that are in the first 60 min (before real callers)
|
||||
# Windows at 0-40s excluded (promo/jingle, not Tom's voice)
|
||||
TOM_WINDOWS = {
|
||||
"2014-s6e19.mp3": [
|
||||
(195, 260),
|
||||
(320, 425),
|
||||
(600, 650),
|
||||
(675, 710),
|
||||
],
|
||||
"2016-s8e43.mp3": [
|
||||
(100, 115),
|
||||
(135, 160),
|
||||
(270, 295),
|
||||
(575, 605),
|
||||
(1185, 1235),
|
||||
(1790, 1870),
|
||||
(2020, 2055),
|
||||
],
|
||||
}
|
||||
|
||||
COHOST_NAME = "Tom"

# Embedding geometry. These are loop-invariant, so they are defined once here
# rather than once per episode.
SAMPLE_RATE = 16000  # assumes _load_full_audio() yields 16 kHz samples — TODO confirm
chunk_s = 10.0
chunk_len = int(chunk_s)                     # chunk length in whole seconds
chunk_samples = int(chunk_s * SAMPLE_RATE)   # chunk length in samples


def _chunk_starts(win_start: int, win_end: int, chunk_len: int) -> range:
    """Return the start second of every full chunk inside [win_start, win_end].

    The stop is ``win_end - chunk_len + 1`` so that a chunk ending exactly at
    ``win_end`` is kept.  The previous exclusive stop (``win_end - chunk_len``)
    silently dropped the last chunk of any window whose length is an exact
    multiple of ``chunk_len`` (e.g. 600-650 produced 4 chunks instead of 5,
    and a 10 s window produced none).
    """
    return range(win_start, win_end - chunk_len + 1, chunk_len)


# Create an empty co-host profile the first time this script runs.
if COHOST_NAME not in profiler.profiles:
    profiler.profiles[COHOST_NAME] = SpeakerProfile(
        name=COHOST_NAME,
        role="cohost",
        embeddings=[],
        source_episodes=[],
    )

profile = profiler.profiles[COHOST_NAME]
console.print(f"\n[bold]Building co-host profile for: {COHOST_NAME}[/bold]")

for ep_name, windows in TOM_WINDOWS.items():
    ep_path = EPISODES_DIR / ep_name
    if not ep_path.exists():
        console.print(f"[yellow] Skipping {ep_name} — not found[/yellow]")
        continue

    console.print(f"\n Loading {ep_name}...")
    audio = profiler._load_full_audio(ep_path)
    # Called for its side effect only; presumably lazy-loads the embedding
    # model — verify against VoiceProfiler.
    profiler._get_model()

    for win_start, win_end in windows:
        for chunk_start in _chunk_starts(win_start, win_end, chunk_len):
            s = int(chunk_start * SAMPLE_RATE)
            e = s + chunk_samples
            if e > len(audio):
                # Window runs past the end of the audio; later chunks of this
                # window cannot fit either.
                break
            try:
                emb = profiler._embed_audio_np(audio[s:e])
                profile.embeddings.append(emb)
                console.print(f" [dim]+1 embedding @ {chunk_start}s[/dim]")
            except Exception as ex:
                # Best effort: one bad chunk must not abort the whole build.
                console.print(f" [red]Failed @ {chunk_start}s: {ex}[/red]")

    # Do not list the same episode twice if it is already recorded as a source.
    if ep_name not in profile.source_episodes:
        profile.source_episodes.append(ep_name)

if not profile.embeddings:
    console.print("[red]No embeddings collected — check episode paths[/red]")
    sys.exit(1)

profile.compute_composite()
console.print(f"\n[green]Tom profile built: {profile.num_samples} embeddings "
              f"from {len(profile.source_episodes)} episodes[/green]")

# Verify: check cosine similarity vs Mike to ensure separation
mike = profiler.profiles.get("Mike Swanson")
if mike and mike.composite_embedding is not None and profile.composite_embedding is not None:
    # The 1e-8 term guards against division by zero for a zero-norm vector.
    sim = float(np.dot(mike.composite_embedding, profile.composite_embedding) /
                (np.linalg.norm(mike.composite_embedding) * np.linalg.norm(profile.composite_embedding) + 1e-8))
    console.print(f"Tom vs Mike similarity: {sim:.3f} (lower is better separation)")

profiler.save_profiles()
console.print("[bold green]Profile saved.[/bold green]")
|
||||
102
projects/radio-show/audio-processor/index_test_episodes.py
Normal file
102
projects/radio-show/audio-processor/index_test_episodes.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""
|
||||
Index the 6 test episodes into archive.db.
|
||||
Reads pre-computed transcripts + diarization from test-data/transcripts/.
|
||||
"""
|
||||
import os, sys, re
|
||||
os.environ["PYTHONIOENCODING"] = "utf-8"
|
||||
os.environ["TRANSFORMERS_OFFLINE"] = "1"
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
|
||||
from pathlib import Path
|
||||
from src.indexer import ArchiveIndex
|
||||
from src.qa_extractor import load_diarized_transcript, extract_qa_pairs
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
console = Console()
|
||||
|
||||
BASE = Path(__file__).parent
|
||||
TRANS_DIR = BASE / "test-data" / "transcripts"
|
||||
EP_DIR = BASE / "test-data" / "episodes"
|
||||
DB_PATH = BASE / "archive.db"
|
||||
|
||||
_DATE_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})")
|
||||
|
||||
|
||||
def parse_episode_meta(ep_id: str) -> tuple[str, int | None]:
|
||||
"""Return (date_str_or_year, hr) from episode directory name."""
|
||||
m = _DATE_RE.match(ep_id)
|
||||
if m:
|
||||
date = m.group(1)
|
||||
hr = int(ep_id[-1]) if ep_id.endswith(("-hr1", "-hr2")) else None
|
||||
return date, hr
|
||||
# season/episode format e.g. 2016-s8e43 — use year only
|
||||
year = ep_id[:4]
|
||||
return year, None
|
||||
|
||||
|
||||
import json

console.print(f"\n[bold]Indexing test episodes into {DB_PATH.name}[/bold]")

with ArchiveIndex(DB_PATH) as idx:
    rows = []

    for ep_dir in sorted(TRANS_DIR.iterdir()):
        transcript_file = ep_dir / "transcript.json"
        diarization_file = ep_dir / "diarization.json"
        # Directories without a transcript are not indexable yet.
        if not transcript_file.exists():
            continue

        ep_id = ep_dir.name
        date, hr = parse_episode_meta(ep_id)

        # Episode duration comes from the transcript's top-level "duration" key.
        duration = json.loads(transcript_file.read_text()).get("duration", 0)

        # Register the episode itself.
        idx.add_episode(
            episode_id=ep_id,
            audio_path=EP_DIR / f"{ep_id}.mp3",
            date=date,
            duration=duration,
            hr=hr,
        )

        # Speaker-labelled segments; diarization is optional.
        diarization_arg = diarization_file if diarization_file.exists() else None
        segs = load_diarized_transcript(transcript_file, diarization_arg)
        idx.add_segments(ep_id, segs)

        # Q&A pairs derived from those segments.
        pairs = extract_qa_pairs(segs)
        for pair in pairs:
            idx.add_qa_pair(
                episode_id=ep_id,
                q_start=pair.question_start,
                q_end=pair.question_end,
                a_start=pair.answer_start,
                a_end=pair.answer_end,
                question=pair.question_text,
                answer=pair.answer_text,
                topic=pair.topic,
                tags=pair.topic_tags,
            )

        rows.append((ep_id, date, f"{duration:.0f}s", len(segs), len(pairs)))
        console.print(f" [green]{ep_id}[/green]: {len(segs)} segs, {len(pairs)} Q&A pairs")

    stats = idx.stats()

# Per-episode summary followed by database-wide totals.
table = Table(title="Index Summary")
for heading in ("Episode", "Date", "Duration", "Segments", "Q&A"):
    table.add_column(heading)
for row_id, row_date, row_dur, seg_count, qa_count in rows:
    table.add_row(row_id, row_date, row_dur, str(seg_count), str(qa_count))

console.print()
console.print(table)
console.print(f"\n[bold]DB totals:[/bold] {stats['episodes']} episodes, "
              f"{stats['segments']} segments, {stats['qa_pairs']} Q&A pairs")
console.print(f"[dim]DB path: {DB_PATH}[/dim]")
|
||||
@@ -202,6 +202,8 @@ def diarize(audio_path: str | Path,
|
||||
label = seg.speaker_label.split(" (")[0] # strip confidence score
|
||||
if label.startswith("Host:") or label.startswith("Host "):
|
||||
speaker = "HOST"
|
||||
elif label.startswith("Cohost:"):
|
||||
speaker = "CO-HOST"
|
||||
elif label == "[error]":
|
||||
speaker = "UNKNOWN"
|
||||
else:
|
||||
|
||||
@@ -53,10 +53,12 @@ _PROMO_SIGS: list[tuple[re.Pattern, int]] = [
|
||||
(re.compile(r"\bcomputer running slow\b", re.I), 1),
|
||||
(re.compile(r"\bafter these messages\b", re.I), 1),
|
||||
(re.compile(r"\b790.?2040\b", re.I), 1),
|
||||
(re.compile(r"\b751.?1041\b", re.I), 1),
|
||||
(re.compile(r"\bgurushow\.com\b", re.I), 1),
|
||||
(re.compile(r"\bcall in now\b", re.I), 1),
|
||||
(re.compile(r"\bcomputer troubles\?", re.I), 1),
|
||||
(re.compile(r"\bhardware installation\b", re.I), 1),
|
||||
(re.compile(r"we.?ll get your problem solved", re.I), 1),
|
||||
]
|
||||
|
||||
|
||||
@@ -127,10 +129,19 @@ def extract_qa_pairs(diarized_segments: list[dict]) -> list[QAPair]:
|
||||
if _is_promo_or_bumper(turn["text"]):
|
||||
i += 1
|
||||
continue
|
||||
# Skip the opening 90s — real callers never call before the show starts
|
||||
if turn["start"] < 90:
|
||||
i += 1
|
||||
continue
|
||||
q_duration = turn["end"] - turn["start"]
|
||||
if q_duration < MIN_QUESTION_DURATION:
|
||||
i += 1
|
||||
continue
|
||||
# Require caller-intro context: host must have introduced the call, OR
|
||||
# the caller opens with a phone greeting ("hello", "hi", "hey")
|
||||
if not _preceded_by_caller_intro(turns, i) and not _PHONE_GREETING.match(turn["text"].strip()):
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Look ahead for HOST answer turn(s)
|
||||
j = i + 1
|
||||
@@ -329,25 +340,71 @@ def load_diarized_transcript(transcript_path: Path,
|
||||
with open(diarization_path) as f:
|
||||
diarization = json.load(f)
|
||||
|
||||
turns = diarization.get("turns", [])
|
||||
raw_turns = diarization.get("turns", [])
|
||||
|
||||
def speaker_at(t: float) -> str:
|
||||
"""Find which diarization turn covers time t."""
|
||||
# Resolve overlapping boundaries left by the sliding-window diarizer:
|
||||
# place each transition at the midpoint of the overlap region.
|
||||
resolved: list[dict] = []
|
||||
for turn in sorted(raw_turns, key=lambda t: t["start"]):
|
||||
if not resolved:
|
||||
resolved.append(dict(turn))
|
||||
continue
|
||||
prev = resolved[-1]
|
||||
if turn["start"] < prev["end"]:
|
||||
mid = (turn["start"] + prev["end"]) / 2
|
||||
prev["end"] = mid
|
||||
resolved.append({**turn, "start": mid})
|
||||
else:
|
||||
resolved.append(dict(turn))
|
||||
turns = resolved
|
||||
|
||||
# Minimum CALLER coverage to label a transcript segment as CALLER.
|
||||
# Batch transcription produces ~25s segments; caller windows are 10s.
|
||||
# Require 4s of CALLER overlap so brief HOST-edge segments aren't over-claimed.
|
||||
_CALLER_MIN_S = 4.0
|
||||
|
||||
def speaker_for_segment(seg_start: float, seg_end: float) -> str:
|
||||
caller_cov = 0.0
|
||||
coverage: dict[str, float] = {}
|
||||
for turn in turns:
|
||||
if turn["start"] <= t <= turn["end"]:
|
||||
return turn["speaker"]
|
||||
overlap = min(seg_end, turn["end"]) - max(seg_start, turn["start"])
|
||||
if overlap <= 0:
|
||||
continue
|
||||
coverage[turn["speaker"]] = coverage.get(turn["speaker"], 0) + overlap
|
||||
if turn["speaker"] == "CALLER":
|
||||
caller_cov += overlap
|
||||
if not coverage:
|
||||
return "UNKNOWN"
|
||||
if caller_cov >= _CALLER_MIN_S:
|
||||
return "CALLER"
|
||||
return max(coverage, key=coverage.__getitem__)
|
||||
|
||||
return [
|
||||
{"start": s["start"], "end": s["end"],
|
||||
"text": s["text"],
|
||||
"speaker": speaker_at((s["start"] + s["end"]) / 2)}
|
||||
"speaker": speaker_for_segment(s["start"], s["end"])}
|
||||
for s in segments
|
||||
]
|
||||
|
||||
|
||||
# ── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
_PHONE_GREETING = re.compile(r"^(hello|hi|hey|good (morning|afternoon|evening))\b", re.IGNORECASE)
|
||||
|
||||
|
||||
def _preceded_by_caller_intro(turns: list[dict], idx: int, max_host_turns: int = 2) -> bool:
|
||||
"""Return True if a preceding HOST turn (within max_host_turns HOST turns) contains a caller-intro phrase."""
|
||||
host_count = 0
|
||||
for j in range(idx - 1, -1, -1):
|
||||
if turns[j]["speaker"] == "HOST":
|
||||
if _CALLER_INTRO.search(turns[j]["text"]):
|
||||
return True
|
||||
host_count += 1
|
||||
if host_count >= max_host_turns:
|
||||
break
|
||||
return False
|
||||
|
||||
|
||||
def _looks_like_question(text: str) -> bool:
    """Return True when ``QUESTION_PATTERN`` matches anywhere in *text*."""
    return QUESTION_PATTERN.search(text) is not None
|
||||
|
||||
|
||||
@@ -113,61 +113,60 @@ def _format_srt_time(seconds: float) -> str:
|
||||
|
||||
|
||||
def transcribe(audio_path: str | Path, model_size: str = "large-v3",
|
||||
language: str = "en", device: str = "cuda") -> Transcript:
|
||||
"""Transcribe an audio file using faster-whisper."""
|
||||
from faster_whisper import WhisperModel
|
||||
language: str = "en", device: str = "cuda",
|
||||
batch_size: int = 16) -> Transcript:
|
||||
"""Transcribe an audio file using faster-whisper.
|
||||
|
||||
Uses BatchedInferencePipeline + int8_float16 + VAD for archive/batch work.
|
||||
Word timestamps are skipped in batch mode (not needed for segment-level search).
|
||||
Pass batch_size=0 to fall back to sequential WhisperModel with word timestamps.
|
||||
"""
|
||||
from faster_whisper import WhisperModel, BatchedInferencePipeline
|
||||
|
||||
audio_path = Path(audio_path)
|
||||
use_batched = batch_size > 0
|
||||
|
||||
console.print(f"[bold]Transcribing:[/bold] {audio_path.name}")
|
||||
console.print(f"[dim]Model: {model_size}, Device: {device}[/dim]")
|
||||
console.print(
|
||||
f"[dim]Model: {model_size} | "
|
||||
f"{'batched x' + str(batch_size) + ' int8_float16' if use_batched else 'sequential float16'} | "
|
||||
f"Device: {device}[/dim]"
|
||||
)
|
||||
|
||||
if use_batched:
|
||||
base_model = WhisperModel(model_size, device=device, compute_type="int8_float16")
|
||||
model = BatchedInferencePipeline(model=base_model)
|
||||
segments_raw, info = model.transcribe(
|
||||
str(audio_path),
|
||||
language=language,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
else:
|
||||
model = WhisperModel(model_size, device=device, compute_type="float16")
|
||||
|
||||
segments_raw, info = model.transcribe(
|
||||
str(audio_path),
|
||||
language=language,
|
||||
word_timestamps=True,
|
||||
vad_filter=True,
|
||||
vad_parameters=dict(
|
||||
min_silence_duration_ms=500,
|
||||
speech_pad_ms=200,
|
||||
),
|
||||
vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
|
||||
)
|
||||
|
||||
console.print(f"[dim]Detected language: {info.language} "
|
||||
f"(probability: {info.language_probability:.2f})[/dim]")
|
||||
console.print(f"[dim]Duration: {info.duration:.1f}s "
|
||||
f"({info.duration / 60:.1f} min)[/dim]")
|
||||
console.print(f"[dim]Duration: {info.duration:.1f}s ({info.duration / 60:.1f} min)[/dim]")
|
||||
|
||||
segments = []
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TextColumn("{task.completed} segments"),
|
||||
TimeElapsedColumn(),
|
||||
console=console,
|
||||
) as progress:
|
||||
task = progress.add_task("Transcribing...", total=None)
|
||||
|
||||
for i, seg in enumerate(segments_raw):
|
||||
words = []
|
||||
if not use_batched:
|
||||
words = [
|
||||
TranscriptWord(
|
||||
word=w.word,
|
||||
start=w.start,
|
||||
end=w.end,
|
||||
probability=w.probability,
|
||||
)
|
||||
TranscriptWord(word=w.word, start=w.start,
|
||||
end=w.end, probability=w.probability)
|
||||
for w in (seg.words or [])
|
||||
]
|
||||
segments.append(TranscriptSegment(
|
||||
id=i,
|
||||
text=seg.text,
|
||||
start=seg.start,
|
||||
end=seg.end,
|
||||
words=words,
|
||||
id=i, text=seg.text, start=seg.start, end=seg.end, words=words,
|
||||
))
|
||||
progress.update(task, completed=i + 1)
|
||||
if i % 50 == 0:
|
||||
console.print(f"[dim] {i} segments... ({seg.end:.0f}s)[/dim]")
|
||||
|
||||
console.print(f"[green]Transcription complete: {len(segments)} segments[/green]")
|
||||
|
||||
|
||||
@@ -319,8 +319,11 @@ class VoiceProfiler:
|
||||
best_match = name
|
||||
|
||||
if best_score >= threshold:
|
||||
if best_match and self.profiles[best_match].role == "host":
|
||||
role = self.profiles[best_match].role if best_match else "unknown"
|
||||
if role == "host":
|
||||
label = f"Host: {best_match}"
|
||||
elif role == "cohost":
|
||||
label = f"Cohost: {best_match}"
|
||||
else:
|
||||
label = best_match
|
||||
else:
|
||||
|
||||
@@ -22,5 +22,13 @@
|
||||
"2018-s10e17.mp3",
|
||||
"2018-s10e21.mp3"
|
||||
]
|
||||
},
|
||||
"Tom": {
|
||||
"role": "cohost",
|
||||
"num_samples": 44,
|
||||
"source_episodes": [
|
||||
"2014-s6e19.mp3",
|
||||
"2016-s8e43.mp3"
|
||||
]
|
||||
}
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,251 @@
|
||||
# Session Log: Q&A Extraction — Co-Host Profile + Archive Indexing
|
||||
**Date:** 2026-04-27
|
||||
**Project:** Radio Show Archive Mining — Computer Guru Show
|
||||
|
||||
---
|
||||
|
||||
## User
|
||||
- **User:** Mike Swanson (mike)
|
||||
- **Machine:** DESKTOP-0O8A1RL
|
||||
- **Role:** admin
|
||||
|
||||
---
|
||||
|
||||
## Session Summary
|
||||
|
||||
The session began with resuming work following a benchmark run that demonstrated a significant performance improvement in Whisper transcription, achieving 63.8x real-time speed with batched inference and int8_float16 settings. Next, the focus shifted to evaluating the quality of Q&A extraction across six test episodes, revealing a critical issue with false positives due to co-host Tom being mislabeled as CALLER based on a voice similarity threshold.
|
||||
|
||||
A co-host voice profile for Tom was constructed using 44 embeddings from two specific episodes (2014-s6e19 and 2016-s8e43), producing a cosine similarity of 0.698 against Mike — well below Mike's 0.85 threshold, giving clean separation. Code was updated in `voice_profiler.py` and `diarizer.py` to correctly emit "Cohost: Tom" labels and map them to a new "CO-HOST" speaker tag. Re-diarizing the two co-host-era episodes dramatically cleaned up Q&A results: 2016 went from 12 false positives to 2 real WiFi caller pairs.
|
||||
|
||||
Several bugs in `qa_extractor.py` were fixed: overlap resolution for sliding-window diarization boundaries, CALLER-preference threshold for long batch transcript segments, and a turn-based caller-intro lookback to replace an ineffective 120s time window. Phone-greeting detection and new promo signatures were added. The final Q&A count landed at 10 pairs across 6 episodes, with 2014 correctly yielding 0 (gaming co-host episode with no actual callers).
|
||||
|
||||
`archive.db` was created with the ArchiveIndex schema (episodes, segments, segments_fts, qa_pairs, qa_fts). All 6 test episodes were indexed: 762 segments, 10 Q&A pairs. FTS5 search verified working for "router", "Windows 10", "Internet Explorer", "antivirus", and "connect" queries.
|
||||
|
||||
---
|
||||
|
||||
## Key Decisions
|
||||
|
||||
- **Co-host threshold uses same 0.85 bar as host**: Tom scores 0.698 vs Mike. Any voice >= 0.85 against Tom's composite gets labeled CO-HOST. Keeps the same single threshold for all profiles rather than per-profile thresholds.
|
||||
- **Turn-based lookback for caller-intro (2 HOST turns, not 120s)**: Long HOST monologue blocks (8-10 min) in big show segments meant time-based lookback missed the caller introduction. Previous 2 HOST turns always catches it regardless of block length.
|
||||
- **CALLER-preference at 4s minimum overlap**: Batch transcription produces ~26s segments; diarization CALLER windows are ~10s. Pure majority-vote always gave HOST. 4s minimum CALLER coverage labels the segment CALLER without being overly aggressive for co-host episodes.
|
||||
- **Midpoint boundary resolution at load time**: Rather than re-diarizing everything, the sliding-window overlap is resolved in `load_diarized_transcript()` so it applies retroactively to all saved diarization files without touching the JSON.
|
||||
- **751-1041 added as promo signal**: Earlier Tucson show number (vs 790-2040 in later seasons). Weighted 1 (needs a second semi-generic signal to filter).
|
||||
- **Tom's windows sourced from first 60 min of co-host episodes**: Real callers don't call in during the first hour of a 2-hour show (only exceptions: very end of show). First-hour CALLER windows are safely all Tom.
|
||||
|
||||
---
|
||||
|
||||
## Problems Encountered
|
||||
|
||||
- **2016-s8e43 had 12 Q&A pairs, 11 false positives**: Root cause was Tom (co-host) labeled CALLER throughout. Fixed by building Tom's voice profile and re-diarizing.
|
||||
- **2014-s6e19 had 2 Q&A pairs from gaming discussion**: Same co-host issue. After re-diarization: 0 pairs (correct — no actual callers in that gaming special).
|
||||
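- **2012-03-10 yielded 0 segments labeled CALLER**: Speakers were assigned from each transcript segment's midpoint, and where diarization turns overlapped, the first matching turn won — e.g. with HOST 0-20s and CALLER 15-30s, a segment whose midpoint is 15.1s lies in both turns and was labeled HOST. Fixed by overlap-preference assignment with a 4s CALLER minimum.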
|
||||
- **Real WiFi caller (2016, ~4794s) was missing after first fix attempt**: Aggressive time-based lookback (120s) combined with short CALLER turns from sliding-window diarization caused the caller question to land in a HOST segment. Fixed by turn-based lookback + co-host profile (eliminated Tom noise, letting real caller windows survive).
|
||||
- **2012-Jun pair at 1325s was a promo**: "The Computer Guru. We'll get your problem solved. Call 751-1041 today" passed promo filter. Fixed by adding 751-1041 and "we'll get your problem solved" as promo signatures.
|
||||
|
||||
---
|
||||
|
||||
## Files Created / Modified
|
||||
|
||||
### New files
|
||||
```
|
||||
projects/radio-show/audio-processor/build_cohost_profile.py
|
||||
projects/radio-show/audio-processor/index_test_episodes.py
|
||||
projects/radio-show/audio-processor/archive.db
|
||||
projects/radio-show/audio-processor/voice-profiles/tom/
|
||||
projects/radio-show/audio-processor/voice-profiles/profiles.json (updated: Tom added)
|
||||
projects/radio-show/session-logs/2026-04-27-qa-extraction-cohost-indexing.md (this file)
|
||||
```
|
||||
|
||||
### Modified
|
||||
```
|
||||
src/voice_profiler.py — emit "Cohost: <name>" label for cohost role
|
||||
src/diarizer.py — map "Cohost:" prefix to "CO-HOST" speaker
|
||||
src/qa_extractor.py — overlap resolution, CALLER-preference, turn-based
|
||||
caller-intro lookback, _preceded_by_caller_intro(),
|
||||
_PHONE_GREETING, 751-1041 + promo sig additions
|
||||
test-data/transcripts/2014-s6e19/diarization.json (re-diarized with Tom profile)
|
||||
test-data/transcripts/2016-s8e43/diarization.json (re-diarized with Tom profile)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Benchmark Results (from previous run — baseline for BEAST comparison)
|
||||
|
||||
**Machine:** DESKTOP-0O8A1RL — NVIDIA GeForce RTX 5070 Ti Laptop GPU
|
||||
|
||||
| Episode | Audio | Wall (diarize) | RTF |
|
||||
|---------|-------|----------------|-----|
|
||||
| 2011-03-12-hr1 | 2509s | 15.1s | 166.1x |
|
||||
| 2012-03-10-hr1 | 2634s | 12.2s | 215.5x |
|
||||
| 2012-06-09-hr1 | 2648s | 12.2s | 216.8x |
|
||||
| 2014-s6e19 | 2914s | 13.4s | 216.9x |
|
||||
| 2016-s8e43 | 5326s | 24.2s | 219.6x |
|
||||
| 2017-s9e30 | 5343s | 24.7s | 216.4x |
|
||||
| **TOTAL** | **21374s** | **101.9s** | **209.7x** |
|
||||
|
||||
Transcription (batched Whisper large-v3): 63.8x realtime
|
||||
Diarization: 209.7x realtime
|
||||
vs DESKTOP-0O8A1RL baseline (149.5x): **+60.2x (+40.3%)**
|
||||
|
||||
---
|
||||
|
||||
## Archive DB State
|
||||
|
||||
**Path:** `projects/radio-show/audio-processor/archive.db`
|
||||
|
||||
```
|
||||
Episodes : 6
|
||||
Segments : 762
|
||||
Q&A pairs: 10
|
||||
```
|
||||
|
||||
**Q&A pairs by episode:**
|
||||
| Episode | Pairs | Notes |
|
||||
|---------|-------|-------|
|
||||
| 2011-03-12-hr1 | 3 | IE lockout call, cloud computing, ghost hunting caller |
|
||||
| 2012-03-10-hr1 | 1 | iPad 3 discussion |
|
||||
| 2012-06-09-hr1 | 1 | Windows repair feature call |
|
||||
| 2014-s6e19 | 0 | Gaming co-host special — no actual callers |
|
||||
| 2016-s8e43 | 2 | WiFi connectivity caller (2 turns of same call) |
|
||||
| 2017-s9e30 | 3 | Software control, Cat5 cabling (Charlie), WiFi ports |
|
||||
|
||||
---
|
||||
|
||||
## Voice Profiles State
|
||||
|
||||
**Path:** `projects/radio-show/audio-processor/voice-profiles/`
|
||||
|
||||
| Name | Role | Embeddings | Source Episodes |
|
||||
|------|------|-----------|-----------------|
|
||||
| Mike Swanson | host | 180 | 9 episodes (2010-2018) |
|
||||
| Tom | cohost | 44 | 2014-s6e19, 2016-s8e43 |
|
||||
|
||||
Tom vs Mike cosine similarity: **0.698** (well-separated at 0.85 threshold)
|
||||
|
||||
**Tom's source windows used:**
|
||||
- 2014-s6e19: 195-260s, 320-425s, 600-650s, 675-710s
|
||||
- 2016-s8e43: 100-115s, 135-160s, 270-295s, 575-605s, 1185-1235s, 1790-1870s, 2020-2055s
|
||||
|
||||
---
|
||||
|
||||
## Co-Host Era Notes
|
||||
|
||||
Tom was the regular in-studio co-host/board-op roughly 2013-2016. His voice is in episodes from at least 2014 through 2016 (confirmed from test set). The 2011 and 2012 episodes are pure call-in format with no co-host.
|
||||
|
||||
If there are occasional guest co-hosts or fill-in hosts in other years, they would still be labeled CALLER until profiled. These would be rare and would likely not form question patterns that survive the caller-intro gate.
|
||||
|
||||
---
|
||||
|
||||
## Pending Tasks for BEAST (GURU-BEAST-ROG)
|
||||
|
||||
### 1. Run benchmark.py to establish RTX 4090 baseline
|
||||
|
||||
```bash
|
||||
cd D:/claudetools/projects/radio-show/audio-processor
|
||||
.venv/Scripts/python benchmark.py 2>&1 | tee bench-4090.txt
|
||||
```
|
||||
|
||||
BENCH_SETUP.md has all setup steps. The voice profiles are in `voice-profiles/` (already copied or available via Tailscale/robocopy from DESKTOP-0O8A1RL). Test episodes go in `test-data/episodes/`.
|
||||
|
||||
Expected: diarization RTF should be ~250-300x on RTX 4090 (vs 209.7x on laptop 5070 Ti). Transcription should be ~70-80x.
|
||||
|
||||
Update `benchmark.py` line 27 after measuring:
|
||||
```python
|
||||
BASELINE_RTF = 209.7 # current laptop 5070 Ti baseline
|
||||
```
|
||||
|
||||
### 2. Download full archive from IX server (172.16.3.10)
|
||||
|
||||
Use paramiko (SSH with key agent disabled):
|
||||
```python
|
||||
import paramiko
|
||||
ssh = paramiko.SSHClient()
|
||||
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
ssh.connect("172.16.3.10", username="gurushow", password="<from vault>",
|
||||
look_for_keys=False, allow_agent=False)
|
||||
```
|
||||
|
||||
Archive path: `/home/gurushow/public_html/archive/Radio/`
|
||||
Episode count: 579 MP3s across 2010-2018 (no 2013 season)
|
||||
Approximate total size: ~30-40 GB
|
||||
|
||||
Download script skeleton in prior session log: `2026-04-27-diarization-pipeline.md`
|
||||
|
||||
**Tailscale required** — IX server is at 172.16.3.10, requires VPN.
|
||||
|
||||
### 3. Full archive processing
|
||||
|
||||
Once episodes are downloaded:
|
||||
|
||||
```bash
|
||||
# Transcribe + diarize all episodes
|
||||
cd D:/claudetools/projects/radio-show/audio-processor
|
||||
.venv/Scripts/python diarize_training.py # or a new batch_process_all.py
|
||||
|
||||
# Index everything into archive.db
|
||||
.venv/Scripts/python index_test_episodes.py # modify to point at full episodes dir
|
||||
```
|
||||
|
||||
The pipeline is idempotent — `add_segments()` skips episodes already indexed.
|
||||
|
||||
### 4. Verify co-host era episodes
|
||||
|
||||
Co-host-era episodes (roughly 2013-2016; the archive has no 2013 season, so in practice 2014-2016) should now correctly separate Tom (CO-HOST) from actual callers. Spot-check a few 2015 episodes after processing to confirm Tom's profile generalizes well.
|
||||
|
||||
If any 2015/2016 episodes show too many CALLER turns that are clearly Tom (voice changed slightly over years), re-run `build_cohost_profile.py` with windows from that episode added to TOM_WINDOWS dict.
|
||||
|
||||
---
|
||||
|
||||
## Technical Reference
|
||||
|
||||
### Key thresholds
|
||||
|
||||
```python
|
||||
host_match_threshold = 0.85 # WavLM cosine similarity — applied to ALL profiles
|
||||
CALLER_MIN_S = 4.0 # min CALLER coverage in transcript segment to label CALLER
|
||||
PROMO_SCORE_THRESHOLD = 2 # weighted promo signature score
|
||||
MIN_QUESTION_DURATION = 5.0 # seconds
|
||||
MIN_ANSWER_DURATION = 15.0 # seconds
|
||||
MAX_GAP_BETWEEN_QA = 30.0 # seconds
|
||||
```
|
||||
|
||||
### Diarization sliding window
|
||||
|
||||
```python
|
||||
window_s = 10.0 # 10s embedding windows
|
||||
hop_s = 5.0 # 5s hop → overlapping boundaries (resolved at load time)
|
||||
```
|
||||
|
||||
### Transcription (batch mode)
|
||||
|
||||
```python
|
||||
model_size = "large-v3"
|
||||
compute_type = "int8_float16"
|
||||
batch_size = 16
|
||||
# No word timestamps in batch mode (not needed for search/diarization)
|
||||
```
|
||||
|
||||
### DB search examples
|
||||
|
||||
```python
|
||||
from src.indexer import ArchiveIndex
|
||||
from pathlib import Path
|
||||
|
||||
with ArchiveIndex(Path("archive.db")) as idx:
|
||||
# Segment search
|
||||
results = idx.search("router", limit=20)
|
||||
results = idx.search("Windows 10", speaker_filter="HOST", limit=10)
|
||||
|
||||
# Q&A search
|
||||
qa = idx.search_qa("antivirus", limit=10)
|
||||
qa = idx.search_qa("wifi connect", limit=10)
|
||||
```
|
||||
|
||||
### Archive server
|
||||
|
||||
```
|
||||
Host: 172.16.3.10 (requires Tailscale)
|
||||
User: gurushow
|
||||
Archive root: /home/gurushow/public_html/archive/Radio/
|
||||
SSH: paramiko with look_for_keys=False, allow_agent=False
|
||||
```
|
||||
Reference in New Issue
Block a user