diff --git a/projects/radio-show/audio-processor/.gitignore b/projects/radio-show/audio-processor/.gitignore new file mode 100644 index 0000000..029d309 --- /dev/null +++ b/projects/radio-show/audio-processor/.gitignore @@ -0,0 +1,25 @@ +# Python +__pycache__/ +*.pyc +*.pyo +.venv/ +*.egg-info/ + +# Large data files +test-data/episodes/ +test-data/transcripts/ +episodes/ +processed/ + +# Databases (regenerable) +*.db +*.sqlite + +# Model cache +.cache/ +*.pt +*.bin + +# OS +.DS_Store +Thumbs.db diff --git a/projects/radio-show/audio-processor/benchmark.py b/projects/radio-show/audio-processor/benchmark.py index 000470d..650af31 100644 --- a/projects/radio-show/audio-processor/benchmark.py +++ b/projects/radio-show/audio-processor/benchmark.py @@ -57,13 +57,15 @@ trans_results = [] trans_total_audio = 0.0 trans_total_wall = 0.0 +import json +from src.transcriber import transcribe as _transcribe + for ep in EPISODES: trans_ep_dir = TRANS_DIR / ep.stem trans_ep_dir.mkdir(parents=True, exist_ok=True) transcript_path = trans_ep_dir / "transcript.json" if transcript_path.exists(): - import json with open(transcript_path) as f: td = json.load(f) dur = td.get("duration", 0) @@ -74,30 +76,15 @@ for ep in EPISODES: console.print(f" Transcribing {ep.name}...") t0 = time.monotonic() - from faster_whisper import WhisperModel - if not hasattr(sys, "_whisper_model"): - console.print(" [dim]Loading Whisper large-v3...[/dim]") - sys._whisper_model = WhisperModel("large-v3", device=device, compute_type="float16") - - model = sys._whisper_model - segments_iter, info = model.transcribe(str(ep), language="en", beam_size=5) - - import json - segs = [] - for seg in segments_iter: - segs.append({"id": seg.id, "start": seg.start, "end": seg.end, "text": seg.text}) - - duration = info.duration + transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16) wall = time.monotonic() - t0 - rtf = duration / wall + rtf = transcript.duration / wall - result = {"duration": 
duration, "language": "en", "segments": segs} - with open(transcript_path, "w") as f: - json.dump(result, f) + transcript.save(trans_ep_dir) - console.print(f" [green]{ep.stem}: {duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]") - trans_results.append((ep, transcript_path, duration, wall)) - trans_total_audio += duration + console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]") + trans_results.append((ep, transcript_path, transcript.duration, wall)) + trans_total_audio += transcript.duration trans_total_wall += wall if trans_total_wall > 0: diff --git a/projects/radio-show/audio-processor/build_cohost_profile.py b/projects/radio-show/audio-processor/build_cohost_profile.py new file mode 100644 index 0000000..50c2d2b --- /dev/null +++ b/projects/radio-show/audio-processor/build_cohost_profile.py @@ -0,0 +1,115 @@ +""" +Build voice profile for Tom (co-host) from known co-host speech windows. + +Uses CALLER-labeled windows from the first 60 min of co-host-era episodes, +before any real callers would have called in. 
+""" +import os, sys +os.environ["PYTHONIOENCODING"] = "utf-8" +os.environ["TRANSFORMERS_OFFLINE"] = "1" +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8") + +from pathlib import Path +import json +import numpy as np +from src.gpu import ensure_cuda_libs +ensure_cuda_libs() + +import torch +from src.voice_profiler import VoiceProfiler, SpeakerProfile +from rich.console import Console + +console = Console() + +BASE = Path(__file__).parent +PROFILES_DIR = BASE / "voice-profiles" +EPISODES_DIR = BASE / "test-data" / "episodes" +TRANS_DIR = BASE / "test-data" / "transcripts" + +device = "cuda" if torch.cuda.is_available() else "cpu" +console.print(f"Device: {device}") + +profiler = VoiceProfiler(PROFILES_DIR, device=device) + +# Tom's known speech windows per episode +# CALLER turns from diarization that are in the first 60 min (before real callers) +# Windows at 0-40s excluded (promo/jingle, not Tom's voice) +TOM_WINDOWS = { + "2014-s6e19.mp3": [ + (195, 260), + (320, 425), + (600, 650), + (675, 710), + ], + "2016-s8e43.mp3": [ + (100, 115), + (135, 160), + (270, 295), + (575, 605), + (1185, 1235), + (1790, 1870), + (2020, 2055), + ], +} + +COHOST_NAME = "Tom" + +# Rebuild from scratch on every run so a re-run never appends duplicate embeddings or source episodes +profiler.profiles[COHOST_NAME] = SpeakerProfile( + name=COHOST_NAME, + role="cohost", + embeddings=[], + source_episodes=[], +) + +profile = profiler.profiles[COHOST_NAME] +console.print(f"\n[bold]Building co-host profile for: {COHOST_NAME}[/bold]") + +for ep_name, windows in TOM_WINDOWS.items(): + ep_path = EPISODES_DIR / ep_name + if not ep_path.exists(): + console.print(f"[yellow] Skipping {ep_name} — not found[/yellow]") + continue + + console.print(f"\n Loading {ep_name}...") + audio = profiler._load_full_audio(ep_path) + profiler._get_model() + + SAMPLE_RATE = 16000 + chunk_s = 10.0 + chunk_samples = int(chunk_s * SAMPLE_RATE) + + for win_start, win_end in windows: # range stop is exclusive: +1 keeps the last full chunk when the window is an exact multiple of chunk_s + for chunk_start in range(win_start, win_end - int(chunk_s) + 1, 
int(chunk_s)): + chunk_end = chunk_start + int(chunk_s) + s = int(chunk_start * SAMPLE_RATE) + e = s + chunk_samples + if e > len(audio): + break + try: + emb = profiler._embed_audio_np(audio[s:e]) + profile.embeddings.append(emb) + console.print(f" [dim]+1 embedding @ {chunk_start}s[/dim]") + except Exception as ex: + console.print(f" [red]Failed @ {chunk_start}s: {ex}[/red]") + + profile.source_episodes.append(ep_name) + +if not profile.embeddings: + console.print("[red]No embeddings collected — check episode paths[/red]") + sys.exit(1) + +profile.compute_composite() +console.print(f"\n[green]Tom profile built: {profile.num_samples} embeddings " + f"from {len(profile.source_episodes)} episodes[/green]") + +# Verify: check cosine similarity vs Mike to ensure separation +mike = profiler.profiles.get("Mike Swanson") +if mike and mike.composite_embedding is not None and profile.composite_embedding is not None: + sim = float(np.dot(mike.composite_embedding, profile.composite_embedding) / + (np.linalg.norm(mike.composite_embedding) * np.linalg.norm(profile.composite_embedding) + 1e-8)) + console.print(f"Tom vs Mike similarity: {sim:.3f} (lower is better separation)") + +profiler.save_profiles() +console.print("[bold green]Profile saved.[/bold green]") diff --git a/projects/radio-show/audio-processor/index_test_episodes.py b/projects/radio-show/audio-processor/index_test_episodes.py new file mode 100644 index 0000000..6e96441 --- /dev/null +++ b/projects/radio-show/audio-processor/index_test_episodes.py @@ -0,0 +1,102 @@ +""" +Index the 6 test episodes into archive.db. +Reads pre-computed transcripts + diarization from test-data/transcripts/. 
+""" +import os, sys, re +os.environ["PYTHONIOENCODING"] = "utf-8" +os.environ["TRANSFORMERS_OFFLINE"] = "1" +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8") + +from pathlib import Path +from src.indexer import ArchiveIndex +from src.qa_extractor import load_diarized_transcript, extract_qa_pairs +from rich.console import Console +from rich.table import Table + +console = Console() + +BASE = Path(__file__).parent +TRANS_DIR = BASE / "test-data" / "transcripts" +EP_DIR = BASE / "test-data" / "episodes" +DB_PATH = BASE / "archive.db" + +_DATE_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})") + + +def parse_episode_meta(ep_id: str) -> tuple[str, int | None]: + """Return (date_str_or_year, hr) from episode directory name.""" + m = _DATE_RE.match(ep_id) + if m: + date = m.group(1) + hr = int(ep_id[-1]) if ep_id.endswith(("-hr1", "-hr2")) else None + return date, hr + # season/episode format e.g. 2016-s8e43 — use year only + year = ep_id[:4] + return year, None + + +console.print(f"\n[bold]Indexing test episodes into {DB_PATH.name}[/bold]") + +with ArchiveIndex(DB_PATH) as idx: + rows = [] + + for ep_dir in sorted(TRANS_DIR.iterdir()): + t_path = ep_dir / "transcript.json" + d_path = ep_dir / "diarization.json" + if not t_path.exists(): + continue + + ep_id = ep_dir.name + date, hr = parse_episode_meta(ep_id) + audio_path = EP_DIR / f"{ep_id}.mp3" + + # Episode duration from transcript + import json + with open(t_path) as f: + td = json.load(f) + duration = td.get("duration", 0) + + # Register episode + idx.add_episode( + episode_id=ep_id, + audio_path=audio_path, + date=date, + duration=duration, + hr=hr, + ) + + # Load diarized segments and index + segs = load_diarized_transcript(t_path, d_path if d_path.exists() else None) + idx.add_segments(ep_id, segs) + + # Extract and index Q&A pairs + pairs = extract_qa_pairs(segs) + for p in pairs: + idx.add_qa_pair( + episode_id=ep_id, + q_start=p.question_start, q_end=p.question_end, + 
a_start=p.answer_start, a_end=p.answer_end, + question=p.question_text, answer=p.answer_text, + topic=p.topic, tags=p.topic_tags, + ) + + rows.append((ep_id, date, f"{duration:.0f}s", len(segs), len(pairs))) + console.print(f" [green]{ep_id}[/green]: {len(segs)} segs, {len(pairs)} Q&A pairs") + + stats = idx.stats() + +table = Table(title="Index Summary") +table.add_column("Episode") +table.add_column("Date") +table.add_column("Duration") +table.add_column("Segments") +table.add_column("Q&A") +for ep_id, date, dur, segs, qa in rows: + table.add_row(ep_id, date, dur, str(segs), str(qa)) + +console.print() +console.print(table) +console.print(f"\n[bold]DB totals:[/bold] {stats['episodes']} episodes, " + f"{stats['segments']} segments, {stats['qa_pairs']} Q&A pairs") +console.print(f"[dim]DB path: {DB_PATH}[/dim]") diff --git a/projects/radio-show/audio-processor/src/diarizer.py b/projects/radio-show/audio-processor/src/diarizer.py index 7c5a98a..340ea4d 100644 --- a/projects/radio-show/audio-processor/src/diarizer.py +++ b/projects/radio-show/audio-processor/src/diarizer.py @@ -202,6 +202,8 @@ def diarize(audio_path: str | Path, label = seg.speaker_label.split(" (")[0] # strip confidence score if label.startswith("Host:") or label.startswith("Host "): speaker = "HOST" + elif label.startswith("Cohost:"): + speaker = "CO-HOST" elif label == "[error]": speaker = "UNKNOWN" else: diff --git a/projects/radio-show/audio-processor/src/qa_extractor.py b/projects/radio-show/audio-processor/src/qa_extractor.py index 8f307a2..06eb3d9 100644 --- a/projects/radio-show/audio-processor/src/qa_extractor.py +++ b/projects/radio-show/audio-processor/src/qa_extractor.py @@ -53,10 +53,12 @@ _PROMO_SIGS: list[tuple[re.Pattern, int]] = [ (re.compile(r"\bcomputer running slow\b", re.I), 1), (re.compile(r"\bafter these messages\b", re.I), 1), (re.compile(r"\b790.?2040\b", re.I), 1), + (re.compile(r"\b751.?1041\b", re.I), 1), (re.compile(r"\bgurushow\.com\b", re.I), 1), (re.compile(r"\bcall 
in now\b", re.I), 1), (re.compile(r"\bcomputer troubles\?", re.I), 1), (re.compile(r"\bhardware installation\b", re.I), 1), + (re.compile(r"we.?ll get your problem solved", re.I), 1), ] @@ -127,10 +129,19 @@ def extract_qa_pairs(diarized_segments: list[dict]) -> list[QAPair]: if _is_promo_or_bumper(turn["text"]): i += 1 continue + # Skip the opening 90s — real callers never call before the show starts + if turn["start"] < 90: + i += 1 + continue q_duration = turn["end"] - turn["start"] if q_duration < MIN_QUESTION_DURATION: i += 1 continue + # Require caller-intro context: host must have introduced the call, OR + # the caller opens with a phone greeting ("hello", "hi", "hey") + if not _preceded_by_caller_intro(turns, i) and not _PHONE_GREETING.match(turn["text"].strip()): + i += 1 + continue # Look ahead for HOST answer turn(s) j = i + 1 @@ -329,25 +340,71 @@ def load_diarized_transcript(transcript_path: Path, with open(diarization_path) as f: diarization = json.load(f) - turns = diarization.get("turns", []) + raw_turns = diarization.get("turns", []) - def speaker_at(t: float) -> str: - """Find which diarization turn covers time t.""" + # Resolve overlapping boundaries left by the sliding-window diarizer: + # place each transition at the midpoint of the overlap region. + resolved: list[dict] = [] + for turn in sorted(raw_turns, key=lambda t: t["start"]): + if not resolved: + resolved.append(dict(turn)) + continue + prev = resolved[-1] + if turn["start"] < prev["end"]: + mid = (turn["start"] + prev["end"]) / 2 + prev["end"] = mid + resolved.append({**turn, "start": mid}) + else: + resolved.append(dict(turn)) + turns = resolved + + # Minimum CALLER coverage to label a transcript segment as CALLER. + # Batch transcription produces ~25s segments; caller windows are 10s. + # Require 4s of CALLER overlap so brief HOST-edge segments aren't over-claimed. 
+ _CALLER_MIN_S = 4.0 + + def speaker_for_segment(seg_start: float, seg_end: float) -> str: + caller_cov = 0.0 + coverage: dict[str, float] = {} for turn in turns: - if turn["start"] <= t <= turn["end"]: - return turn["speaker"] - return "UNKNOWN" + overlap = min(seg_end, turn["end"]) - max(seg_start, turn["start"]) + if overlap <= 0: + continue + coverage[turn["speaker"]] = coverage.get(turn["speaker"], 0) + overlap + if turn["speaker"] == "CALLER": + caller_cov += overlap + if not coverage: + return "UNKNOWN" + if caller_cov >= _CALLER_MIN_S: + return "CALLER" + return max(coverage, key=coverage.__getitem__) return [ {"start": s["start"], "end": s["end"], "text": s["text"], - "speaker": speaker_at((s["start"] + s["end"]) / 2)} + "speaker": speaker_for_segment(s["start"], s["end"])} for s in segments ] # ── Helpers ──────────────────────────────────────────────────────────────── +_PHONE_GREETING = re.compile(r"^(hello|hi|hey|good (morning|afternoon|evening))\b", re.IGNORECASE) + + +def _preceded_by_caller_intro(turns: list[dict], idx: int, max_host_turns: int = 2) -> bool: + """Return True if a preceding HOST turn (within max_host_turns HOST turns) contains a caller-intro phrase.""" + host_count = 0 + for j in range(idx - 1, -1, -1): + if turns[j]["speaker"] == "HOST": + if _CALLER_INTRO.search(turns[j]["text"]): + return True + host_count += 1 + if host_count >= max_host_turns: + break + return False + + def _looks_like_question(text: str) -> bool: return bool(QUESTION_PATTERN.search(text)) diff --git a/projects/radio-show/audio-processor/src/transcriber.py b/projects/radio-show/audio-processor/src/transcriber.py index 6a9b026..df3410b 100644 --- a/projects/radio-show/audio-processor/src/transcriber.py +++ b/projects/radio-show/audio-processor/src/transcriber.py @@ -113,61 +113,60 @@ def _format_srt_time(seconds: float) -> str: def transcribe(audio_path: str | Path, model_size: str = "large-v3", - language: str = "en", device: str = "cuda") -> Transcript: - 
"""Transcribe an audio file using faster-whisper.""" - from faster_whisper import WhisperModel + language: str = "en", device: str = "cuda", + batch_size: int = 16) -> Transcript: + """Transcribe an audio file using faster-whisper. + + Uses BatchedInferencePipeline + int8_float16 + VAD for archive/batch work. + Word timestamps are skipped in batch mode (not needed for segment-level search). + Pass batch_size=0 to fall back to sequential WhisperModel with word timestamps. + """ + from faster_whisper import WhisperModel, BatchedInferencePipeline audio_path = Path(audio_path) + use_batched = batch_size > 0 + console.print(f"[bold]Transcribing:[/bold] {audio_path.name}") - console.print(f"[dim]Model: {model_size}, Device: {device}[/dim]") - - model = WhisperModel(model_size, device=device, compute_type="float16") - - segments_raw, info = model.transcribe( - str(audio_path), - language=language, - word_timestamps=True, - vad_filter=True, - vad_parameters=dict( - min_silence_duration_ms=500, - speech_pad_ms=200, - ), + console.print( + f"[dim]Model: {model_size} | " + f"{'batched x' + str(batch_size) + ' int8_float16' if use_batched else 'sequential float16'} | " + f"Device: {device}[/dim]" ) - console.print(f"[dim]Detected language: {info.language} " - f"(probability: {info.language_probability:.2f})[/dim]") - console.print(f"[dim]Duration: {info.duration:.1f}s " - f"({info.duration / 60:.1f} min)[/dim]") + if use_batched: + base_model = WhisperModel(model_size, device=device, compute_type="int8_float16") + model = BatchedInferencePipeline(model=base_model) + segments_raw, info = model.transcribe( + str(audio_path), + language=language, + batch_size=batch_size, + ) + else: + model = WhisperModel(model_size, device=device, compute_type="float16") + segments_raw, info = model.transcribe( + str(audio_path), + language=language, + word_timestamps=True, + vad_filter=True, + vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200), + ) + + 
console.print(f"[dim]Duration: {info.duration:.1f}s ({info.duration / 60:.1f} min)[/dim]") segments = [] - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("{task.completed} segments"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = progress.add_task("Transcribing...", total=None) - - for i, seg in enumerate(segments_raw): + for i, seg in enumerate(segments_raw): + words = [] + if not use_batched: words = [ - TranscriptWord( - word=w.word, - start=w.start, - end=w.end, - probability=w.probability, - ) + TranscriptWord(word=w.word, start=w.start, + end=w.end, probability=w.probability) for w in (seg.words or []) ] - segments.append(TranscriptSegment( - id=i, - text=seg.text, - start=seg.start, - end=seg.end, - words=words, - )) - progress.update(task, completed=i + 1) + segments.append(TranscriptSegment( + id=i, text=seg.text, start=seg.start, end=seg.end, words=words, + )) + if i % 50 == 0: + console.print(f"[dim] {i} segments... 
({seg.end:.0f}s)[/dim]") console.print(f"[green]Transcription complete: {len(segments)} segments[/green]") diff --git a/projects/radio-show/audio-processor/src/voice_profiler.py b/projects/radio-show/audio-processor/src/voice_profiler.py index 27b1d4a..7fe327a 100644 --- a/projects/radio-show/audio-processor/src/voice_profiler.py +++ b/projects/radio-show/audio-processor/src/voice_profiler.py @@ -319,8 +319,11 @@ class VoiceProfiler: best_match = name if best_score >= threshold: - if best_match and self.profiles[best_match].role == "host": + role = self.profiles[best_match].role if best_match else "unknown" + if role == "host": label = f"Host: {best_match}" + elif role == "cohost": + label = f"Cohost: {best_match}" else: label = best_match else: diff --git a/projects/radio-show/audio-processor/voice-profiles/profiles.json b/projects/radio-show/audio-processor/voice-profiles/profiles.json index 005007d..0dbf829 100644 --- a/projects/radio-show/audio-processor/voice-profiles/profiles.json +++ b/projects/radio-show/audio-processor/voice-profiles/profiles.json @@ -1,26 +1,34 @@ -{ - "Mike Swanson": { - "role": "host", - "num_samples": 180, - "source_episodes": [ - "2010-10-02-hr1.mp3", - "2011-06-04-hr1.mp3", - "2011-09-10-hr1.mp3", - "2014-s6e05.mp3", - "2015-s7e30.mp3", - "2016-s8e42.mp3", - "2017-s9e26.mp3", - "2018-s10e17.mp3", - "2018-s10e21.mp3", - "2010-10-02-hr1.mp3", - "2011-06-04-hr1.mp3", - "2011-09-10-hr1.mp3", - "2014-s6e05.mp3", - "2015-s7e30.mp3", - "2016-s8e42.mp3", - "2017-s9e26.mp3", - "2018-s10e17.mp3", - "2018-s10e21.mp3" - ] - } +{ + "Mike Swanson": { + "role": "host", + "num_samples": 180, + "source_episodes": [ + "2010-10-02-hr1.mp3", + "2011-06-04-hr1.mp3", + "2011-09-10-hr1.mp3", + "2014-s6e05.mp3", + "2015-s7e30.mp3", + "2016-s8e42.mp3", + "2017-s9e26.mp3", + "2018-s10e17.mp3", + "2018-s10e21.mp3", + "2010-10-02-hr1.mp3", + "2011-06-04-hr1.mp3", + "2011-09-10-hr1.mp3", + "2014-s6e05.mp3", + "2015-s7e30.mp3", + "2016-s8e42.mp3", + 
"2017-s9e26.mp3", + "2018-s10e17.mp3", + "2018-s10e21.mp3" + ] + }, + "Tom": { + "role": "cohost", + "num_samples": 44, + "source_episodes": [ + "2014-s6e19.mp3", + "2016-s8e43.mp3" + ] + } } \ No newline at end of file diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/composite.npy b/projects/radio-show/audio-processor/voice-profiles/tom/composite.npy new file mode 100644 index 0000000..861ee4b Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/composite.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0000.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0000.npy new file mode 100644 index 0000000..9ac521e Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0000.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0001.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0001.npy new file mode 100644 index 0000000..c80fe8b Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0001.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0002.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0002.npy new file mode 100644 index 0000000..a61f97a Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0002.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0003.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0003.npy new file mode 100644 index 0000000..fc027ed Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0003.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0004.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0004.npy new file mode 100644 index 0000000..4fe25c4 
Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0004.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0005.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0005.npy new file mode 100644 index 0000000..50a9af4 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0005.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0006.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0006.npy new file mode 100644 index 0000000..7987a01 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0006.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0007.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0007.npy new file mode 100644 index 0000000..e1d2628 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0007.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0008.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0008.npy new file mode 100644 index 0000000..5fd1913 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0008.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0009.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0009.npy new file mode 100644 index 0000000..2759f07 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0009.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0010.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0010.npy new file mode 100644 index 0000000..ca3086e Binary files /dev/null and 
b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0010.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0011.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0011.npy new file mode 100644 index 0000000..bf63d3e Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0011.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0012.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0012.npy new file mode 100644 index 0000000..cac9f13 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0012.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0013.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0013.npy new file mode 100644 index 0000000..47aa6c5 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0013.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0014.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0014.npy new file mode 100644 index 0000000..046eb8f Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0014.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0015.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0015.npy new file mode 100644 index 0000000..da02cc0 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0015.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0016.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0016.npy new file mode 100644 index 0000000..e8bec0f Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0016.npy differ diff --git 
a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0017.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0017.npy new file mode 100644 index 0000000..e331f67 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0017.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0018.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0018.npy new file mode 100644 index 0000000..9d0ee2c Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0018.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0019.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0019.npy new file mode 100644 index 0000000..3c8cfc8 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0019.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0020.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0020.npy new file mode 100644 index 0000000..aaa8245 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0020.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0021.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0021.npy new file mode 100644 index 0000000..297d6c1 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0021.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0022.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0022.npy new file mode 100644 index 0000000..8392437 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0022.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0023.npy 
b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0023.npy new file mode 100644 index 0000000..83b2fa8 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0023.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0024.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0024.npy new file mode 100644 index 0000000..ba5455f Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0024.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0025.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0025.npy new file mode 100644 index 0000000..f93c02f Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0025.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0026.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0026.npy new file mode 100644 index 0000000..d1642e6 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0026.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0027.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0027.npy new file mode 100644 index 0000000..ee58055 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0027.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0028.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0028.npy new file mode 100644 index 0000000..84f81f0 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0028.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0029.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0029.npy new file mode 100644 
index 0000000..b92b838 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0029.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0030.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0030.npy new file mode 100644 index 0000000..0315000 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0030.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0031.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0031.npy new file mode 100644 index 0000000..bfe813f Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0031.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0032.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0032.npy new file mode 100644 index 0000000..52d0853 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0032.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0033.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0033.npy new file mode 100644 index 0000000..e8045be Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0033.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0034.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0034.npy new file mode 100644 index 0000000..503e9b2 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0034.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0035.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0035.npy new file mode 100644 index 0000000..371a354 Binary files /dev/null and 
b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0035.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0036.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0036.npy new file mode 100644 index 0000000..e24e45c Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0036.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0037.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0037.npy new file mode 100644 index 0000000..a464bf7 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0037.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0038.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0038.npy new file mode 100644 index 0000000..c490a17 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0038.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0039.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0039.npy new file mode 100644 index 0000000..3a0c2e5 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0039.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0040.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0040.npy new file mode 100644 index 0000000..b48c666 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0040.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0041.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0041.npy new file mode 100644 index 0000000..e478c30 Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0041.npy differ diff --git 
a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0042.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0042.npy new file mode 100644 index 0000000..e1380ef Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0042.npy differ diff --git a/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0043.npy b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0043.npy new file mode 100644 index 0000000..534662f Binary files /dev/null and b/projects/radio-show/audio-processor/voice-profiles/tom/embedding_0043.npy differ diff --git a/projects/radio-show/session-logs/2026-04-27-qa-extraction-cohost-indexing.md b/projects/radio-show/session-logs/2026-04-27-qa-extraction-cohost-indexing.md new file mode 100644 index 0000000..69d0d00 --- /dev/null +++ b/projects/radio-show/session-logs/2026-04-27-qa-extraction-cohost-indexing.md @@ -0,0 +1,251 @@ +# Session Log: Q&A Extraction — Co-Host Profile + Archive Indexing +**Date:** 2026-04-27 +**Project:** Radio Show Archive Mining — Computer Guru Show + +--- + +## User +- **User:** Mike Swanson (mike) +- **Machine:** DESKTOP-0O8A1RL +- **Role:** admin + +--- + +## Session Summary + +The session began with resuming work following a benchmark run that demonstrated a significant performance improvement in Whisper transcription, achieving 63.8x real-time speed with batched inference and int8_float16 settings. Next, the focus shifted to evaluating the quality of Q&A extraction across six test episodes, revealing a critical issue with false positives due to co-host Tom being mislabeled as CALLER based on a voice similarity threshold. + +A co-host voice profile for Tom was constructed using 44 embeddings from two specific episodes (2014-s6e19 and 2016-s8e43), producing a cosine similarity of 0.698 against Mike — well below Mike's 0.85 threshold, giving clean separation. 
Code was updated in `voice_profiler.py` and `diarizer.py` to correctly emit "Cohost: Tom" labels and map them to a new "CO-HOST" speaker tag. Re-diarizing the two co-host-era episodes dramatically cleaned up Q&A results: 2016-s8e43 went from 12 Q&A pairs (11 of them false positives) to 2 real WiFi caller pairs. + +Several bugs in `qa_extractor.py` were fixed: overlap resolution for sliding-window diarization boundaries, CALLER-preference threshold for long batch transcript segments, and a turn-based caller-intro lookback to replace an ineffective 120s time window. Phone-greeting detection and new promo signatures were added. The final Q&A count landed at 10 pairs across 6 episodes, with 2014 correctly yielding 0 (gaming co-host episode with no actual callers). + +`archive.db` was created with the ArchiveIndex schema (episodes, segments, segments_fts, qa_pairs, qa_fts). All 6 test episodes were indexed: 762 segments, 10 Q&A pairs. FTS5 search was verified working for "router", "Windows 10", "Internet Explorer", "antivirus", and "connect" queries. + +--- + +## Key Decisions + +- **Co-host threshold uses same 0.85 bar as host**: Tom's profile scores 0.698 cosine similarity against Mike's. Any voice >= 0.85 against Tom's composite gets labeled CO-HOST. Keeps the same single threshold for all profiles rather than per-profile thresholds. +- **Turn-based lookback for caller-intro (2 HOST turns, not 120s)**: Long HOST monologue blocks (8-10 min) in big show segments meant time-based lookback missed the caller introduction. Looking back over the previous 2 HOST turns always catches it regardless of block length. +- **CALLER-preference at 4s minimum overlap**: Batch transcription produces ~26s segments; diarization CALLER windows are ~10s. Pure majority-vote always gave HOST. 4s minimum CALLER coverage labels the segment CALLER without being overly aggressive for co-host episodes.
+- **Midpoint boundary resolution at load time**: Rather than re-diarizing everything, the sliding-window overlap is resolved in `load_diarized_transcript()` so it applies retroactively to all saved diarization files without touching the JSON. +- **751-1041 added as promo signal**: Earlier Tucson show number (vs 790-2040 in later seasons). Weighted 1 (below `PROMO_SCORE_THRESHOLD = 2`, so it needs a second semi-generic signal to filter). +- **Tom's windows sourced from first 60 min of co-host episodes**: Real callers don't call in during the first hour of a 2-hour show (only exceptions: very end of show). First-hour CALLER windows are safely all Tom. + +--- + +## Problems Encountered + +- **2016-s8e43 had 12 Q&A pairs, 11 false positives**: Root cause was Tom (co-host) labeled CALLER throughout. Fixed by building Tom's voice profile and re-diarizing. +- **2014-s6e19 had 2 Q&A pairs from gaming discussion**: Same co-host issue. After re-diarization: 0 pairs (correct — no actual callers in that gaming special). +- **2012-03-10-hr1 yielded 0 segments labeled CALLER**: Midpoint assignment hit HOST turns (e.g. HOST 0-20s and CALLER 15-30s overlap — a segment midpoint of 15.1s falls in HOST). Fixed by overlap-preference assignment with 4s CALLER minimum. +- **Real WiFi caller (2016-s8e43, ~4794s) was missing after first fix attempt**: Aggressive time-based lookback (120s) combined with short CALLER turns from sliding-window diarization caused the caller question to land in a HOST segment. Fixed by turn-based lookback + co-host profile (eliminated Tom noise, letting real caller windows survive). +- **2012-06-09-hr1 pair at 1325s was a promo**: "The Computer Guru. We'll get your problem solved. Call 751-1041 today" passed promo filter. Fixed by adding 751-1041 and "we'll get your problem solved" as promo signatures.
+ +--- + +## Files Created / Modified + +### New files +``` +projects/radio-show/audio-processor/build_cohost_profile.py +projects/radio-show/audio-processor/index_test_episodes.py +projects/radio-show/audio-processor/archive.db +projects/radio-show/audio-processor/voice-profiles/tom/ +projects/radio-show/audio-processor/voice-profiles/profiles.json (updated: Tom added) +projects/radio-show/session-logs/2026-04-27-qa-extraction-cohost-indexing.md (this file) +``` + +### Modified +``` +src/voice_profiler.py — emit "Cohost: " label for cohost role +src/diarizer.py — map "Cohost:" prefix to "CO-HOST" speaker +src/qa_extractor.py — overlap resolution, CALLER-preference, turn-based + caller-intro lookback, _preceded_by_caller_intro(), + _PHONE_GREETING, 751-1041 + promo sig additions +test-data/transcripts/2014-s6e19/diarization.json (re-diarized with Tom profile) +test-data/transcripts/2016-s8e43/diarization.json (re-diarized with Tom profile) +``` + +--- + +## Benchmark Results (from previous run — baseline for BEAST comparison) + +**Machine:** DESKTOP-0O8A1RL — NVIDIA GeForce RTX 5070 Ti Laptop GPU + +| Episode | Audio | Wall (diarize) | RTF | +|---------|-------|----------------|-----| +| 2011-03-12-hr1 | 2509s | 15.1s | 166.1x | +| 2012-03-10-hr1 | 2634s | 12.2s | 215.5x | +| 2012-06-09-hr1 | 2648s | 12.2s | 216.8x | +| 2014-s6e19 | 2914s | 13.4s | 216.9x | +| 2016-s8e43 | 5326s | 24.2s | 219.6x | +| 2017-s9e30 | 5343s | 24.7s | 216.4x | +| **TOTAL** | **21374s** | **101.9s** | **209.7x** | + +Transcription (batched Whisper large-v3): 63.8x realtime +Diarization: 209.7x realtime +vs DESKTOP-0O8A1RL baseline (149.5x): **+60.2x (+40.3%)** + +--- + +## Archive DB State + +**Path:** `projects/radio-show/audio-processor/archive.db` + +``` +Episodes : 6 +Segments : 762 +Q&A pairs: 10 +``` + +**Q&A pairs by episode:** +| Episode | Pairs | Notes | +|---------|-------|-------| +| 2011-03-12-hr1 | 3 | IE lockout call, cloud computing, ghost hunting caller | +| 2012-03-10-hr1 
| 1 | iPad 3 discussion | +| 2012-06-09-hr1 | 1 | Windows repair feature call | +| 2014-s6e19 | 0 | Gaming co-host special — no actual callers | +| 2016-s8e43 | 2 | WiFi connectivity caller (2 turns of same call) | +| 2017-s9e30 | 3 | Software control, Cat5 cabling (Charlie), WiFi ports | + +--- + +## Voice Profiles State + +**Path:** `projects/radio-show/audio-processor/voice-profiles/` + +| Name | Role | Embeddings | Source Episodes | +|------|------|-----------|-----------------| +| Mike Swanson | host | 180 | 9 episodes (2010-2018) | +| Tom | cohost | 44 | 2014-s6e19, 2016-s8e43 | + +Tom vs Mike cosine similarity: **0.698** (well-separated at 0.85 threshold) + +**Tom's source windows used:** +- 2014-s6e19: 195-260s, 320-425s, 600-650s, 675-710s +- 2016-s8e43: 100-115s, 135-160s, 270-295s, 575-605s, 1185-1235s, 1790-1870s, 2020-2055s + +--- + +## Co-Host Era Notes + +Tom was the regular in-studio co-host/board-op roughly 2013-2016. His voice is in episodes from at least 2014 through 2016 (confirmed from test set). The 2011 and 2012 episodes are pure call-in format with no co-host. + +If there are occasional guest co-hosts or fill-in hosts in other years, they would still be labeled CALLER until profiled. These would be rare and would likely not form question patterns that survive the caller-intro gate. + +--- + +## Pending Tasks for BEAST (GURU-BEAST-ROG) + +### 1. Run benchmark.py to establish RTX 4090 baseline + +```bash +cd D:/claudetools/projects/radio-show/audio-processor +.venv/Scripts/python benchmark.py 2>&1 | tee bench-4090.txt +``` + +BENCH_SETUP.md has all setup steps. The voice profiles are in `voice-profiles/` (already copied or available via Tailscale/robocopy from DESKTOP-0O8A1RL). Test episodes go in `test-data/episodes/`. + +Expected: diarization RTF should be ~250-300x on RTX 4090 (vs 209.7x on laptop 5070 Ti). Transcription should be ~70-80x. 
+ +Update `benchmark.py` line 27 after measuring: +```python +BASELINE_RTF = 209.7 # current laptop 5070 Ti baseline +``` + +### 2. Download full archive from IX server (172.16.3.10) + +Use paramiko (SSH with key agent disabled): +```python +import paramiko +ssh = paramiko.SSHClient() +ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) +ssh.connect("172.16.3.10", username="gurushow", password="", + look_for_keys=False, allow_agent=False) +``` + +Archive path: `/home/gurushow/public_html/archive/Radio/` +Episode count: 579 MP3s across 2010-2018 (no 2013 season) +Approximate total size: ~30-40 GB + +Download script skeleton in prior session log: `2026-04-27-diarization-pipeline.md` + +**Tailscale required** — IX server is at 172.16.3.10, requires VPN. + +### 3. Full archive processing + +Once episodes are downloaded: + +```bash +# Transcribe + diarize all episodes +cd D:/claudetools/projects/radio-show/audio-processor +.venv/Scripts/python diarize_training.py # or a new batch_process_all.py + +# Index everything into archive.db +.venv/Scripts/python index_test_episodes.py # modify to point at full episodes dir +``` + +The pipeline is idempotent — `add_segments()` skips episodes already indexed. + +### 4. Verify co-host era episodes + +2013-2016 era episodes should now correctly separate Tom (CO-HOST) from actual callers. Spot-check a few 2015 episodes after processing to confirm Tom's profile generalizes well. + +If any 2015/2016 episodes show too many CALLER turns that are clearly Tom (voice changed slightly over years), re-run `build_cohost_profile.py` with windows from that episode added to TOM_WINDOWS dict. 
+ +--- + +## Technical Reference + +### Key thresholds + +```python +host_match_threshold = 0.85 # WavLM cosine similarity — applied to ALL profiles +CALLER_MIN_S = 4.0 # min CALLER coverage in transcript segment to label CALLER +PROMO_SCORE_THRESHOLD = 2 # weighted promo signature score +MIN_QUESTION_DURATION = 5.0 # seconds +MIN_ANSWER_DURATION = 15.0 # seconds +MAX_GAP_BETWEEN_QA = 30.0 # seconds +``` + +### Diarization sliding window + +```python +window_s = 10.0 # 10s embedding windows +hop_s = 5.0 # 5s hop → overlapping boundaries (resolved at load time) +``` + +### Transcription (batch mode) + +```python +model_size = "large-v3" +compute_type = "int8_float16" +batch_size = 16 +# No word timestamps in batch mode (not needed for search/diarization) +``` + +### DB search examples + +```python +from src.indexer import ArchiveIndex +from pathlib import Path + +with ArchiveIndex(Path("archive.db")) as idx: + # Segment search + results = idx.search("router", limit=20) + results = idx.search("Windows 10", speaker_filter="HOST", limit=10) + + # Q&A search + qa = idx.search_qa("antivirus", limit=10) + qa = idx.search_qa("wifi connect", limit=10) +``` + +### Archive server + +``` +Host: 172.16.3.10 (requires Tailscale) +User: gurushow +Archive root: /home/gurushow/public_html/archive/Radio/ +SSH: paramiko with look_for_keys=False, allow_agent=False +```