diff --git a/projects/radio-show/audio-processor/BENCH_SETUP.md b/projects/radio-show/audio-processor/BENCH_SETUP.md new file mode 100644 index 0000000..0d0cfa5 --- /dev/null +++ b/projects/radio-show/audio-processor/BENCH_SETUP.md @@ -0,0 +1,133 @@ +# GURU-BEAST-ROG Benchmark Setup + +RTX 4090 performance comparison against DESKTOP-0O8A1RL (RTX 5070 Ti baseline: **149.5x realtime**). + +--- + +## Step 1 — Sync repo + +The audio-processor lives inside the claudetools repo. Pull latest on main. + +```powershell +cd D:\claudetools # or wherever claudetools is cloned on this machine +git pull +``` + +If not yet cloned: +```powershell +git clone https://azcomputerguru@git.azcomputerguru.com/azcomputerguru/claudetools.git D:\claudetools +cd D:\claudetools\projects\radio-show\audio-processor +``` + +--- + +## Step 2 — Python environment + +Requires Python 3.11+. Use `py` launcher on Windows. + +```powershell +cd D:\claudetools\projects\radio-show\audio-processor + +py -m venv .venv +.venv\Scripts\activate + +# PyTorch with CUDA 12.8 (matches RTX 4090 driver) +pip install torch==2.11.0+cu128 --index-url https://download.pytorch.org/whl/cu128 + +# Core deps +pip install faster-whisper==1.2.1 transformers==5.6.2 soundfile==0.13.1 +pip install numpy==2.4.4 rich==15.0.0 ollama==0.6.1 pyyaml scikit-learn + +# Install project in editable mode +pip install -e . --no-deps +``` + +Verify GPU is visible: +```powershell +.venv\Scripts\python -c "import torch; print(torch.cuda.get_device_name(0))" +``` + +--- + +## Step 3 — Copy voice profiles from DESKTOP-0O8A1RL + +Voice profiles are not in git (binary numpy files). Copy from the 5070 Ti machine via Tailscale. 
"""Download the six benchmark episodes from the IX archive server over SFTP.

Run with Tailscale up. The SSH password is read from the IX_SSH_PASSWORD
environment variable so that it never has to be written into this document.
"""
import os

import paramiko

IX_HOST = '172.16.3.10'
IX_USER = 'root'

# SECURITY: never commit the server password. The credential is kept in the
# vault (infrastructure/ix-server.sops.yaml); export it for this session only.
# Any password that was previously committed here should be rotated.
password = os.environ.get('IX_SSH_PASSWORD')
if not password:
    raise SystemExit(
        'Set IX_SSH_PASSWORD before running '
        '(credential: vault infrastructure/ix-server.sops.yaml)'
    )

# (remote path on the IX server, local destination) for each test episode.
downloads = [
    ('/home/gurushow/public_html/archive/2011/3-12-11 HR 1.mp3', 'test-data/episodes/2011-03-12-hr1.mp3'),
    ('/home/gurushow/public_html/archive/2012/3 - March/3-10-12HR1.mp3', 'test-data/episodes/2012-03-10-hr1.mp3'),
    ('/home/gurushow/public_html/archive/2012/6 - June/6-9-12-HR1.mp3', 'test-data/episodes/2012-06-09-hr1.mp3'),
    ('/home/gurushow/public_html/archive/2014/06/s6e19.mp3', 'test-data/episodes/2014-s6e19.mp3'),
    ('/home/gurushow/public_html/archive/2016/06/s8e43.mp3', 'test-data/episodes/2016-s8e43.mp3'),
    ('/home/gurushow/public_html/archive/2017/04/s9e30.mp3', 'test-data/episodes/2017-s9e30.mp3'),
]

os.makedirs('test-data/episodes', exist_ok=True)

client = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts any host key on first contact. That is
# tolerable over the private Tailscale link, but pin the key if this script
# is ever used across an untrusted network.
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
try:
    # look_for_keys / allow_agent are disabled because the local key agent
    # interferes with plain SSH logins to this host.
    client.connect(IX_HOST, username=IX_USER, password=password,
                   look_for_keys=False, allow_agent=False, timeout=30)
    sftp = client.open_sftp()
    try:
        for remote, local in downloads:
            size_mb = sftp.stat(remote).st_size / 1024 / 1024
            print(f'Downloading {local} ({size_mb:.1f} MB)...', flush=True)
            sftp.get(remote, local)
            print(' done', flush=True)
    finally:
        sftp.close()
finally:
    # Always release the SSH session, even if a download fails part-way.
    client.close()

print('All downloads complete.')
# ── Phase 1: Transcription ─────────────────────────────────────────────────
# Transcribes every test episode that has no cached transcript.json yet and
# records (episode, transcript path, audio seconds, wall seconds) per episode.
# Episodes with a cached transcript are recorded with wall=0.0 and are left
# out of the transcription totals.
import json

console.print("[bold]Phase 1: Transcription[/bold]")

trans_results = []
trans_total_audio = 0.0
trans_total_wall = 0.0
whisper_model = None  # loaded lazily, only if something needs transcribing

for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
        with open(transcript_path) as f:
            dur = json.load(f).get("duration", 0)
        console.print(f" [dim]{ep.stem}: already transcribed ({dur:.0f}s)[/dim]")
        trans_results.append((ep, transcript_path, dur, 0.0))
        continue

    if whisper_model is None:
        # Load the model BEFORE starting the timer so the one-off load cost
        # does not distort the first episode's realtime factor.
        from faster_whisper import WhisperModel
        console.print(" [dim]Loading Whisper large-v3...[/dim]")
        # float16 is only usable on the GPU; fall back to int8 on CPU so the
        # script's CPU path does not fail at model construction.
        compute_type = "float16" if device == "cuda" else "int8"
        whisper_model = WhisperModel("large-v3", device=device, compute_type=compute_type)

    console.print(f" Transcribing {ep.name}...")
    t0 = time.monotonic()

    segments_iter, info = whisper_model.transcribe(str(ep), language="en", beam_size=5)
    # transcribe() yields segments lazily; consuming the iterator is what
    # actually runs the decode, so it must happen inside the timed region.
    segs = [
        {"id": seg.id, "start": seg.start, "end": seg.end, "text": seg.text}
        for seg in segments_iter
    ]

    duration = info.duration
    wall = time.monotonic() - t0
    rtf = duration / wall if wall > 0 else 0

    result = {"duration": duration, "language": "en", "segments": segs}
    with open(transcript_path, "w") as f:
        json.dump(result, f)

    console.print(f" [green]{ep.stem}: {duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
    trans_results.append((ep, transcript_path, duration, wall))
    trans_total_audio += duration
    trans_total_wall += wall

if trans_total_wall > 0:
    console.print(f" Transcription total: {trans_total_audio:.0f}s audio in {trans_total_wall:.1f}s = {trans_total_audio/trans_total_wall:.1f}x realtime\n")
# ── Phase 3: Q&A extraction ────────────────────────────────────────────────
# For each episode, merge its transcript with the diarization saved in
# Phase 2 and count the extracted Q&A pairs for the summary table.

console.print("\n[bold]Phase 3: Q&A Extraction[/bold]")

qa_rows = []
for episode, transcript_json, *_ in trans_results:
    diarization_json = TRANS_DIR / episode.stem / "diarization.json"
    labelled_segments = load_diarized_transcript(transcript_json, diarization_json)
    pair_count = len(extract_qa_pairs(labelled_segments))
    qa_rows.append((episode.stem, pair_count))
    console.print(f" {episode.stem}: {pair_count} Q&A pairs")
# Use the first available episode
episodes = sorted((BASE / "training-data" / "episodes").glob("*.mp3"))
if not episodes:
    console.print("[red]No episodes found[/red]")
    sys.exit(1)

ep = episodes[0]
duration = profiler._get_duration(ep)

# Labelling cut-offs: at or above HOST_THRESHOLD a window is called HOST,
# between the two it is called CALLER, below CALLER_THRESHOLD it is UNKNOWN.
HOST_THRESHOLD = 0.85
CALLER_THRESHOLD = 0.70

# Scan 10-40 minutes — intro monologue usually ends before 10 min, callers appear after
scan_start = min(600.0, duration * 0.15)  # ~10 min in or 15%
scan_end = min(duration, 2400.0)          # up to 40 min

window_s = 10.0
hop_s = 30.0  # coarse pass — one window per 30s for speed

# Report the window that is really scanned (the old text said "first 20
# minutes", which did not match scan_start/scan_end).
scan_label = f"{scan_start / 60:.0f}-{scan_end / 60:.0f} min"
console.print(f"\nAnalyzing {scan_label} of: {ep.name}")
# The backslash stops Rich from treating "[time]" as a markup tag.
console.print("Format: \\[time] similarity_score label\n")

scores = []
for start in np.arange(scan_start, scan_end - window_s, hop_s):
    end = start + window_s
    try:
        emb = profiler.extract_embedding(ep, start, end)

        # Best-matching profile; stays (0.0, "") when no profile scores above zero.
        best_score = 0.0
        best_name = ""
        for name, profile in profiler.profiles.items():
            s = profile.similarity(emb)
            if s > best_score:
                best_score = s
                best_name = name

        if best_score >= HOST_THRESHOLD:
            label = f"HOST ({best_name})"
        elif best_score >= CALLER_THRESHOLD:
            label = f"CALLER (below {HOST_THRESHOLD:.2f})"
        else:
            label = "UNKNOWN"

        console.print(f" [{start:6.0f}s-{end:.0f}s] {best_score:.4f} {label}")
        scores.append(best_score)
    except Exception as e:
        # Best-effort diagnostic: report the failing window and keep scanning.
        console.print(f" [{start:6.0f}s] ERROR: {e}")

if scores:
    console.print(f"\nScore distribution over {scan_label}:")
    console.print(f" min={min(scores):.4f} max={max(scores):.4f} mean={np.mean(scores):.4f} median={np.median(scores):.4f}")
    # Half-open buckets [lo, hi); the last edge is 1.01 so a score of exactly 1.0 is counted.
    buckets = [0.0, 0.6, 0.7, 0.75, 0.80, 0.85, 0.90, 0.95, 1.01]
    for lo, hi in zip(buckets, buckets[1:]):
        count = sum(1 for s in scores if lo <= s < hi)
        bar = "#" * count
        console.print(f" [{lo:.2f}-{hi:.2f}): {count:3d} {bar}")
else:
    console.print("[yellow]No windows were scored — episode may be too short for the scan range[/yellow]")
+""" +import sys +import os +import time + +os.environ["PYTHONIOENCODING"] = "utf-8" +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8") +if hasattr(sys.stderr, "reconfigure"): + sys.stderr.reconfigure(encoding="utf-8") +os.environ["TRANSFORMERS_OFFLINE"] = "1" + +from pathlib import Path +from src.gpu import ensure_cuda_libs +ensure_cuda_libs() + +from src.config import load_config +from src.diarizer import diarize, VoiceProfileStore +from src.indexer import ArchiveIndex +from src.qa_extractor import load_diarized_transcript, extract_qa_pairs, tag_qa_pairs_with_ollama +from rich.console import Console +import re, json + +console = Console() + +BASE = Path(__file__).parent +EPISODES_DIR = BASE / "training-data" / "episodes" +TRANSCRIPTS_DIR = BASE / "training-data" / "transcripts" +DB_PATH = BASE / "archive" / "archive.db" + +config = load_config() +voice_profiles = VoiceProfileStore( + config.resolve_path(config.diarization.voice_profiles_dir) +) + +targets = ["2018-s10e17", "2018-s10e21"] +episodes = [EPISODES_DIR / f"{stem}.mp3" for stem in targets] + +console.print("[bold]Re-diarizing 2018 episodes (optimized audio preload)[/bold]\n") + +total_audio_s = 0 +total_wall_s = 0 + +for ep_path in episodes: + if not ep_path.exists(): + console.print(f"[red]Missing: {ep_path.name}[/red]") + continue + + stem = ep_path.stem + transcript_dir = TRANSCRIPTS_DIR / stem + + t0 = time.monotonic() + result = diarize(ep_path, voice_profiles=voice_profiles, + host_match_threshold=0.85) + wall = time.monotonic() - t0 + + result.save(transcript_dir) + + audio_dur = result.turns[-1].end if result.turns else 0 + rtf = audio_dur / wall if wall > 0 else 0 + total_audio_s += audio_dur + total_wall_s += wall + + speakers = result.speakers_ranked() + console.print( + f" {stem}: {len(result.turns)} turns | " + + ", ".join(f"{s} ({t:.0f}s)" for s, t in speakers[:3]) + + f" [{wall:.1f}s wall / {rtf:.1f}x realtime]" + ) + +if total_wall_s > 0: + console.print( 
def episode_id(stem):
    """Return the episode ID for a file stem.

    A trailing hour marker of the form "-hrN" (single digit, matched
    case-insensitively) is removed; any other stem is returned unchanged.
    """
    hour_suffix = re.compile(r"-hr\d$", re.IGNORECASE)
    return hour_suffix.sub("", stem)
+ idx._conn.execute("DELETE FROM segments WHERE episode_id = ?", (ep_id,)) + idx._conn.execute("DELETE FROM qa_pairs WHERE episode_id = ?", (ep_id,)) + idx._conn.commit() + + idx.add_episode(ep_id, ep_path, date=date, duration=duration) + # Bypass add_segments guard (it skips if rows already exist) + idx._conn.executemany( + "INSERT INTO segments (episode_id, seg_index, start, end, speaker, text) " + "VALUES (?, ?, ?, ?, ?, ?)", + [ + (ep_id, i, s["start"], s["end"], s.get("speaker", "UNKNOWN"), s["text"]) + for i, s in enumerate(segments) + ] + ) + idx._conn.commit() + + host_segs = sum(1 for s in segments if s["speaker"] == "HOST") + other_segs = len(segments) - host_segs + console.print(f" {ep_id}: {len(segments)} segs (HOST={host_segs}, other={other_segs})") + + pairs = extract_qa_pairs(segments) + console.print(f" {len(pairs)} Q&A pairs", end="") + + if pairs: + console.print(f" — tagging with Ollama...", end="") + pairs = tag_qa_pairs_with_ollama( + pairs, ollama_host=config.llm.ollama_host, model=config.llm.model + ) + + for pair in pairs: + idx.add_qa_pair( + ep_id, + pair.question_start, pair.question_end, + pair.answer_start, pair.answer_end, + pair.question_text, pair.answer_text, + topic=pair.topic, tags=pair.topic_tags, + ) + console.print() + + # Rebuild FTS indexes — required after manual DELETE/re-INSERT on content tables + idx._conn.execute("INSERT INTO segments_fts(segments_fts) VALUES('rebuild')") + idx._conn.execute("INSERT INTO qa_fts(qa_fts) VALUES('rebuild')") + idx._conn.commit() + + stats = idx.stats() + +console.print(f"\n[bold green]Done.[/bold green] DB now: " + f"{stats['episodes']} episodes | " + f"{stats['segments']} segments | " + f"{stats['qa_pairs']} Q&A pairs") diff --git a/projects/radio-show/audio-processor/diarize_training.py b/projects/radio-show/audio-processor/diarize_training.py new file mode 100644 index 0000000..0a5be70 --- /dev/null +++ b/projects/radio-show/audio-processor/diarize_training.py @@ -0,0 +1,148 @@ +""" 
+Diarize all training episodes, saving diarization.json next to each transcript. +Then rebuild the archive DB with proper HOST/CALLER labels. +""" + +import sys +import os + +# Force UTF-8 output on Windows so Rich's Braille spinner characters don't crash +os.environ["PYTHONIOENCODING"] = "utf-8" +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8") +if hasattr(sys.stderr, "reconfigure"): + sys.stderr.reconfigure(encoding="utf-8") + +# Prevent transformers from checking HuggingFace for model updates on every +# from_pretrained() call — models are already cached locally. +os.environ["TRANSFORMERS_OFFLINE"] = "1" + +from pathlib import Path + +# Ensure CUDA libs before any torch imports +from src.gpu import ensure_cuda_libs +ensure_cuda_libs() + +from src.config import load_config +from src.diarizer import diarize, VoiceProfileStore +from src.indexer import ArchiveIndex +from src.qa_extractor import load_diarized_transcript, extract_qa_pairs, tag_qa_pairs_with_ollama +from rich.console import Console + +console = Console() + +BASE = Path(__file__).parent +EPISODES_DIR = BASE / "training-data" / "episodes" +TRANSCRIPTS_DIR = BASE / "training-data" / "transcripts" +DB_PATH = BASE / "archive" / "archive.db" + +config = load_config() + +# Load voice profiles +voice_profiles = VoiceProfileStore( + config.resolve_path(config.diarization.voice_profiles_dir) +) + +episodes = sorted(EPISODES_DIR.glob("*.mp3")) +console.print(f"[bold]Diarizing {len(episodes)} training episodes[/bold]") + +# ── Step 1: Diarize ─────────────────────────────────────────────────────────── +for i, ep_path in enumerate(episodes, 1): + stem = ep_path.stem + transcript_dir = TRANSCRIPTS_DIR / stem + if not transcript_dir.exists(): + console.print(f"[{i}/{len(episodes)}] [yellow]No transcript dir: {stem} — skipping[/yellow]") + continue + + diarization_out = transcript_dir / "diarization.json" + if diarization_out.exists(): + console.print(f"[{i}/{len(episodes)}] 
def episode_id(stem):
    """Map a file stem to its episode ID by dropping a trailing "-hrN" marker.

    Only a single-digit hour suffix at the very end of the stem is stripped
    (case-insensitive); stems without one come back as-is.
    """
    suffix = re.search(r"-hr\d$", stem, flags=re.IGNORECASE)
    if suffix is None:
        return stem
    return stem[:suffix.start()] + stem[suffix.end():]
f"(HOST={host_segs}, other={caller_segs})") + + # Extract Q&A pairs + pairs = extract_qa_pairs(segments) + console.print(f" {len(pairs)} Q&A pairs", end="") + + if pairs: + console.print(f" — tagging with Ollama...", end="") + pairs = tag_qa_pairs_with_ollama( + pairs, ollama_host=config.llm.ollama_host, model=config.llm.model + ) + + for pair in pairs: + idx.add_qa_pair( + ep_id, + pair.question_start, pair.question_end, + pair.answer_start, pair.answer_end, + pair.question_text, pair.answer_text, + topic=pair.topic, tags=pair.topic_tags, + ) + + console.print() + + stats = idx.stats() + +console.print(f"\n[bold green]Done.[/bold green] " + f"{stats['episodes']} episodes | " + f"{stats['segments']} segments | " + f"{stats['qa_pairs']} Q&A pairs") +console.print(f"DB: {DB_PATH}") diff --git a/projects/radio-show/audio-processor/session-logs/2026-04-27-diarization-pipeline.md b/projects/radio-show/audio-processor/session-logs/2026-04-27-diarization-pipeline.md new file mode 100644 index 0000000..a458233 --- /dev/null +++ b/projects/radio-show/audio-processor/session-logs/2026-04-27-diarization-pipeline.md @@ -0,0 +1,135 @@ +# Session Log — 2026-04-27 + +**Project:** The Computer Guru Show — Archive Mining System +**Goal:** Build searchable transcript archive of 579 episodes (2010-2018) with caller Q&A extraction for "then vs now" show prep +**Machine:** DESKTOP-0O8A1RL +**User:** Mike Swanson (mike) + +--- + +## Work Completed + +### Critical Bug Fix — `voice_profiler.py` `identify_speakers()` + +`identify_speakers()` was unconditionally labeling all windows as HOST regardless of similarity score. The host-role label assignment ran after the threshold check and overwrote it. Fixed by gating the "Host:" label inside the `best_score >= threshold` branch. + +### Threshold Tuning + +Raised similarity threshold from 0.70 to 0.85. 
Diagnostic run on `2010-10-02-hr1.mp3` confirmed clean separation: + +- Mike's voice: scores 0.90-0.99 +- Caller windows: scores 0.46-0.83 + +### Audio Preload Optimization + +`identify_speakers()` previously spawned approximately 500 ffmpeg subprocesses per episode (one per 10-second window). Rewrote to load full audio once via `_load_full_audio()` and slice in-memory numpy arrays per window. + +**Result: 149.5x realtime on RTX 5070 Ti** +Measured: 10,600 seconds of audio processed in 70.9 seconds. + +### Promo/Bumper Filter — `qa_extractor.py` + +Added `_is_promo_or_bumper()` with weighted signature scoring: + +- Score 2 = highly distinctive phrase +- Score 1 = semi-generic phrase +- Threshold = 2 + +Filters out show promos such as "Computer running slow? Has your machine somehow acquired a life of its own?" so they are not extracted as Q&A pairs. This cut the extracted Q&A pairs from 42 to 27 across the 9 training episodes; the 15 pairs removed were promo/bumper false positives. + +### 2018 Episode Re-diarization + +Episodes `2018-s10e17` and `2018-s10e21` had stale all-HOST diarization from an aborted earlier run. Re-diarized correctly: + +- `2018-s10e17`: 49 turns / 775s caller +- `2018-s10e21`: 110 turns / 1175s caller + +### Text-Only Q&A Fallback + +Added `_extract_qa_text_only()` to handle cases where diarization produces no CALLER labels. Uses question-pattern signals and caller-intro phrase detection. Automatically triggered when all segments are labeled HOST. + +### TRANSFORMERS_OFFLINE=1 + +Set in `diarize_training.py` and `diarize_2018.py` to prevent HuggingFace freshness checks on the cached WavLM model. + +### HuggingFace / Model Note + +WavLM (`microsoft/wavlm-base-sv`) is ungated and sufficient for speaker verification. pyannote was evaluated but not needed. 
+ +--- + +## Key Files Modified + +| File | Change | +|---|---| +| `src/voice_profiler.py` | Threshold bug fix, audio preload optimization, `_embed_audio_np()`, `_load_full_audio()` | +| `src/qa_extractor.py` | Promo filter (`_is_promo_or_bumper()`), text-only fallback (`_extract_qa_text_only()`) | +| `src/diarizer.py` | Default threshold raised to 0.85 | +| `diarize_training.py` | TRANSFORMERS_OFFLINE=1, threshold=0.85 | +| `diarize_2018.py` | New targeted script for 2018 re-diarization and DB patch | +| `check_scores.py` | Diagnostic script — keep for future threshold tuning | + +--- + +## Training Set (archive/archive.db) + +9 episodes, 17,555 segments, 27 Q&A pairs total. + +| Episode ID | File | Duration | Caller Segs | Q&A Pairs | +|---|---|---|---|---| +| 2010-10-02 | 2010-10-02-hr1.mp3 | 44m36s | 79 | 5 | +| 2011-06-04 | 2011-06-04-hr1.mp3 | 42m42s | 31 | 1 | +| 2011-09-10 | 2011-09-10-hr1.mp3 | 41m46s | 4 | 0 | +| 2014-s6e05 | 2014-s6e05.mp3 | 47m27s | 153 | 3 | +| 2015-s7e30 | 2015-s7e30.mp3 | 45m21s | 105 | 5 | +| 2016-s8e42 | 2016-s8e42.mp3 | 90m24s | 227 | 5 | +| 2017-s9e26 | 2017-s9e26.mp3 | 89m25s | 374 | 5 | +| 2018-s10e17 | 2018-s10e17.mp3 | 88m22s | 816 | 0 | +| 2018-s10e21 | 2018-s10e21.mp3 | 88m20s | 454 | 3 | + +--- + +## Test Set — Downloaded, Not Yet Transcribed + +Files saved to `test-data/episodes/`. 
+ +| Local Filename | Source Path on IX (172.16.3.10) | Size | Notes | +|---|---|---|---| +| 2011-03-12-hr1.mp3 | /home/gurushow/public_html/archive/2011/3-12-11 HR 1.mp3 | 8.8MB | 2011 unseen date | +| 2012-03-10-hr1.mp3 | /home/gurushow/public_html/archive/2012/3 - March/3-10-12HR1.mp3 | 11.7MB | 2012 — completely untrained year | +| 2012-06-09-hr1.mp3 | /home/gurushow/public_html/archive/2012/6 - June/6-9-12-HR1.mp3 | 12.2MB | 2012 — completely untrained year | +| 2014-s6e19.mp3 | /home/gurushow/public_html/archive/2014/06/s6e19.mp3 | 10.3MB | 2014 different episode | +| 2016-s8e43.mp3 | /home/gurushow/public_html/archive/2016/06/s8e43.mp3 | 18.0MB | 2016 different episode | +| 2017-s9e30.mp3 | /home/gurushow/public_html/archive/2017/04/s9e30.mp3 | 48.2MB | 2017 different episode | + +--- + +## Next Steps + +1. Transcribe test episodes: `py src/cli.py batch --transcribe-only test-data/episodes/` +2. Diarize test episodes: run diarize script targeting `test-data/episodes/` +3. Extract Q&A pairs from test set +4. Compare Q&A quality vs training set +5. 
def extract_clip(
    source_path: Path,
    start: float,
    end: float,
    output_path: Path,
    padding: float = 1.5,
    fade_ms: int = 200,
) -> Path:
    """
    Extract a clip from source_path between start and end seconds.

    Adds `padding` seconds on both sides (the start is clamped at 0) and
    applies a fade in/out of `fade_ms` milliseconds. The fade is shortened to
    at most half the clip so the two fades never overlap and the fade-out
    never starts at a negative offset.

    Returns the output path. Raises RuntimeError if ffmpeg exits non-zero.
    """
    source_path = Path(source_path)
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    clip_start = max(0.0, start - padding)
    clip_end = end + padding
    duration = clip_end - clip_start

    # For a very short clip (e.g. padding=0 on a sub-second segment) the
    # requested fade can exceed the clip; clamp it to half the duration.
    fade_s = max(0.0, min(fade_ms / 1000.0, duration / 2))
    fade_out_start = duration - fade_s

    cmd = [
        "ffmpeg", "-y",
        # -ss before -i seeks the input, so the filter timestamps start at 0.
        "-ss", f"{clip_start:.3f}",
        "-i", str(source_path),
        "-t", f"{duration:.3f}",
        "-af", f"afade=t=in:st=0:d={fade_s},afade=t=out:st={fade_out_start:.3f}:d={fade_s}",
        "-q:a", "2",
        str(output_path),
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # Only the tail of stderr is kept; ffmpeg prints a long banner first.
        raise RuntimeError(f"ffmpeg failed: {result.stderr[-500:]}")

    return output_path


def _clip_time_label(seconds) -> str:
    """Format a start time for a clip filename, e.g. 65 -> "1m05s", 3725 -> "1h02m05s"."""
    m, sec = divmod(int(seconds), 60)
    h, m = divmod(m, 60)
    return f"{h}h{m:02d}m{sec:02d}s" if h else f"{m}m{sec:02d}s"


def extract_clips_for_results(results, output_dir: Path, padding: float = 1.5) -> dict[int, Path]:
    """
    Extract clips for a list of QAResult or SearchResult objects.

    A result with a `question_start` attribute is treated as a QAResult and
    clipped from question start to answer end; anything else is clipped from
    its own `start` to `end`. Results whose audio file is missing, or whose
    extraction fails, are reported on the console and skipped.

    Returns {index: clip_path} for the clips that were written.
    """
    output_dir = Path(output_dir)
    clip_paths = {}

    for i, result in enumerate(results):
        episode = result.episode_id
        audio_path = Path(result.audio_path)

        if not audio_path.exists():
            console.print(f"[yellow]Audio not found: {audio_path}[/yellow]")
            continue

        # Determine time range
        if hasattr(result, "question_start"):
            # QAResult
            start = result.question_start
            end = result.answer_end
        else:
            # SearchResult
            start = result.start
            end = result.end

        clip_name = f"{episode}_{_clip_time_label(start)}.mp3"
        clip_path = output_dir / clip_name

        try:
            extract_clip(audio_path, start, end, clip_path, padding=padding)
            clip_paths[i] = clip_path
            console.print(f"[green]Clip {i+1}:[/green] {clip_name}")
        except Exception as e:
            # Best-effort batch: one failed clip must not abort the rest.
            console.print(f"[red]Clip {i+1} failed:[/red] {e}")

    return clip_paths


def format_timestamp(seconds: float) -> str:
    """Format seconds as H:MM:SS or M:SS."""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    if h:
        return f"{h}:{m:02d}:{s:02d}"
    return f"{m}:{s:02d}"
+ return DiarizationResult( + turns=[SpeakerTurn(speaker="HOST", start=0.0, end=duration)], + num_speakers=1, + speaker_map={"HOST": "HOST"}, + ) + + # Sliding-window identification: 10s windows, 5s hop + voice_segs = profiler.identify_speakers( + audio_path, window_s=10.0, hop_s=5.0, + threshold=host_match_threshold, ) - # Extract turns + # Convert VoiceSegment labels to HOST / CALLER raw_turns = [] - for turn, _, speaker in diarization.itertracks(yield_label=True): + for seg in voice_segs: + label = seg.speaker_label.split(" (")[0] # strip confidence score + if label.startswith("Host:") or label.startswith("Host "): + speaker = "HOST" + elif label == "[error]": + speaker = "UNKNOWN" + else: + speaker = "CALLER" + raw_turns.append(SpeakerTurn( speaker=speaker, - start=turn.start, - end=turn.end, + start=seg.start, + end=seg.end, + confidence=float(seg.speaker_label.split("(")[-1].rstrip(")")) + if "(" in seg.speaker_label else 0.5, )) - # Count unique speakers - raw_speakers = set(t.speaker for t in raw_turns) - console.print(f"[dim]Detected {len(raw_speakers)} speakers[/dim]") - - # Match against voice profiles if available - speaker_map = {} - if voice_profiles and voice_profiles.embeddings: - console.print("[dim]Matching speakers against voice profiles...[/dim]") - embedding_model = pipeline.embedding # pyannote's embedding model - - # Get embeddings for each detected speaker - from pyannote.audio import Inference - inference = Inference(pipeline.embedding, window="whole") - - for raw_label in raw_speakers: - # Get segments for this speaker - speaker_segments = [t for t in raw_turns if t.speaker == raw_label] - total_time = sum(t.duration for t in speaker_segments) - - # Use the longest segment for embedding - longest = max(speaker_segments, key=lambda t: t.duration) - - try: - # Extract embedding from audio segment - import torchaudio - waveform, sr = torchaudio.load( - str(audio_path), - frame_offset=int(longest.start * sr if 'sr' in dir() else longest.start 
* 16000), - num_frames=int(longest.duration * sr if 'sr' in dir() else longest.duration * 16000), - ) - # This is simplified — proper implementation would use pyannote's - # embedding extraction pipeline - match_name, score = voice_profiles.match_embedding( - np.zeros(256), # placeholder - threshold=host_match_threshold, - ) - if match_name: - speaker_map[raw_label] = match_name - console.print(f" [green]{raw_label} -> {match_name} " - f"(score: {score:.2f}, {total_time:.0f}s)[/green]") - except Exception as e: - console.print(f" [yellow]Could not match {raw_label}: {e}[/yellow]") - - # If no voice profiles matched, use speaking time heuristic - # The host almost always has the most speaking time - if not speaker_map: - ranked = sorted( - [(s, sum(t.duration for t in raw_turns if t.speaker == s)) - for s in raw_speakers], - key=lambda x: x[1], - reverse=True, - ) - if ranked: - speaker_map[ranked[0][0]] = f"Host: {voice_profiles.metadata.get('host', {}).get('name', 'Unknown')}" - console.print(f" [yellow]Assumed {ranked[0][0]} is host " - f"(most speaking time: {ranked[0][1]:.0f}s)[/yellow]") - - # If no voice profiles at all, label by speaking time - if not speaker_map: - ranked = sorted( - [(s, sum(t.duration for t in raw_turns if t.speaker == s)) - for s in raw_speakers], - key=lambda x: x[1], - reverse=True, - ) - for i, (speaker, time) in enumerate(ranked): - if i == 0: - speaker_map[speaker] = "Host (assumed)" - else: - speaker_map[speaker] = f"Speaker {i}" - - # Apply friendly names + # Merge consecutive same-speaker turns + merged: list[SpeakerTurn] = [] for turn in raw_turns: - if turn.speaker in speaker_map: - turn.speaker = speaker_map[turn.speaker] + if merged and merged[-1].speaker == turn.speaker: + merged[-1].end = turn.end + else: + merged.append(SpeakerTurn( + speaker=turn.speaker, + start=turn.start, + end=turn.end, + confidence=turn.confidence, + )) - console.print(f"[green]Diarization complete: {len(raw_turns)} turns, " - f"{len(raw_speakers)} 
speakers[/green]") + unique_speakers = set(t.speaker for t in merged) + speaker_map = {s: s for s in unique_speakers} + + host_time = sum(t.duration for t in merged if t.speaker == "HOST") + caller_time = sum(t.duration for t in merged if t.speaker == "CALLER") + console.print(f"[green]Diarization complete:[/green] {len(merged)} turns | " + f"HOST {host_time:.0f}s / CALLER {caller_time:.0f}s") return DiarizationResult( - turns=raw_turns, - num_speakers=len(raw_speakers), + turns=merged, + num_speakers=len(unique_speakers), speaker_map=speaker_map, ) diff --git a/projects/radio-show/audio-processor/src/indexer.py b/projects/radio-show/audio-processor/src/indexer.py new file mode 100644 index 0000000..1daae88 --- /dev/null +++ b/projects/radio-show/audio-processor/src/indexer.py @@ -0,0 +1,247 @@ +""" +Archive transcript index using SQLite FTS5. +Stores all transcript segments with speaker labels, searchable by keyword or phrase. +""" + +import json +import sqlite3 +from dataclasses import dataclass +from pathlib import Path +from typing import Iterator + +from rich.console import Console + +console = Console() + +DB_SCHEMA = """ +CREATE TABLE IF NOT EXISTS episodes ( + id INTEGER PRIMARY KEY, + episode_id TEXT UNIQUE NOT NULL, -- e.g. 
"2016-s8e42" + date TEXT, -- "2016-03-15" + audio_path TEXT, -- absolute path to original MP3 + duration REAL, + hr INTEGER -- 1 or 2 (for split episodes) +); + +CREATE TABLE IF NOT EXISTS segments ( + id INTEGER PRIMARY KEY, + episode_id TEXT NOT NULL, + seg_index INTEGER NOT NULL, + start REAL NOT NULL, + end REAL NOT NULL, + speaker TEXT, -- "HOST", "CALLER", "UNKNOWN", "COMMERCIAL" + text TEXT NOT NULL, + FOREIGN KEY (episode_id) REFERENCES episodes(episode_id) +); + +CREATE VIRTUAL TABLE IF NOT EXISTS segments_fts USING fts5( + text, + speaker UNINDEXED, + episode_id UNINDEXED, + seg_index UNINDEXED, + content='segments', + content_rowid='id' +); + +CREATE TRIGGER IF NOT EXISTS segments_ai AFTER INSERT ON segments BEGIN + INSERT INTO segments_fts(rowid, text, speaker, episode_id, seg_index) + VALUES (new.id, new.text, new.speaker, new.episode_id, new.seg_index); +END; + +CREATE TABLE IF NOT EXISTS qa_pairs ( + id INTEGER PRIMARY KEY, + episode_id TEXT NOT NULL, + question_start REAL NOT NULL, + question_end REAL NOT NULL, + answer_start REAL NOT NULL, + answer_end REAL NOT NULL, + question_text TEXT NOT NULL, + answer_text TEXT NOT NULL, + topic TEXT, -- Ollama-tagged topic + topic_tags TEXT, -- JSON array of tags + FOREIGN KEY (episode_id) REFERENCES episodes(episode_id) +); + +CREATE VIRTUAL TABLE IF NOT EXISTS qa_fts USING fts5( + question_text, + answer_text, + topic, + episode_id UNINDEXED, + content='qa_pairs', + content_rowid='id' +); + +CREATE TRIGGER IF NOT EXISTS qa_ai AFTER INSERT ON qa_pairs BEGIN + INSERT INTO qa_fts(rowid, question_text, answer_text, topic, episode_id) + VALUES (new.id, new.question_text, new.answer_text, new.topic, new.episode_id); +END; +""" + + +@dataclass +class SearchResult: + episode_id: str + date: str + start: float + end: float + speaker: str + text: str + audio_path: str + score: float = 0.0 + + def timestamp_str(self) -> str: + def fmt(s): + m, sec = divmod(int(s), 60) + h, m = divmod(m, 60) + return 
f"{h}:{m:02d}:{sec:02d}" if h else f"{m}:{sec:02d}" + return f"{fmt(self.start)}–{fmt(self.end)}" + + +@dataclass +class QAResult: + episode_id: str + date: str + question_start: float + question_end: float + answer_start: float + answer_end: float + question_text: str + answer_text: str + topic: str + audio_path: str + + def clip_start(self, padding: float = 1.0) -> float: + return max(0.0, self.question_start - padding) + + def clip_end(self, padding: float = 1.0) -> float: + return self.answer_end + padding + + def timestamp_str(self) -> str: + def fmt(s): + m, sec = divmod(int(s), 60) + h, m = divmod(m, 60) + return f"{h}:{m:02d}:{sec:02d}" if h else f"{m}:{sec:02d}" + return f"{fmt(self.question_start)}–{fmt(self.answer_end)}" + + def duration(self) -> float: + return self.answer_end - self.question_start + + +class ArchiveIndex: + def __init__(self, db_path: Path): + self.db_path = Path(db_path) + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self._conn = sqlite3.connect(str(self.db_path)) + self._conn.row_factory = sqlite3.Row + self._conn.executescript(DB_SCHEMA) + self._conn.commit() + + def close(self): + self._conn.close() + + def __enter__(self): + return self + + def __exit__(self, *_): + self.close() + + # ── Ingestion ────────────────────────────────────────────────────────── + + def add_episode(self, episode_id: str, audio_path: Path, + date: str = None, duration: float = None, hr: int = None): + self._conn.execute( + "INSERT OR IGNORE INTO episodes (episode_id, date, audio_path, duration, hr) " + "VALUES (?, ?, ?, ?, ?)", + (episode_id, date, str(audio_path), duration, hr) + ) + self._conn.commit() + + def add_segments(self, episode_id: str, segments: list[dict]): + """Add transcript segments. 
Each dict: {start, end, text, speaker}.""" + existing = self._conn.execute( + "SELECT COUNT(*) FROM segments WHERE episode_id = ?", (episode_id,) + ).fetchone()[0] + if existing: + return # already indexed + + self._conn.executemany( + "INSERT INTO segments (episode_id, seg_index, start, end, speaker, text) " + "VALUES (?, ?, ?, ?, ?, ?)", + [ + (episode_id, i, s["start"], s["end"], + s.get("speaker", "UNKNOWN"), s["text"]) + for i, s in enumerate(segments) + ] + ) + self._conn.commit() + + def add_qa_pair(self, episode_id: str, q_start: float, q_end: float, + a_start: float, a_end: float, question: str, answer: str, + topic: str = None, tags: list[str] = None): + self._conn.execute( + "INSERT INTO qa_pairs " + "(episode_id, question_start, question_end, answer_start, answer_end, " + "question_text, answer_text, topic, topic_tags) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + (episode_id, q_start, q_end, a_start, a_end, question, answer, + topic, json.dumps(tags or [])) + ) + self._conn.commit() + + # ── Search ───────────────────────────────────────────────────────────── + + def search(self, query: str, speaker_filter: str = None, + limit: int = 20) -> list[SearchResult]: + """Full-text search across all transcript segments.""" + speaker_clause = "" + params = [query, limit] + if speaker_filter: + speaker_clause = "AND s.speaker = ?" + params.insert(1, speaker_filter) + + rows = self._conn.execute(f""" + SELECT s.episode_id, e.date, s.start, s.end, s.speaker, s.text, + e.audio_path, rank + FROM segments_fts f + JOIN segments s ON s.id = f.rowid + JOIN episodes e ON e.episode_id = s.episode_id + WHERE segments_fts MATCH ? + {speaker_clause} + ORDER BY rank + LIMIT ? 
+ """, params).fetchall() + + return [SearchResult( + episode_id=r["episode_id"], date=r["date"] or r["episode_id"], + start=r["start"], end=r["end"], speaker=r["speaker"], + text=r["text"], audio_path=r["audio_path"], score=r["rank"] + ) for r in rows] + + def search_qa(self, query: str, limit: int = 20) -> list[QAResult]: + """Search Q&A pairs — matches against question, answer, and topic.""" + rows = self._conn.execute(""" + SELECT q.episode_id, e.date, q.question_start, q.question_end, + q.answer_start, q.answer_end, q.question_text, q.answer_text, + q.topic, e.audio_path, rank + FROM qa_fts f + JOIN qa_pairs q ON q.id = f.rowid + JOIN episodes e ON e.episode_id = q.episode_id + WHERE qa_fts MATCH ? + ORDER BY rank + LIMIT ? + """, [query, limit]).fetchall() + + return [QAResult( + episode_id=r["episode_id"], date=r["date"] or r["episode_id"], + question_start=r["question_start"], question_end=r["question_end"], + answer_start=r["answer_start"], answer_end=r["answer_end"], + question_text=r["question_text"], answer_text=r["answer_text"], + topic=r["topic"] or "", audio_path=r["audio_path"] + ) for r in rows] + + def stats(self) -> dict: + return { + "episodes": self._conn.execute("SELECT COUNT(*) FROM episodes").fetchone()[0], + "segments": self._conn.execute("SELECT COUNT(*) FROM segments").fetchone()[0], + "qa_pairs": self._conn.execute("SELECT COUNT(*) FROM qa_pairs").fetchone()[0], + } diff --git a/projects/radio-show/audio-processor/src/qa_extractor.py b/projects/radio-show/audio-processor/src/qa_extractor.py new file mode 100644 index 0000000..8f307a2 --- /dev/null +++ b/projects/radio-show/audio-processor/src/qa_extractor.py @@ -0,0 +1,372 @@ +""" +Q&A pair extraction from diarized transcripts. + +Identifies exchanges where a CALLER asks a question and the HOST answers. +Outputs structured Q&A pairs with timestamps for clip extraction and indexing. 
+""" + +import json +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +from rich.console import Console + +console = Console() + +# Phrases that signal a caller is asking a question +QUESTION_SIGNALS = [ + r"\?", + r"\bhow (do|can|should|would|does)\b", + r"\bwhat (is|are|should|can|do|does|about)\b", + r"\bwhy (is|are|does|do|would|should)\b", + r"\bis (it|there|this|that) (true|safe|possible|good|bad|worth)\b", + r"\bshould i\b", + r"\bcan you\b", + r"\bi (was wondering|wanted to ask|have a question)\b", +] + +QUESTION_PATTERN = re.compile("|".join(QUESTION_SIGNALS), re.IGNORECASE) + +# Minimum durations for a meaningful exchange +MIN_QUESTION_DURATION = 5.0 # seconds +MIN_ANSWER_DURATION = 15.0 # seconds +MAX_GAP_BETWEEN_QA = 30.0 # seconds between question end and answer start + +# ── Promo / bumper filter ────────────────────────────────────────────────── +# Promos evolve across years but preserve signature phrases. +# Weight 2 = highly distinctive (one match sufficient to filter). +# Weight 1 = semi-generic (need 2+ to filter). +# A question turn with total score >= PROMO_SCORE_THRESHOLD is suppressed. 
+PROMO_SCORE_THRESHOLD = 2 + +_PROMO_SIGS: list[tuple[re.Pattern, int]] = [ + # Highly distinctive — score 2 each + (re.compile(r"acquired a life of its own", re.I), 2), + (re.compile(r"simply desire a deeper", re.I), 2), + (re.compile(r"tame that beast", re.I), 2), + (re.compile(r"mike swanson will be back after", re.I), 2), + (re.compile(r"heaven forbid.{0,20}virus", re.I | re.DOTALL), 2), + (re.compile(r"mike swanson is answering all", re.I), 2), + # Semi-distinctive — score 1 each, need two to filter + (re.compile(r"\bcomputer running slow\b", re.I), 1), + (re.compile(r"\bafter these messages\b", re.I), 1), + (re.compile(r"\b790.?2040\b", re.I), 1), + (re.compile(r"\bgurushow\.com\b", re.I), 1), + (re.compile(r"\bcall in now\b", re.I), 1), + (re.compile(r"\bcomputer troubles\?", re.I), 1), + (re.compile(r"\bhardware installation\b", re.I), 1), +] + + +def _is_promo_or_bumper(text: str) -> bool: + """Return True if text scores above threshold on show promo/bumper signatures.""" + score = sum(w for pat, w in _PROMO_SIGS if pat.search(text)) + return score >= PROMO_SCORE_THRESHOLD + + +@dataclass +class QAPair: + question_start: float + question_end: float + answer_start: float + answer_end: float + question_text: str + answer_text: str + topic: Optional[str] = None + topic_tags: list[str] = field(default_factory=list) + + def to_dict(self) -> dict: + return { + "question_start": self.question_start, + "question_end": self.question_end, + "answer_start": self.answer_start, + "answer_end": self.answer_end, + "question_text": self.question_text, + "answer_text": self.answer_text, + "topic": self.topic, + "topic_tags": self.topic_tags, + } + + def clip_start(self, padding: float = 1.5) -> float: + return max(0.0, self.question_start - padding) + + def clip_end(self, padding: float = 1.5) -> float: + return self.answer_end + padding + + def duration(self) -> float: + return self.answer_end - self.question_start + + +def extract_qa_pairs(diarized_segments: list[dict]) 
-> list[QAPair]: + """ + Extract caller Q&A pairs from diarized transcript segments. + + Each segment dict: {start, end, text, speaker} + Speaker values: "HOST", "CALLER", "UNKNOWN" + """ + pairs = [] + + # Group consecutive segments by speaker into speaker turns + turns = _merge_consecutive_speaker_turns(diarized_segments) + + # Check if diarization produced any non-HOST speakers + has_caller_labels = any(t["speaker"] in ("CALLER", "UNKNOWN") for t in turns) + + if not has_caller_labels: + # Diarization labels are absent or unreliable — fall back to text-pattern detection + return _extract_qa_text_only(turns) + + i = 0 + while i < len(turns): + turn = turns[i] + + # Look for a CALLER turn that looks like a question + if turn["speaker"] in ("CALLER", "UNKNOWN") and _looks_like_question(turn["text"]): + if _is_promo_or_bumper(turn["text"]): + i += 1 + continue + q_duration = turn["end"] - turn["start"] + if q_duration < MIN_QUESTION_DURATION: + i += 1 + continue + + # Look ahead for HOST answer turn(s) + j = i + 1 + answer_turns = [] + while j < len(turns): + next_turn = turns[j] + gap = next_turn["start"] - turns[j - 1]["end"] + + if gap > MAX_GAP_BETWEEN_QA and not answer_turns: + break # too big a gap before any answer + + if next_turn["speaker"] == "HOST": + answer_turns.append(next_turn) + # Keep collecting consecutive HOST turns + j += 1 + while j < len(turns) and turns[j]["speaker"] == "HOST": + answer_turns.append(turns[j]) + j += 1 + break + elif next_turn["speaker"] in ("CALLER", "UNKNOWN"): + # Another caller turn before host answered — skip this question + break + else: + j += 1 + + if answer_turns: + answer_text = " ".join(t["text"] for t in answer_turns) + answer_duration = answer_turns[-1]["end"] - answer_turns[0]["start"] + + if answer_duration >= MIN_ANSWER_DURATION: + pairs.append(QAPair( + question_start=turn["start"], + question_end=turn["end"], + answer_start=answer_turns[0]["start"], + answer_end=answer_turns[-1]["end"], + 
question_text=turn["text"].strip(), + answer_text=answer_text.strip(), + )) + i = j + continue + + i += 1 + + return pairs + + +# Maximum duration for a question turn in text-only mode — avoids capturing monologues +_MAX_QUESTION_S_TEXT_MODE = 90.0 + +# Caller introduction phrases Mike uses before taking a call +_CALLER_INTRO = re.compile( + r"\b(let'?s go to|going to the phones?|you'?re on the air|on the air|" + r"first caller|next caller|caller from|go ahead|what'?s (your question|going on)|" + r"welcome to the show|thanks for calling|thank you for calling|" + r"our (first|next|last) (caller|call)|taking (a |your )?call)\b", + re.IGNORECASE, +) + + +def _extract_qa_text_only(turns: list[dict]) -> list[QAPair]: + """ + Q&A extraction when speaker labels are unavailable or all HOST. + + Uses text patterns to identify question anchors. Works well for call-in + radio format where callers describe problems and the host answers at length. + Captures both genuine caller questions and Mike's own rhetorical Q&A segments. 
+ """ + pairs = [] + + i = 0 + while i < len(turns): + turn = turns[i] + q_duration = turn["end"] - turn["start"] + + is_q_candidate = ( + _looks_like_question(turn["text"]) + and MIN_QUESTION_DURATION <= q_duration <= _MAX_QUESTION_S_TEXT_MODE + ) + + # Also treat segments immediately after a caller-intro phrase as candidates + if not is_q_candidate and i > 0: + prev_text = turns[i - 1]["text"] + if _CALLER_INTRO.search(prev_text) and q_duration >= MIN_QUESTION_DURATION: + is_q_candidate = True + + if is_q_candidate and _is_promo_or_bumper(turn["text"]): + i += 1 + continue + + if is_q_candidate: + # Collect following segments as the answer until we hit another question + j = i + 1 + answer_turns = [] + + while j < len(turns): + next_turn = turns[j] + gap = next_turn["start"] - turns[j - 1]["end"] + + if gap > MAX_GAP_BETWEEN_QA and not answer_turns: + break + + # Stop collecting if we hit another short question-pattern turn + if ( + _looks_like_question(next_turn["text"]) + and (next_turn["end"] - next_turn["start"]) <= _MAX_QUESTION_S_TEXT_MODE + and answer_turns + ): + break + + answer_turns.append(next_turn) + j += 1 + + # Stop once we have a substantial answer block + if answer_turns: + ans_dur = answer_turns[-1]["end"] - answer_turns[0]["start"] + if ans_dur >= MIN_ANSWER_DURATION * 3: + break + + if answer_turns: + answer_text = " ".join(t["text"] for t in answer_turns) + answer_duration = answer_turns[-1]["end"] - answer_turns[0]["start"] + + if answer_duration >= MIN_ANSWER_DURATION: + pairs.append(QAPair( + question_start=turn["start"], + question_end=turn["end"], + answer_start=answer_turns[0]["start"], + answer_end=answer_turns[-1]["end"], + question_text=turn["text"].strip(), + answer_text=answer_text.strip(), + )) + i = j + continue + + i += 1 + + return pairs + + +def tag_qa_pairs_with_ollama(pairs: list[QAPair], ollama_host: str = "http://localhost:11434", + model: str = "qwen3:14b") -> list[QAPair]: + """Use Ollama to tag each Q&A pair with a 
topic and tags.""" + try: + import ollama + client = ollama.Client(host=ollama_host) + except ImportError: + console.print("[yellow]ollama not installed — skipping topic tagging[/yellow]") + return pairs + + for i, pair in enumerate(pairs): + console.print(f"[dim]Tagging Q&A {i+1}/{len(pairs)}...[/dim]") + try: + prompt = ( + f"A radio show caller asked:\n\"{pair.question_text[:300]}\"\n\n" + f"The host answered:\n\"{pair.answer_text[:500]}\"\n\n" + "Respond with JSON only, no explanation:\n" + '{"topic": "short topic name (3-5 words)", "tags": ["tag1", "tag2", "tag3"]}' + ) + resp = client.chat( + model=model, + messages=[{"role": "user", "content": prompt}], + options={"temperature": 0}, + ) + raw = resp["message"]["content"].strip() + # Extract JSON from response + start = raw.find("{") + end = raw.rfind("}") + 1 + if start >= 0 and end > start: + data = json.loads(raw[start:end]) + pair.topic = data.get("topic", "") + pair.topic_tags = data.get("tags", []) + except Exception as e: + console.print(f"[yellow]Tagging failed for pair {i+1}: {e}[/yellow]") + + return pairs + + +def load_diarized_transcript(transcript_path: Path, + diarization_path: Optional[Path]) -> list[dict]: + """ + Merge transcript and diarization into speaker-labeled segments. + Falls back to HOST-only if no diarization available. 
+ """ + with open(transcript_path) as f: + transcript = json.load(f) + + segments = transcript["segments"] + + if diarization_path is None or not diarization_path.exists(): + return [ + {"start": s["start"], "end": s["end"], + "text": s["text"], "speaker": "HOST"} + for s in segments + ] + + with open(diarization_path) as f: + diarization = json.load(f) + + turns = diarization.get("turns", []) + + def speaker_at(t: float) -> str: + """Find which diarization turn covers time t.""" + for turn in turns: + if turn["start"] <= t <= turn["end"]: + return turn["speaker"] + return "UNKNOWN" + + return [ + {"start": s["start"], "end": s["end"], + "text": s["text"], + "speaker": speaker_at((s["start"] + s["end"]) / 2)} + for s in segments + ] + + +# ── Helpers ──────────────────────────────────────────────────────────────── + +def _looks_like_question(text: str) -> bool: + return bool(QUESTION_PATTERN.search(text)) + + +def _merge_consecutive_speaker_turns(segments: list[dict]) -> list[dict]: + """Merge adjacent segments from the same speaker into continuous turns.""" + if not segments: + return [] + + turns = [] + current = dict(segments[0]) + + for seg in segments[1:]: + if seg["speaker"] == current["speaker"]: + current["end"] = seg["end"] + current["text"] = current["text"].rstrip() + " " + seg["text"].lstrip() + else: + turns.append(current) + current = dict(seg) + + turns.append(current) + return turns diff --git a/projects/radio-show/audio-processor/src/show_prep.py b/projects/radio-show/audio-processor/src/show_prep.py new file mode 100644 index 0000000..ce28792 --- /dev/null +++ b/projects/radio-show/audio-processor/src/show_prep.py @@ -0,0 +1,206 @@ +""" +Show prep generator: search the archive index for past caller topics, +extract clips, and generate "then vs now" talking points via Ollama. 
+""" + +import json +from pathlib import Path +from datetime import datetime + +from rich.console import Console +from rich.panel import Panel +from rich.table import Table +from rich import box + +from .indexer import ArchiveIndex, QAResult, SearchResult +from .clip_extractor import extract_clips_for_results, format_timestamp + +console = Console() + + +def generate_show_prep( + index: ArchiveIndex, + topic: str, + output_dir: Path, + extract_clips: bool = True, + ollama_host: str = "http://localhost:11434", + ollama_model: str = "qwen3:14b", + limit: int = 10, +) -> Path: + """ + Search the archive for past discussions of a topic. + Extracts audio clips and generates "then vs now" talking points. + Returns path to the generated markdown prep file. + """ + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + console.print(Panel.fit(f"[bold]Show Prep:[/bold] {topic}", border_style="blue")) + + # Search Q&A pairs first (caller exchanges) + qa_results = index.search_qa(topic, limit=limit) + # Also search raw segments (for monologue mentions) + segment_results = index.search(topic, limit=limit) + + if not qa_results and not segment_results: + console.print(f"[yellow]No results found for: {topic}[/yellow]") + return None + + # Display results table + _print_results_table(qa_results, segment_results, topic) + + # Extract clips + clip_paths = {} + if extract_clips and qa_results: + clips_dir = output_dir / "clips" + console.print(f"\n[dim]Extracting {len(qa_results)} clip(s)...[/dim]") + clip_paths = extract_clips_for_results(qa_results, clips_dir) + + # Generate then-vs-now content via Ollama + then_now = _generate_then_vs_now(topic, qa_results, segment_results, + ollama_host, ollama_model) + + # Write markdown prep file + safe_topic = topic.lower().replace(" ", "-").replace("/", "-")[:40] + date_str = datetime.now().strftime("%Y-%m-%d") + prep_path = output_dir / f"{date_str}-{safe_topic}-prep.md" + + _write_prep_file(prep_path, topic, 
qa_results, segment_results, + clip_paths, then_now) + + console.print(f"\n[bold green]Prep file:[/bold green] {prep_path}") + return prep_path + + +def _print_results_table(qa_results: list[QAResult], segment_results: list[SearchResult], + topic: str): + if qa_results: + table = Table(title=f"Caller Q&A — \"{topic}\"", box=box.SIMPLE, show_lines=True) + table.add_column("Date", style="cyan", width=12) + table.add_column("Timestamps", style="dim", width=14) + table.add_column("Duration", style="dim", width=8) + table.add_column("Caller asked", width=35) + table.add_column("Topic", style="green", width=20) + + for r in qa_results: + dur = r.duration() + table.add_row( + r.date or r.episode_id, + r.timestamp_str(), + f"{int(dur//60)}m{int(dur%60):02d}s", + r.question_text[:80] + ("…" if len(r.question_text) > 80 else ""), + r.topic or "—", + ) + console.print(table) + + if segment_results and not qa_results: + console.print(f"\n[dim]No structured Q&A found. Showing {len(segment_results)} " + f"transcript mentions:[/dim]") + for r in segment_results: + console.print(f" [cyan]{r.date}[/cyan] [{r.timestamp_str()}] " + f"[dim]{r.speaker}[/dim]: {r.text[:100]}…") + + +def _generate_then_vs_now(topic: str, qa_results: list, segment_results: list, + ollama_host: str, model: str) -> str: + try: + import ollama + client = ollama.Client(host=ollama_host) + except ImportError: + return "_Ollama not available — install with: pip install ollama_" + + # Build context from past discussions + past_context = "" + for r in qa_results[:5]: + date = r.date or r.episode_id + past_context += f"\n[{date}] Caller: {r.question_text[:200]}\n" + past_context += f"Host answer: {r.answer_text[:400]}\n" + + if not past_context and segment_results: + for r in segment_results[:5]: + past_context += f"\n[{r.date}] {r.speaker}: {r.text[:300]}\n" + + if not past_context: + return "" + + prompt = f"""You are helping prepare talking points for a technology radio show host. 
+The host discussed "{topic}" in past episodes. Here are excerpts: + +{past_context} + +The host wants to do a new segment revisiting this topic. + +Write talking points in this format: +## What I Said Then +- [2-3 bullets summarizing the past advice/position] + +## What's Changed Since Then +- [2-3 bullets on how the technology/situation has evolved] + +## Why My Answer Is Different Now +- [2-3 bullets on the updated recommendation/position] + +## Suggested Opening +[1-2 sentences the host can use to open the segment, referencing the old clip] + +Keep it conversational, radio-friendly. Be specific about what actually changed.""" + + try: + resp = client.chat( + model=model, + messages=[{"role": "user", "content": prompt}], + options={"temperature": 0.3}, + ) + return resp["message"]["content"] + except Exception as e: + return f"_Ollama generation failed: {e}_" + + +def _write_prep_file(path: Path, topic: str, qa_results: list, segment_results: list, + clip_paths: dict, then_now: str): + lines = [ + f"# Show Prep: {topic}", + f"", + f"_Generated {datetime.now().strftime('%Y-%m-%d %H:%M')}_", + f"", + ] + + if qa_results: + lines += [f"## Past Caller Exchanges ({len(qa_results)} found)", ""] + for i, r in enumerate(qa_results): + clip_info = "" + if i in clip_paths: + clip_info = f" — `{clip_paths[i].name}`" + lines += [ + f"### {r.date or r.episode_id} — [{r.timestamp_str()}]{clip_info}", + f"**Caller:** {r.question_text}", + f"", + f"**Host:** {r.answer_text[:600]}{'…' if len(r.answer_text) > 600 else ''}", + f"", + ] + + elif segment_results: + lines += [f"## Transcript Mentions ({len(segment_results)} found)", ""] + for r in segment_results: + lines += [ + f"- **{r.date}** [{r.timestamp_str()}] ({r.speaker}): {r.text[:200]}", + ] + lines.append("") + + if then_now: + lines += ["## Then vs Now", "", then_now, ""] + + if clip_paths: + lines += [ + "## Clips", + "", + f"Extracted to `clips/` — drag into Audition/Audacity:", + "", + ] + for i, p in 
clip_paths.items(): + if i < len(qa_results): + r = qa_results[i] + lines.append(f"- `{p.name}` — {r.date} [{r.timestamp_str()}]") + lines.append("") + + path.write_text("\n".join(lines), encoding="utf-8") diff --git a/projects/radio-show/audio-processor/src/voice_profiler.py b/projects/radio-show/audio-processor/src/voice_profiler.py index 991edbd..27b1d4a 100644 --- a/projects/radio-show/audio-processor/src/voice_profiler.py +++ b/projects/radio-show/audio-processor/src/voice_profiler.py @@ -13,7 +13,6 @@ import numpy as np import torch import soundfile as sf from rich.console import Console -from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn from rich.table import Table console = Console() @@ -159,36 +158,43 @@ class VoiceProfiler: def extract_embedding(self, audio_path: Path, start: float = 0.0, end: float | None = None) -> np.ndarray: - """Extract a speaker embedding from an audio segment.""" - model = self._get_model() + """Extract a speaker embedding from an audio segment (file-based, any format).""" + self._get_model() + waveform, _ = self._load_audio_segment(audio_path, start, end) + return self._embed_audio_np(waveform.squeeze(0).numpy()) - # Load audio segment (already at SAMPLE_RATE via ffmpeg) - waveform, sr = self._load_audio_segment(audio_path, start, end) - - # waveform is [1, samples] tensor, need just the numpy array for the extractor - audio_np = waveform.squeeze(0).numpy() - - # Extract features + def _embed_audio_np(self, audio_np: np.ndarray) -> np.ndarray: + """Embed a float32 mono numpy array (already at SAMPLE_RATE). 
Returns L2-normalized embedding.""" + self._get_model() inputs = self._extractor( audio_np, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True, ) - - # Get embedding with torch.no_grad(): - outputs = model(**{k: v.to(self.device) for k, v in inputs.items()}) - + outputs = self._model(**{k: v.to(self.device) for k, v in inputs.items()}) embedding = outputs.embeddings.squeeze().cpu().numpy() - # L2 normalize norm = np.linalg.norm(embedding) if norm > 0: embedding = embedding / norm - return embedding + def _load_full_audio(self, audio_path: Path) -> np.ndarray: + """Decode entire audio file to float32 mono at SAMPLE_RATE via a single ffmpeg call.""" + cmd = [ + "ffmpeg", "-i", str(audio_path), + "-f", "wav", "-ac", "1", "-ar", str(SAMPLE_RATE), + "-acodec", "pcm_s16le", "pipe:1", + ] + result = subprocess.run(cmd, capture_output=True, timeout=600) + if result.returncode != 0: + raise RuntimeError(f"ffmpeg failed: {result.stderr.decode()[:200]}") + import io + data, _ = sf.read(io.BytesIO(result.stdout), dtype="float32") + return data # shape: (samples,) + def _load_audio_segment(self, audio_path: Path, start: float = 0.0, end: float | None = None) -> tuple[torch.Tensor, int]: - """Load an audio segment using ffmpeg (handles any format).""" + """Load a single audio segment via ffmpeg (used for one-off extraction).""" cmd = ["ffmpeg", "-i", str(audio_path)] if start > 0: cmd.extend(["-ss", str(start)]) @@ -227,68 +233,39 @@ class VoiceProfiler: profile = self.profiles[host_name] - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("{task.completed}/{task.total}"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = progress.add_task("Processing episodes...", - total=len(episode_paths)) + for ep_idx, ep_path in enumerate(episode_paths, 1): + console.print(f"[dim] [{ep_idx}/{len(episode_paths)}] {ep_path.name}[/dim]") - for ep_path in episode_paths: - progress.update(task, 
description=f"Processing {ep_path.name}...") + try: + duration = self._get_duration(ep_path) - try: - # Get episode duration - duration = self._get_duration(ep_path) + windows = [] + if duration > 90: + windows.append((30.0, 90.0)) + if duration > 180: + windows.append((120.0, 180.0)) + mid = duration / 2 + if mid > 60: + windows.append((mid, min(mid + 60, duration))) + late = duration - 180 + if late > 300: + windows.append((late, late + 60)) - # Strategy: extract embeddings from multiple time windows - # Skip first 30s (likely intro jingle), then sample every 2 min - windows = [] + chunk_duration = 10.0 + for start, end in windows: + for chunk_start in np.arange(start, end - chunk_duration, chunk_duration): + try: + emb = self.extract_embedding( + ep_path, chunk_start, chunk_start + chunk_duration + ) + profile.embeddings.append(emb) + except Exception as e: + console.print(f" [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]") - # Window 1: After intro (30s-90s) — usually host monologue - if duration > 90: - windows.append((30.0, 90.0)) + profile.source_episodes.append(ep_path.name) - # Window 2: Early show (2min-3min) - if duration > 180: - windows.append((120.0, 180.0)) - - # Window 3: Mid show - mid = duration / 2 - if mid > 60: - windows.append((mid, min(mid + 60, duration))) - - # Window 4: Late show (but not last 2 min — likely outro) - late = duration - 180 - if late > 300: - windows.append((late, late + 60)) - - for start, end in windows: - # Extract 10-second chunks within each window - # and take the embedding of each chunk - chunk_duration = 10.0 - for chunk_start in np.arange(start, end - chunk_duration, - chunk_duration): - try: - emb = self.extract_embedding( - ep_path, chunk_start, - chunk_start + chunk_duration - ) - profile.embeddings.append(emb) - except Exception as e: - console.print(f" [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]") - continue - - profile.source_episodes.append(ep_path.name) - - except Exception as e: - 
console.print(f" [red]Failed: {ep_path.name}: {e}[/red]") - - progress.update(task, advance=1) + except Exception as e: + console.print(f" [red]Failed: {ep_path.name}: {e}[/red]") # Compute composite profile.compute_composite() @@ -305,58 +282,66 @@ class VoiceProfiler: threshold: float = 0.70) -> list[VoiceSegment]: """Identify speakers throughout an audio file using sliding window. + Loads the full audio once then slices in memory — avoids spawning + hundreds of ffmpeg subprocesses. Returns timestamped segments with speaker labels and embeddings. """ console.print(f"[bold]Identifying speakers:[/bold] {audio_path.name}") duration = self._get_duration(audio_path) + console.print(f"[dim]Loading audio into memory...[/dim]") + audio = self._load_full_audio(audio_path) # float32 mono array + self._get_model() # ensure model is warm before the loop + segments = [] + window_samples = int(window_s * SAMPLE_RATE) + hop_samples = int(hop_s * SAMPLE_RATE) + total_samples = len(audio) - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("{task.percentage:>3.0f}%"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = progress.add_task("Analyzing speakers...", - total=int(duration)) + total_windows = int((duration - window_s) / hop_s) + 1 + report_every = max(1, total_windows // 10) - for start in np.arange(0, duration - window_s, hop_s): - end = min(start + window_s, duration) + for idx, start in enumerate(np.arange(0, duration - window_s, hop_s)): + end = min(start + window_s, duration) + s = int(start * SAMPLE_RATE) + e = min(s + window_samples, total_samples) - try: - emb = self.extract_embedding(audio_path, start, end) + try: + emb = self._embed_audio_np(audio[s:e]) - # Match against known profiles - best_match = None - best_score = 0.0 + best_match = None + best_score = 0.0 - for name, profile in self.profiles.items(): - score = profile.similarity(emb) - if score > best_score: - 
best_score = score - best_match = name + for name, profile in self.profiles.items(): + score = profile.similarity(emb) + if score > best_score: + best_score = score + best_match = name - label = best_match if best_score >= threshold else "Unknown" + if best_score >= threshold: if best_match and self.profiles[best_match].role == "host": label = f"Host: {best_match}" + else: + label = best_match + else: + label = "Unknown" - segments.append(VoiceSegment( - start=start, - end=end, - embedding=emb, - speaker_label=f"{label} ({best_score:.2f})", - )) + segments.append(VoiceSegment( + start=start, + end=end, + embedding=emb, + speaker_label=f"{label} ({best_score:.2f})", + )) - except Exception: - segments.append(VoiceSegment( - start=start, end=end, - speaker_label="[error]", - )) + except Exception: + segments.append(VoiceSegment( + start=start, end=end, + speaker_label="[error]", + )) - progress.update(task, completed=int(end)) + if idx % report_every == 0: + pct = int(end / duration * 100) + console.print(f"[dim] {pct}% ({end:.0f}s / {duration:.0f}s)[/dim]") # Print summary self._print_speaker_summary(segments, duration) diff --git a/projects/radio-show/audio-processor/training-data/transcripts/2010-10-02-hr1/diarization.json b/projects/radio-show/audio-processor/training-data/transcripts/2010-10-02-hr1/diarization.json new file mode 100644 index 0000000..04cf07b --- /dev/null +++ b/projects/radio-show/audio-processor/training-data/transcripts/2010-10-02-hr1/diarization.json @@ -0,0 +1,327 @@ +{ + "num_speakers": 2, + "speaker_map": { + "HOST": "HOST", + "CALLER": "CALLER" + }, + "turns": [ + { + "speaker": "HOST", + "start": 0.0, + "end": 30.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 25.0, + "end": 35.0, + "confidence": 0.79 + }, + { + "speaker": "HOST", + "start": 30.0, + "end": 105.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 100.0, + "end": 115.0, + "confidence": 0.56 + }, + { + "speaker": "HOST", + "start": 110.0, + 
"end": 160.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 155.0, + "end": 170.0, + "confidence": 0.64 + }, + { + "speaker": "HOST", + "start": 165.0, + "end": 210.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 205.0, + "end": 225.0, + "confidence": 0.79 + }, + { + "speaker": "HOST", + "start": 220.0, + "end": 235.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 230.0, + "end": 240.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 235.0, + "end": 290.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 285.0, + "end": 320.0, + "confidence": 0.55 + }, + { + "speaker": "HOST", + "start": 315.0, + "end": 415.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 410.0, + "end": 420.0, + "confidence": 0.8 + }, + { + "speaker": "HOST", + "start": 415.0, + "end": 440.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 435.0, + "end": 455.0, + "confidence": 0.57 + }, + { + "speaker": "HOST", + "start": 450.0, + "end": 530.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 525.0, + "end": 535.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 530.0, + "end": 595.0, + "confidence": 0.88 + }, + { + "speaker": "CALLER", + "start": 590.0, + "end": 605.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 600.0, + "end": 870.0, + "confidence": 0.89 + }, + { + "speaker": "CALLER", + "start": 865.0, + "end": 930.0, + "confidence": 0.6 + }, + { + "speaker": "HOST", + "start": 925.0, + "end": 945.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 940.0, + "end": 970.0, + "confidence": 0.54 + }, + { + "speaker": "HOST", + "start": 965.0, + "end": 1015.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 1010.0, + "end": 1040.0, + "confidence": 0.49 + }, + { + "speaker": "HOST", + "start": 1035.0, + "end": 1060.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 1055.0, + "end": 1070.0, + 
"confidence": 0.65 + }, + { + "speaker": "HOST", + "start": 1065.0, + "end": 1105.0, + "confidence": 0.88 + }, + { + "speaker": "CALLER", + "start": 1100.0, + "end": 1120.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 1115.0, + "end": 1225.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 1220.0, + "end": 1230.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 1225.0, + "end": 1265.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 1260.0, + "end": 1275.0, + "confidence": 0.62 + }, + { + "speaker": "HOST", + "start": 1270.0, + "end": 1310.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 1305.0, + "end": 1320.0, + "confidence": 0.57 + }, + { + "speaker": "HOST", + "start": 1315.0, + "end": 1340.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 1335.0, + "end": 1350.0, + "confidence": 0.57 + }, + { + "speaker": "HOST", + "start": 1345.0, + "end": 1705.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 1700.0, + "end": 1725.0, + "confidence": 0.65 + }, + { + "speaker": "HOST", + "start": 1720.0, + "end": 1825.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 1820.0, + "end": 1840.0, + "confidence": 0.65 + }, + { + "speaker": "HOST", + "start": 1835.0, + "end": 1900.0, + "confidence": 0.92 + }, + { + "speaker": "CALLER", + "start": 1895.0, + "end": 1925.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 1920.0, + "end": 1935.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 1930.0, + "end": 1940.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 1935.0, + "end": 1955.0, + "confidence": 0.88 + }, + { + "speaker": "CALLER", + "start": 1950.0, + "end": 1960.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 1955.0, + "end": 2050.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 2045.0, + "end": 2055.0, + "confidence": 0.49 + }, + { + "speaker": "HOST", + "start": 
2050.0, + "end": 2455.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 2450.0, + "end": 2465.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 2460.0, + "end": 2675.0, + "confidence": 0.87 + } + ] +} \ No newline at end of file diff --git a/projects/radio-show/audio-processor/training-data/transcripts/2011-06-04-hr1/diarization.json b/projects/radio-show/audio-processor/training-data/transcripts/2011-06-04-hr1/diarization.json new file mode 100644 index 0000000..833ab81 --- /dev/null +++ b/projects/radio-show/audio-processor/training-data/transcripts/2011-06-04-hr1/diarization.json @@ -0,0 +1,111 @@ +{ + "num_speakers": 2, + "speaker_map": { + "HOST": "HOST", + "CALLER": "CALLER" + }, + "turns": [ + { + "speaker": "HOST", + "start": 0.0, + "end": 20.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 15.0, + "end": 35.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 30.0, + "end": 475.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 470.0, + "end": 495.0, + "confidence": 0.79 + }, + { + "speaker": "HOST", + "start": 490.0, + "end": 635.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 630.0, + "end": 640.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 635.0, + "end": 720.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 715.0, + "end": 725.0, + "confidence": 0.62 + }, + { + "speaker": "HOST", + "start": 720.0, + "end": 1310.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 1305.0, + "end": 1320.0, + "confidence": 0.7 + }, + { + "speaker": "HOST", + "start": 1315.0, + "end": 1565.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 1560.0, + "end": 1570.0, + "confidence": 0.68 + }, + { + "speaker": "HOST", + "start": 1565.0, + "end": 2055.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 2050.0, + "end": 2060.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 2055.0, + "end": 
2080.0, + "confidence": 0.92 + }, + { + "speaker": "CALLER", + "start": 2075.0, + "end": 2090.0, + "confidence": 0.76 + }, + { + "speaker": "HOST", + "start": 2085.0, + "end": 2560.0, + "confidence": 0.85 + } + ] +} \ No newline at end of file diff --git a/projects/radio-show/audio-processor/training-data/transcripts/2011-09-10-hr1/diarization.json b/projects/radio-show/audio-processor/training-data/transcripts/2011-09-10-hr1/diarization.json new file mode 100644 index 0000000..9f115f6 --- /dev/null +++ b/projects/radio-show/audio-processor/training-data/transcripts/2011-09-10-hr1/diarization.json @@ -0,0 +1,39 @@ +{ + "num_speakers": 2, + "speaker_map": { + "HOST": "HOST", + "CALLER": "CALLER" + }, + "turns": [ + { + "speaker": "HOST", + "start": 0.0, + "end": 20.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 15.0, + "end": 25.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 20.0, + "end": 1855.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 1850.0, + "end": 1860.0, + "confidence": 0.78 + }, + { + "speaker": "HOST", + "start": 1855.0, + "end": 2505.0, + "confidence": 0.93 + } + ] +} \ No newline at end of file diff --git a/projects/radio-show/audio-processor/training-data/transcripts/2014-s6e05/diarization.json b/projects/radio-show/audio-processor/training-data/transcripts/2014-s6e05/diarization.json new file mode 100644 index 0000000..0448d71 --- /dev/null +++ b/projects/radio-show/audio-processor/training-data/transcripts/2014-s6e05/diarization.json @@ -0,0 +1,189 @@ +{ + "num_speakers": 2, + "speaker_map": { + "CALLER": "CALLER", + "HOST": "HOST" + }, + "turns": [ + { + "speaker": "CALLER", + "start": 0.0, + "end": 40.0, + "confidence": 0.61 + }, + { + "speaker": "HOST", + "start": 35.0, + "end": 530.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 525.0, + "end": 540.0, + "confidence": 0.66 + }, + { + "speaker": "HOST", + "start": 535.0, + "end": 595.0, + "confidence": 0.87 + }, + { + 
"speaker": "CALLER", + "start": 590.0, + "end": 630.0, + "confidence": 0.64 + }, + { + "speaker": "HOST", + "start": 625.0, + "end": 1620.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 1615.0, + "end": 1640.0, + "confidence": 0.78 + }, + { + "speaker": "HOST", + "start": 1635.0, + "end": 1715.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 1710.0, + "end": 1730.0, + "confidence": 0.74 + }, + { + "speaker": "HOST", + "start": 1725.0, + "end": 1790.0, + "confidence": 0.89 + }, + { + "speaker": "CALLER", + "start": 1785.0, + "end": 1815.0, + "confidence": 0.66 + }, + { + "speaker": "HOST", + "start": 1810.0, + "end": 1820.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 1815.0, + "end": 1835.0, + "confidence": 0.65 + }, + { + "speaker": "HOST", + "start": 1830.0, + "end": 1845.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 1840.0, + "end": 1850.0, + "confidence": 0.67 + }, + { + "speaker": "HOST", + "start": 1845.0, + "end": 1910.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 1905.0, + "end": 1925.0, + "confidence": 0.72 + }, + { + "speaker": "HOST", + "start": 1920.0, + "end": 1940.0, + "confidence": 0.89 + }, + { + "speaker": "CALLER", + "start": 1935.0, + "end": 1960.0, + "confidence": 0.66 + }, + { + "speaker": "HOST", + "start": 1955.0, + "end": 1985.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 1980.0, + "end": 2000.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 1995.0, + "end": 2065.0, + "confidence": 0.91 + }, + { + "speaker": "CALLER", + "start": 2060.0, + "end": 2070.0, + "confidence": 0.74 + }, + { + "speaker": "HOST", + "start": 2065.0, + "end": 2190.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 2185.0, + "end": 2220.0, + "confidence": 0.63 + }, + { + "speaker": "HOST", + "start": 2215.0, + "end": 2370.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 2365.0, + "end": 2375.0, + 
"confidence": 0.61 + }, + { + "speaker": "HOST", + "start": 2370.0, + "end": 2770.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 2765.0, + "end": 2780.0, + "confidence": 0.76 + }, + { + "speaker": "HOST", + "start": 2775.0, + "end": 2845.0, + "confidence": 0.96 + } + ] +} \ No newline at end of file diff --git a/projects/radio-show/audio-processor/training-data/transcripts/2015-s7e30/diarization.json b/projects/radio-show/audio-processor/training-data/transcripts/2015-s7e30/diarization.json new file mode 100644 index 0000000..2dd15fa --- /dev/null +++ b/projects/radio-show/audio-processor/training-data/transcripts/2015-s7e30/diarization.json @@ -0,0 +1,177 @@ +{ + "num_speakers": 2, + "speaker_map": { + "CALLER": "CALLER", + "HOST": "HOST" + }, + "turns": [ + { + "speaker": "CALLER", + "start": 0.0, + "end": 45.0, + "confidence": 0.66 + }, + { + "speaker": "HOST", + "start": 40.0, + "end": 655.0, + "confidence": 0.92 + }, + { + "speaker": "CALLER", + "start": 650.0, + "end": 690.0, + "confidence": 0.64 + }, + { + "speaker": "HOST", + "start": 685.0, + "end": 1350.0, + "confidence": 0.99 + }, + { + "speaker": "CALLER", + "start": 1345.0, + "end": 1380.0, + "confidence": 0.54 + }, + { + "speaker": "HOST", + "start": 1375.0, + "end": 1395.0, + "confidence": 0.99 + }, + { + "speaker": "CALLER", + "start": 1390.0, + "end": 1440.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 1435.0, + "end": 1485.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 1480.0, + "end": 1495.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 1490.0, + "end": 1515.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 1510.0, + "end": 1540.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 1535.0, + "end": 1625.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 1620.0, + "end": 1635.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 1630.0, + "end": 1720.0, + "confidence": 
0.96 + }, + { + "speaker": "CALLER", + "start": 1715.0, + "end": 1725.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 1720.0, + "end": 1860.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 1855.0, + "end": 1870.0, + "confidence": 0.78 + }, + { + "speaker": "HOST", + "start": 1865.0, + "end": 2015.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 2010.0, + "end": 2035.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 2030.0, + "end": 2055.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 2050.0, + "end": 2070.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 2065.0, + "end": 2075.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 2070.0, + "end": 2085.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 2080.0, + "end": 2105.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 2100.0, + "end": 2110.0, + "confidence": 0.77 + }, + { + "speaker": "HOST", + "start": 2105.0, + "end": 2345.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 2340.0, + "end": 2390.0, + "confidence": 0.68 + }, + { + "speaker": "HOST", + "start": 2385.0, + "end": 2720.0, + "confidence": 0.92 + } + ] +} \ No newline at end of file diff --git a/projects/radio-show/audio-processor/training-data/transcripts/2016-s8e42/diarization.json b/projects/radio-show/audio-processor/training-data/transcripts/2016-s8e42/diarization.json new file mode 100644 index 0000000..f0579fe --- /dev/null +++ b/projects/radio-show/audio-processor/training-data/transcripts/2016-s8e42/diarization.json @@ -0,0 +1,393 @@ +{ + "num_speakers": 2, + "speaker_map": { + "CALLER": "CALLER", + "HOST": "HOST" + }, + "turns": [ + { + "speaker": "CALLER", + "start": 0.0, + "end": 40.0, + "confidence": 0.68 + }, + { + "speaker": "HOST", + "start": 35.0, + "end": 380.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 375.0, + "end": 385.0, + "confidence": 0.84 + }, + 
{ + "speaker": "HOST", + "start": 380.0, + "end": 685.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 680.0, + "end": 700.0, + "confidence": 0.61 + }, + { + "speaker": "HOST", + "start": 695.0, + "end": 705.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 700.0, + "end": 710.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 705.0, + "end": 1235.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 1230.0, + "end": 1240.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 1235.0, + "end": 1365.0, + "confidence": 0.91 + }, + { + "speaker": "CALLER", + "start": 1360.0, + "end": 1410.0, + "confidence": 0.79 + }, + { + "speaker": "HOST", + "start": 1405.0, + "end": 1430.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 1425.0, + "end": 1435.0, + "confidence": 0.78 + }, + { + "speaker": "HOST", + "start": 1430.0, + "end": 1440.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 1435.0, + "end": 1455.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 1450.0, + "end": 1475.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 1470.0, + "end": 1480.0, + "confidence": 0.77 + }, + { + "speaker": "HOST", + "start": 1475.0, + "end": 2155.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 2150.0, + "end": 2185.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 2180.0, + "end": 2195.0, + "confidence": 0.89 + }, + { + "speaker": "CALLER", + "start": 2190.0, + "end": 2205.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 2200.0, + "end": 2285.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 2280.0, + "end": 2295.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 2290.0, + "end": 2505.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 2500.0, + "end": 2510.0, + "confidence": 0.79 + }, + { + "speaker": "HOST", + "start": 2505.0, + "end": 2535.0, + 
"confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 2530.0, + "end": 2565.0, + "confidence": 0.45 + }, + { + "speaker": "HOST", + "start": 2560.0, + "end": 2665.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 2660.0, + "end": 2670.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 2665.0, + "end": 3040.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 3035.0, + "end": 3045.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 3040.0, + "end": 3360.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 3355.0, + "end": 3370.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 3365.0, + "end": 3385.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 3380.0, + "end": 3395.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 3390.0, + "end": 3475.0, + "confidence": 0.92 + }, + { + "speaker": "CALLER", + "start": 3470.0, + "end": 3480.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 3475.0, + "end": 3520.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 3515.0, + "end": 3550.0, + "confidence": 0.59 + }, + { + "speaker": "HOST", + "start": 3545.0, + "end": 3555.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 3550.0, + "end": 3660.0, + "confidence": 0.56 + }, + { + "speaker": "HOST", + "start": 3655.0, + "end": 3695.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 3690.0, + "end": 3705.0, + "confidence": 0.49 + }, + { + "speaker": "HOST", + "start": 3700.0, + "end": 3730.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 3725.0, + "end": 3750.0, + "confidence": 0.6 + }, + { + "speaker": "HOST", + "start": 3745.0, + "end": 3800.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 3795.0, + "end": 3810.0, + "confidence": 0.57 + }, + { + "speaker": "HOST", + "start": 3805.0, + "end": 3900.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 
3895.0, + "end": 3905.0, + "confidence": 0.46 + }, + { + "speaker": "HOST", + "start": 3900.0, + "end": 4080.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 4075.0, + "end": 4085.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 4080.0, + "end": 4210.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 4205.0, + "end": 4250.0, + "confidence": 0.65 + }, + { + "speaker": "HOST", + "start": 4245.0, + "end": 4595.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 4590.0, + "end": 4600.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 4595.0, + "end": 4765.0, + "confidence": 0.88 + }, + { + "speaker": "CALLER", + "start": 4760.0, + "end": 4830.0, + "confidence": 0.49 + }, + { + "speaker": "HOST", + "start": 4825.0, + "end": 4885.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 4880.0, + "end": 4910.0, + "confidence": 0.56 + }, + { + "speaker": "HOST", + "start": 4905.0, + "end": 4940.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 4935.0, + "end": 4975.0, + "confidence": 0.52 + }, + { + "speaker": "HOST", + "start": 4970.0, + "end": 5125.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 5120.0, + "end": 5130.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 5125.0, + "end": 5420.0, + "confidence": 0.91 + } + ] +} \ No newline at end of file diff --git a/projects/radio-show/audio-processor/training-data/transcripts/2017-s9e26/diarization.json b/projects/radio-show/audio-processor/training-data/transcripts/2017-s9e26/diarization.json new file mode 100644 index 0000000..9ee2d01 --- /dev/null +++ b/projects/radio-show/audio-processor/training-data/transcripts/2017-s9e26/diarization.json @@ -0,0 +1,567 @@ +{ + "num_speakers": 2, + "speaker_map": { + "HOST": "HOST", + "CALLER": "CALLER" + }, + "turns": [ + { + "speaker": "HOST", + "start": 0.0, + "end": 20.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 15.0, + 
"end": 25.0, + "confidence": 0.65 + }, + { + "speaker": "HOST", + "start": 20.0, + "end": 90.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 85.0, + "end": 95.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 90.0, + "end": 195.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 190.0, + "end": 200.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 195.0, + "end": 565.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 560.0, + "end": 625.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 620.0, + "end": 630.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 625.0, + "end": 665.0, + "confidence": 0.6 + }, + { + "speaker": "HOST", + "start": 660.0, + "end": 1090.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 1085.0, + "end": 1095.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 1090.0, + "end": 1165.0, + "confidence": 0.92 + }, + { + "speaker": "CALLER", + "start": 1160.0, + "end": 1170.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 1165.0, + "end": 1345.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 1340.0, + "end": 1350.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 1345.0, + "end": 1525.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 1520.0, + "end": 1560.0, + "confidence": 0.69 + }, + { + "speaker": "HOST", + "start": 1555.0, + "end": 1920.0, + "confidence": 0.92 + }, + { + "speaker": "CALLER", + "start": 1915.0, + "end": 1925.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 1920.0, + "end": 2050.0, + "confidence": 0.93 + }, + { + "speaker": "CALLER", + "start": 2045.0, + "end": 2055.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 2050.0, + "end": 2075.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 2070.0, + "end": 2085.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 2080.0, + 
"end": 2140.0, + "confidence": 0.88 + }, + { + "speaker": "CALLER", + "start": 2135.0, + "end": 2145.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 2140.0, + "end": 2295.0, + "confidence": 0.89 + }, + { + "speaker": "CALLER", + "start": 2290.0, + "end": 2300.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 2295.0, + "end": 2405.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 2400.0, + "end": 2460.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 2455.0, + "end": 2660.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 2655.0, + "end": 2665.0, + "confidence": 0.67 + }, + { + "speaker": "HOST", + "start": 2660.0, + "end": 2715.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 2710.0, + "end": 2725.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 2720.0, + "end": 2755.0, + "confidence": 0.92 + }, + { + "speaker": "CALLER", + "start": 2750.0, + "end": 2760.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 2755.0, + "end": 2970.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 2965.0, + "end": 3030.0, + "confidence": 0.76 + }, + { + "speaker": "HOST", + "start": 3025.0, + "end": 3045.0, + "confidence": 0.89 + }, + { + "speaker": "CALLER", + "start": 3040.0, + "end": 3055.0, + "confidence": 0.79 + }, + { + "speaker": "HOST", + "start": 3050.0, + "end": 3105.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 3100.0, + "end": 3165.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 3160.0, + "end": 3205.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 3200.0, + "end": 3210.0, + "confidence": 0.79 + }, + { + "speaker": "HOST", + "start": 3205.0, + "end": 3385.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 3380.0, + "end": 3420.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 3415.0, + "end": 3430.0, + "confidence": 0.94 + }, + { + "speaker": 
"CALLER", + "start": 3425.0, + "end": 3445.0, + "confidence": 0.8 + }, + { + "speaker": "HOST", + "start": 3440.0, + "end": 3475.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 3470.0, + "end": 3490.0, + "confidence": 0.8 + }, + { + "speaker": "HOST", + "start": 3485.0, + "end": 3510.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 3505.0, + "end": 3555.0, + "confidence": 0.67 + }, + { + "speaker": "HOST", + "start": 3550.0, + "end": 3570.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 3565.0, + "end": 3590.0, + "confidence": 0.48 + }, + { + "speaker": "HOST", + "start": 3585.0, + "end": 4015.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 4010.0, + "end": 4020.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 4015.0, + "end": 4075.0, + "confidence": 0.91 + }, + { + "speaker": "CALLER", + "start": 4070.0, + "end": 4130.0, + "confidence": 0.8 + }, + { + "speaker": "HOST", + "start": 4125.0, + "end": 4180.0, + "confidence": 0.93 + }, + { + "speaker": "CALLER", + "start": 4175.0, + "end": 4200.0, + "confidence": 0.78 + }, + { + "speaker": "HOST", + "start": 4195.0, + "end": 4215.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 4210.0, + "end": 4235.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 4230.0, + "end": 4240.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 4235.0, + "end": 4250.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 4245.0, + "end": 4270.0, + "confidence": 0.92 + }, + { + "speaker": "CALLER", + "start": 4265.0, + "end": 4280.0, + "confidence": 0.76 + }, + { + "speaker": "HOST", + "start": 4275.0, + "end": 4295.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 4290.0, + "end": 4305.0, + "confidence": 0.78 + }, + { + "speaker": "HOST", + "start": 4300.0, + "end": 4360.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 4355.0, + "end": 4370.0, + "confidence": 0.83 
+ }, + { + "speaker": "HOST", + "start": 4365.0, + "end": 4375.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 4370.0, + "end": 4385.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 4380.0, + "end": 4395.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 4390.0, + "end": 4400.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 4395.0, + "end": 4405.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 4400.0, + "end": 4410.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 4405.0, + "end": 4430.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 4425.0, + "end": 4460.0, + "confidence": 0.8 + }, + { + "speaker": "HOST", + "start": 4455.0, + "end": 4500.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 4495.0, + "end": 4510.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 4505.0, + "end": 4520.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 4515.0, + "end": 4535.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 4530.0, + "end": 4540.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 4535.0, + "end": 4550.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 4545.0, + "end": 4570.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 4565.0, + "end": 4575.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 4570.0, + "end": 4595.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 4590.0, + "end": 4605.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 4600.0, + "end": 4610.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 4605.0, + "end": 4640.0, + "confidence": 0.64 + }, + { + "speaker": "HOST", + "start": 4635.0, + "end": 5045.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 5040.0, + "end": 5050.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 5045.0, + "end": 5365.0, 
+ "confidence": 0.86 + } + ] +} \ No newline at end of file diff --git a/projects/radio-show/audio-processor/training-data/transcripts/2018-s10e17/diarization.json b/projects/radio-show/audio-processor/training-data/transcripts/2018-s10e17/diarization.json new file mode 100644 index 0000000..d8b6c6b --- /dev/null +++ b/projects/radio-show/audio-processor/training-data/transcripts/2018-s10e17/diarization.json @@ -0,0 +1,303 @@ +{ + "num_speakers": 2, + "speaker_map": { + "CALLER": "CALLER", + "HOST": "HOST" + }, + "turns": [ + { + "speaker": "HOST", + "start": 0.0, + "end": 20.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 15.0, + "end": 25.0, + "confidence": 0.64 + }, + { + "speaker": "HOST", + "start": 20.0, + "end": 140.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 135.0, + "end": 145.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 140.0, + "end": 760.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 755.0, + "end": 795.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 790.0, + "end": 1425.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 1420.0, + "end": 1470.0, + "confidence": 0.71 + }, + { + "speaker": "HOST", + "start": 1465.0, + "end": 2290.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 2285.0, + "end": 2335.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 2330.0, + "end": 2345.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 2340.0, + "end": 2390.0, + "confidence": 0.56 + }, + { + "speaker": "HOST", + "start": 2385.0, + "end": 2405.0, + "confidence": 0.93 + }, + { + "speaker": "CALLER", + "start": 2400.0, + "end": 2420.0, + "confidence": 0.57 + }, + { + "speaker": "HOST", + "start": 2415.0, + "end": 2535.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 2530.0, + "end": 2540.0, + "confidence": 0.49 + }, + { + "speaker": "HOST", + "start": 2535.0, + "end": 2555.0, + "confidence": 0.93 
+ }, + { + "speaker": "CALLER", + "start": 2550.0, + "end": 2565.0, + "confidence": 0.48 + }, + { + "speaker": "HOST", + "start": 2560.0, + "end": 2605.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 2600.0, + "end": 2610.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 2605.0, + "end": 2625.0, + "confidence": 0.92 + }, + { + "speaker": "CALLER", + "start": 2620.0, + "end": 2630.0, + "confidence": 0.76 + }, + { + "speaker": "HOST", + "start": 2625.0, + "end": 2675.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 2670.0, + "end": 2720.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 2715.0, + "end": 2845.0, + "confidence": 0.88 + }, + { + "speaker": "CALLER", + "start": 2840.0, + "end": 2930.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 2925.0, + "end": 2955.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 2950.0, + "end": 3005.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 3000.0, + "end": 3060.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 3055.0, + "end": 3065.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 3060.0, + "end": 3075.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 3070.0, + "end": 3110.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 3105.0, + "end": 3165.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 3160.0, + "end": 3180.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 3175.0, + "end": 3390.0, + "confidence": 0.92 + }, + { + "speaker": "CALLER", + "start": 3385.0, + "end": 3425.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 3420.0, + "end": 3610.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 3605.0, + "end": 3625.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 3620.0, + "end": 3630.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 3625.0, + "end": 
3685.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 3680.0, + "end": 3700.0, + "confidence": 0.89 + }, + { + "speaker": "CALLER", + "start": 3695.0, + "end": 3710.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 3705.0, + "end": 4150.0, + "confidence": 0.92 + }, + { + "speaker": "CALLER", + "start": 4145.0, + "end": 4180.0, + "confidence": 0.69 + }, + { + "speaker": "HOST", + "start": 4175.0, + "end": 4615.0, + "confidence": 0.88 + }, + { + "speaker": "CALLER", + "start": 4610.0, + "end": 4620.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 4615.0, + "end": 4835.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 4830.0, + "end": 4885.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 4880.0, + "end": 5300.0, + "confidence": 0.94 + } + ] +} \ No newline at end of file diff --git a/projects/radio-show/audio-processor/training-data/transcripts/2018-s10e21/diarization.json b/projects/radio-show/audio-processor/training-data/transcripts/2018-s10e21/diarization.json new file mode 100644 index 0000000..aacd699 --- /dev/null +++ b/projects/radio-show/audio-processor/training-data/transcripts/2018-s10e21/diarization.json @@ -0,0 +1,669 @@ +{ + "num_speakers": 2, + "speaker_map": { + "CALLER": "CALLER", + "HOST": "HOST" + }, + "turns": [ + { + "speaker": "CALLER", + "start": 0.0, + "end": 10.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 5.0, + "end": 20.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 15.0, + "end": 25.0, + "confidence": 0.63 + }, + { + "speaker": "HOST", + "start": 20.0, + "end": 165.0, + "confidence": 0.99 + }, + { + "speaker": "CALLER", + "start": 160.0, + "end": 170.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 165.0, + "end": 240.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 235.0, + "end": 250.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 245.0, + "end": 275.0, + "confidence": 
0.92 + }, + { + "speaker": "CALLER", + "start": 270.0, + "end": 290.0, + "confidence": 0.65 + }, + { + "speaker": "HOST", + "start": 285.0, + "end": 295.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 290.0, + "end": 305.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 300.0, + "end": 320.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 315.0, + "end": 345.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 340.0, + "end": 350.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 345.0, + "end": 365.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 360.0, + "end": 545.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 540.0, + "end": 555.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 550.0, + "end": 570.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 565.0, + "end": 585.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 580.0, + "end": 590.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 585.0, + "end": 595.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 590.0, + "end": 625.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 620.0, + "end": 630.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 625.0, + "end": 635.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 630.0, + "end": 650.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 645.0, + "end": 700.0, + "confidence": 0.94 + }, + { + "speaker": "CALLER", + "start": 695.0, + "end": 745.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 740.0, + "end": 780.0, + "confidence": 0.89 + }, + { + "speaker": "CALLER", + "start": 775.0, + "end": 785.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 780.0, + "end": 790.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 785.0, + "end": 825.0, + "confidence": 0.83 + }, + { + 
"speaker": "HOST", + "start": 820.0, + "end": 830.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 825.0, + "end": 860.0, + "confidence": 0.79 + }, + { + "speaker": "HOST", + "start": 855.0, + "end": 870.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 865.0, + "end": 895.0, + "confidence": 0.82 + }, + { + "speaker": "HOST", + "start": 890.0, + "end": 1060.0, + "confidence": 0.91 + }, + { + "speaker": "CALLER", + "start": 1055.0, + "end": 1065.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 1060.0, + "end": 1115.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 1110.0, + "end": 1120.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 1115.0, + "end": 1240.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 1235.0, + "end": 1250.0, + "confidence": 0.79 + }, + { + "speaker": "HOST", + "start": 1245.0, + "end": 1255.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 1250.0, + "end": 1260.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 1255.0, + "end": 1270.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 1265.0, + "end": 1275.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 1270.0, + "end": 1340.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 1335.0, + "end": 1345.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 1340.0, + "end": 1350.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 1345.0, + "end": 1355.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 1350.0, + "end": 1435.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 1430.0, + "end": 1450.0, + "confidence": 0.61 + }, + { + "speaker": "HOST", + "start": 1445.0, + "end": 1595.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 1590.0, + "end": 1605.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 1600.0, + "end": 1610.0, + "confidence": 
0.85 + }, + { + "speaker": "CALLER", + "start": 1605.0, + "end": 1620.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 1615.0, + "end": 1635.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 1630.0, + "end": 1640.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 1635.0, + "end": 1650.0, + "confidence": 0.86 + }, + { + "speaker": "CALLER", + "start": 1645.0, + "end": 1675.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 1670.0, + "end": 1875.0, + "confidence": 0.9 + }, + { + "speaker": "CALLER", + "start": 1870.0, + "end": 1890.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 1885.0, + "end": 2020.0, + "confidence": 0.93 + }, + { + "speaker": "CALLER", + "start": 2015.0, + "end": 2030.0, + "confidence": 0.8 + }, + { + "speaker": "HOST", + "start": 2025.0, + "end": 2035.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 2030.0, + "end": 2040.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 2035.0, + "end": 2110.0, + "confidence": 0.88 + }, + { + "speaker": "CALLER", + "start": 2105.0, + "end": 2115.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 2110.0, + "end": 2135.0, + "confidence": 0.89 + }, + { + "speaker": "CALLER", + "start": 2130.0, + "end": 2195.0, + "confidence": 0.71 + }, + { + "speaker": "HOST", + "start": 2190.0, + "end": 2375.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 2370.0, + "end": 2385.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 2380.0, + "end": 2670.0, + "confidence": 0.91 + }, + { + "speaker": "CALLER", + "start": 2665.0, + "end": 2675.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 2670.0, + "end": 2690.0, + "confidence": 0.88 + }, + { + "speaker": "CALLER", + "start": 2685.0, + "end": 2695.0, + "confidence": 0.7 + }, + { + "speaker": "HOST", + "start": 2690.0, + "end": 2855.0, + "confidence": 0.98 + }, + { + "speaker": "CALLER", + "start": 2850.0, + "end": 
2965.0, + "confidence": 0.75 + }, + { + "speaker": "HOST", + "start": 2960.0, + "end": 3025.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 3020.0, + "end": 3040.0, + "confidence": 0.77 + }, + { + "speaker": "HOST", + "start": 3035.0, + "end": 3045.0, + "confidence": 0.85 + }, + { + "speaker": "CALLER", + "start": 3040.0, + "end": 3050.0, + "confidence": 0.77 + }, + { + "speaker": "HOST", + "start": 3045.0, + "end": 3060.0, + "confidence": 0.91 + }, + { + "speaker": "CALLER", + "start": 3055.0, + "end": 3100.0, + "confidence": 0.78 + }, + { + "speaker": "HOST", + "start": 3095.0, + "end": 3180.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 3175.0, + "end": 3195.0, + "confidence": 0.77 + }, + { + "speaker": "HOST", + "start": 3190.0, + "end": 3215.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 3210.0, + "end": 3230.0, + "confidence": 0.8 + }, + { + "speaker": "HOST", + "start": 3225.0, + "end": 3240.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 3235.0, + "end": 3250.0, + "confidence": 0.8 + }, + { + "speaker": "HOST", + "start": 3245.0, + "end": 3270.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 3265.0, + "end": 3305.0, + "confidence": 0.78 + }, + { + "speaker": "HOST", + "start": 3300.0, + "end": 3675.0, + "confidence": 0.87 + }, + { + "speaker": "CALLER", + "start": 3670.0, + "end": 3680.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 3675.0, + "end": 3875.0, + "confidence": 0.88 + }, + { + "speaker": "CALLER", + "start": 3870.0, + "end": 3905.0, + "confidence": 0.85 + }, + { + "speaker": "HOST", + "start": 3900.0, + "end": 3990.0, + "confidence": 0.95 + }, + { + "speaker": "CALLER", + "start": 3985.0, + "end": 4000.0, + "confidence": 0.73 + }, + { + "speaker": "HOST", + "start": 3995.0, + "end": 4145.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 4140.0, + "end": 4165.0, + "confidence": 0.7 + }, + { + "speaker": "HOST", + 
"start": 4160.0, + "end": 4180.0, + "confidence": 0.96 + }, + { + "speaker": "CALLER", + "start": 4175.0, + "end": 4205.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 4200.0, + "end": 4500.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 4495.0, + "end": 4505.0, + "confidence": 0.83 + }, + { + "speaker": "HOST", + "start": 4500.0, + "end": 4600.0, + "confidence": 0.88 + }, + { + "speaker": "CALLER", + "start": 4595.0, + "end": 4605.0, + "confidence": 0.84 + }, + { + "speaker": "HOST", + "start": 4600.0, + "end": 5040.0, + "confidence": 0.97 + }, + { + "speaker": "CALLER", + "start": 5035.0, + "end": 5045.0, + "confidence": 0.81 + }, + { + "speaker": "HOST", + "start": 5040.0, + "end": 5055.0, + "confidence": 0.93 + }, + { + "speaker": "CALLER", + "start": 5050.0, + "end": 5105.0, + "confidence": 0.7 + }, + { + "speaker": "HOST", + "start": 5100.0, + "end": 5300.0, + "confidence": 0.98 + } + ] +} \ No newline at end of file