From c760e430c08e477c5dbffcb28fbc93d58549566b Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Mon, 27 Apr 2026 16:17:50 -0700 Subject: [PATCH] radio: bumper detection in diarizer + full archive download script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a transcript-driven bumper filter to the diarization pipeline. When a transcript segment matches qa_extractor's promo/bumper signatures, the overlapping audio windows are labeled BUMPER and the WavLM cosine match is skipped. Prevents music/promo from being matched against speaker profiles (the failure mode Mike caught in 2018-s10e18 @ 09:20-10:05). Code changes: - src/voice_profiler.py: identify_speakers() takes optional skip_ranges parameter; windows whose midpoint falls in a skip range get labeled "[bumper]" and skip cosine match - src/diarizer.py: diarize() takes optional transcript_path; pre-computes bumper time ranges via qa_extractor._is_promo_or_bumper, passes to identify_speakers; adds BUMPER speaker label - benchmark.py: passes transcript_path to diarize() Aggregate impact across 9-episode test set: Tara attribution: 4880s -> 3680s (-1200s / -25%) Q&A pairs: 17 -> 19 (+2) (bumper-flagged segments had been disrupting conversation detection in 2017-s9e30 and 2018-s10e18) CALLER total: 1320s -> 1190s (bumpers previously labeled CALLER moved) Per-episode bumpers caught: 1-8, total ~165 bumper segments across set Remaining Tara false positives are real callers acoustically similar to Tara (Christopher in 2018, Kay in 2012, William and Charles in 2015) and guest Clay in 2015-s7e19 — those need profile rebuild + Clay profile, not bumper filtering. Adds download_full_archive.py — resumable mirror-style downloader that walks IX server's /home/gurushow/public_html/archive/{year}/ and copies all MP3s to archive-data/episodes/. Run is in progress (~589 files, ~10-15GB). 
Used to source clean profile windows for the remaining co-hosts (Tara rebuild, Clay, Tony, Rob, Randall, producers). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../memory/radio_show_no_cohost_named_tom.md | 34 +++++- .../radio-show/audio-processor/benchmark.py | 3 +- .../audio-processor/download_full_archive.py | 103 ++++++++++++++++++ .../audio-processor/src/diarizer.py | 27 ++++- .../audio-processor/src/voice_profiler.py | 21 +++- .../2011-03-12-hr1/diarization.json | 21 ++-- .../2012-03-10-hr1/diarization.json | 29 ++--- .../2012-06-09-hr1/diarization.json | 35 +++--- .../transcripts/2014-s6e19/diarization.json | 49 ++++++--- .../transcripts/2016-s8e43/diarization.json | 81 +++++++++----- .../transcripts/2017-s9e30/diarization.json | 97 ++++++++++------- 11 files changed, 361 insertions(+), 139 deletions(-) create mode 100644 projects/radio-show/audio-processor/download_full_archive.py diff --git a/.claude/memory/radio_show_no_cohost_named_tom.md b/.claude/memory/radio_show_no_cohost_named_tom.md index b71217b..0e2b2d1 100644 --- a/.claude/memory/radio_show_no_cohost_named_tom.md +++ b/.claude/memory/radio_show_no_cohost_named_tom.md @@ -15,7 +15,38 @@ The show has had multiple **co-hosts** rotating through, plus **producers / boar |---|---|---|---| | **Randall** | early years | not yet | no | | **Rob** | early years + appearances in 2018/2019 (Mike unsure of exact dates) | not yet | no | -| **Tara** | confirmed 2014-s6e19, 2016-s8e43; diarizer also found her in 2017-s9e30 (610s/11.4%) — pending Mike spot-check | yes | yes — `voice-profiles/tara/` (44 embeddings) | +| **Tony** | 2012-era co-host (Mike unsure whether on-air in 2012-06-09-hr1) | not yet | no | +| **Tara** | confirmed 2014-s6e19, 2016-s8e43, **2018-s10e18 @ 50:50** (verified by Mike 2026-04-27 listen). Plausible in 2015 and 2017 (pending verify). 
| yes | yes — `voice-profiles/tara/` (44 embeddings, **possibly contaminated**, see below) | + +### Tara profile contamination flag + +Mike spot-checked CO-HOST-flagged windows on 2026-04-27 and found the diarizer matching: + +In **2018-s10e18**: +- **A bumper** (09:20-10:05, music/promo — not a voice) +- **Tara** (50:50 — true positive) +- **A caller, "Christopher"** (~82:10 — false positive, real caller misattributed as Tara) + +In **2012-06-09-hr1**: +- **A caller, "Kay"** (22:10-26:00 — real caller misattributed as Tara). Spans the 22:25-24:30 (125s) and 25:15-25:55 (40s) CO-HOST turns. Mike unsure whether co-host Tony was on-air this episode. + +In **2015-s7e19** (Jan 2015 New Year episode): +- **A caller, "William"** (~35:30 — confirmed in transcript: "let's talk to William. Hello, William. How are you?", asks about Excel→Word mail merge) +- **A caller, "Charles"** (~16:30 — Mike-identified, transcript not yet verified) +- **A recurring special guest, "Clay" from "Nerd Junkies"** — appears multiple times: transcript at 33:13 "More Clay from the Nerd Junkies", at 37:33 "I'm just curious, Clay, do you have any feedback". Clay is a recurring guest, not a co-host. The 4:40 of "Tara"-attributed audio in this episode is likely **all** Clay + callers, with no actual Tara presence. + +### Recurring guests / fill-ins +| Person | Affiliation | Confirmed in audio | Profile built | +|---|---|---|---| +| **Clay** | "Nerd Junkies" — fills in for Tara when she's out (Mike: rarely appears in other episodes) | 2015-s7e19 (throughout — Tara was out, Clay covered) | pending | + +Tara's role is explicit per transcript at 2015-s7e19 @ 00:51: "in Tara's place, we have Clay. Clay from the Nerd Junkies." — Tara is the regular co-host for that era; Clay is a fill-in. + +Root cause is likely contamination in `build_cohost_profile.py`: the TARA_WINDOWS were sourced from "first 60 min CALLER turns" under the assumption "real callers don't call in during the first hour of a 2-hour show." 
That assumption appears to leak — at least one real caller ended up in Tara's training data, and the resulting profile now matches a too-broad acoustic space. + +Two distinct fixes needed: +1. **Bumper handling in diarizer** — the qa_extractor has bumper signature detection but, before this change, the diarizer didn't filter music/promo segments before speaker matching, so bumpers with vocal content could trigger speaker matches. Addressed in this commit: `diarize()` now accepts `transcript_path` and passes the bumper time ranges to `identify_speakers(skip_ranges=...)`. +2. **Tara profile rebuild from vetted windows** — Mike-confirmed windows only, not the heuristic-selected first-60-min approach. The 2026-04-27 listen confirmed 50:50 in 2018-s10e18 as a clean Tara window; more would be needed. ### Producers / board ops (sometimes on-air) | Person | Profile built | @@ -23,6 +54,7 @@ The show has had multiple **co-hosts** rotating through, plus **producers / boar | **Andrew** | no | | **Shannon** | no | | **Ken** | no | +| **Unknown board op (2015-s7e19 opening)** | no — Mike heard him at the very start of 2015-s7e19, name forgotten | | (Mike: "a couple more" he doesn't recall off-hand) | no | Mike: "The 'producer' (board op) would also be on-air sometimes." Anywhere a producer's voice appears, they're currently being labeled CALLER, which inflates Q&A false positives. Same problem as unprofiled co-hosts. 
diff --git a/projects/radio-show/audio-processor/benchmark.py b/projects/radio-show/audio-processor/benchmark.py index e44bf94..4ad30c3 100644 --- a/projects/radio-show/audio-processor/benchmark.py +++ b/projects/radio-show/audio-processor/benchmark.py @@ -108,7 +108,8 @@ for ep, transcript_path, audio_dur, _ in trans_results: audio_dur = json.load(f).get("duration", 0) t0 = time.monotonic() - result = diarize(ep, voice_profiles=voice_profiles, host_match_threshold=0.85) + result = diarize(ep, voice_profiles=voice_profiles, host_match_threshold=0.85, + transcript_path=transcript_path) wall = time.monotonic() - t0 rtf = audio_dur / wall if wall > 0 else 0 diff --git a/projects/radio-show/audio-processor/download_full_archive.py b/projects/radio-show/audio-processor/download_full_archive.py new file mode 100644 index 0000000..42fa53e --- /dev/null +++ b/projects/radio-show/audio-processor/download_full_archive.py @@ -0,0 +1,103 @@ +""" +Download the full Computer Guru Show archive from IX server (172.16.3.10). + +Mirrors the year-based directory structure as-is to archive-data/episodes/. +Resumable: skips files already present with matching size. +Requires Tailscale. 
+""" +import os +import sys +import time +import paramiko +from pathlib import Path + +password = os.environ.get("IX_PASSWORD") +if not password: + print("IX_PASSWORD env var not set", file=sys.stderr) + sys.exit(1) + +LOCAL_ROOT = Path(__file__).parent / "archive-data" / "episodes" +LOCAL_ROOT.mkdir(parents=True, exist_ok=True) + +REMOTE_ROOT = "/home/gurushow/public_html/archive" +YEARS = ["2010", "2011", "2012", "2014", "2015", "2016", "2017", "2018"] + +print(f"Connecting to 172.16.3.10...", flush=True) +client = paramiko.SSHClient() +client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) +client.connect("172.16.3.10", username="root", password=password, + look_for_keys=False, allow_agent=False, timeout=30) +sftp = client.open_sftp() +print("Connected.", flush=True) + + +def list_remote_mp3s(year: str) -> list[str]: + cmd = f"find '{REMOTE_ROOT}/{year}' -iname '*.mp3' 2>/dev/null" + stdin, stdout, stderr = client.exec_command(cmd) + return [line.strip() for line in stdout.read().decode().splitlines() if line.strip()] + + +total_files = 0 +total_bytes = 0 +skipped_files = 0 +skipped_bytes = 0 +downloaded_files = 0 +downloaded_bytes = 0 +errors = [] + +t_start = time.monotonic() + +for year in YEARS: + print(f"\n=== {year} ===", flush=True) + remote_paths = list_remote_mp3s(year) + print(f" {len(remote_paths)} MP3 files found on remote", flush=True) + + for remote in remote_paths: + rel = remote[len(REMOTE_ROOT) + 1:] + local = LOCAL_ROOT / rel + local.parent.mkdir(parents=True, exist_ok=True) + + try: + remote_stat = sftp.stat(remote) + remote_size = remote_stat.st_size + except Exception as e: + errors.append(f"stat {remote}: {e}") + continue + + total_files += 1 + total_bytes += remote_size + + if local.exists() and local.stat().st_size == remote_size: + skipped_files += 1 + skipped_bytes += remote_size + continue + + size_mb = remote_size / 1024 / 1024 + print(f" [{downloaded_files + 1:3d}] {rel} ({size_mb:.1f} MB)...", end="", flush=True) + t0 = 
time.monotonic() + try: + sftp.get(remote, str(local)) + elapsed = time.monotonic() - t0 + mbps = size_mb / elapsed if elapsed > 0 else 0 + print(f" done ({elapsed:.1f}s, {mbps:.1f} MB/s)", flush=True) + downloaded_files += 1 + downloaded_bytes += remote_size + except Exception as e: + print(f" FAILED: {e}", flush=True) + errors.append(f"get {remote}: {e}") + +elapsed_total = time.monotonic() - t_start +print(f"\n=== Summary ===", flush=True) +print(f" Total remote files : {total_files}", flush=True) +print(f" Total remote bytes : {total_bytes / 1024 / 1024 / 1024:.2f} GB", flush=True) +print(f" Already present : {skipped_files} files / {skipped_bytes / 1024 / 1024 / 1024:.2f} GB", flush=True) +print(f" Newly downloaded : {downloaded_files} files / {downloaded_bytes / 1024 / 1024 / 1024:.2f} GB", flush=True) +print(f" Errors : {len(errors)}", flush=True) +print(f" Wall time : {elapsed_total:.1f}s", flush=True) +if errors: + print(f"\n=== Errors ===", flush=True) + for e in errors[:20]: + print(f" {e}", flush=True) + +sftp.close() +client.close() diff --git a/projects/radio-show/audio-processor/src/diarizer.py b/projects/radio-show/audio-processor/src/diarizer.py index 340ea4d..f393fd1 100644 --- a/projects/radio-show/audio-processor/src/diarizer.py +++ b/projects/radio-show/audio-processor/src/diarizer.py @@ -158,12 +158,17 @@ def diarize(audio_path: str | Path, voice_profiles: VoiceProfileStore | None = None, min_speakers: int = 1, max_speakers: int = 6, - host_match_threshold: float = 0.85) -> DiarizationResult: + host_match_threshold: float = 0.85, + transcript_path: str | Path | None = None) -> DiarizationResult: """Run speaker diarization using WavLM sliding-window speaker identification. Uses the built-in VoiceProfiler (WavLM x-vectors) — no HuggingFace token or gated model required. Identifies HOST vs non-HOST speakers using the stored voice profile for Mike Swanson. 
+ + If transcript_path is provided, time ranges containing show promo/bumper + text are pre-marked and skipped at speaker-identification time so vocal + music doesn't match cohost profiles. """ import torch from .voice_profiler import VoiceProfiler @@ -190,10 +195,28 @@ def diarize(audio_path: str | Path, speaker_map={"HOST": "HOST"}, ) + # Pre-compute bumper / promo time ranges from transcript if available + bumper_ranges: list[tuple[float, float]] = [] + if transcript_path is not None: + transcript_path = Path(transcript_path) + if transcript_path.exists(): + from .qa_extractor import _is_promo_or_bumper + with open(transcript_path) as f: + tdata = json.load(f) + for seg in tdata.get("segments", []): + if _is_promo_or_bumper(seg.get("text", "")): + bumper_ranges.append((seg["start"], seg["end"])) + if bumper_ranges: + console.print( + f"[dim]Bumper filter: {len(bumper_ranges)} promo/bumper " + f"transcript segments will be skipped during speaker match[/dim]" + ) + # Sliding-window identification: 10s windows, 5s hop voice_segs = profiler.identify_speakers( audio_path, window_s=10.0, hop_s=5.0, threshold=host_match_threshold, + skip_ranges=bumper_ranges, ) # Convert VoiceSegment labels to HOST / CALLER @@ -204,6 +227,8 @@ def diarize(audio_path: str | Path, speaker = "HOST" elif label.startswith("Cohost:"): speaker = "CO-HOST" + elif label == "[bumper]": + speaker = "BUMPER" elif label == "[error]": speaker = "UNKNOWN" else: diff --git a/projects/radio-show/audio-processor/src/voice_profiler.py b/projects/radio-show/audio-processor/src/voice_profiler.py index 7fe327a..6cecc4d 100644 --- a/projects/radio-show/audio-processor/src/voice_profiler.py +++ b/projects/radio-show/audio-processor/src/voice_profiler.py @@ -279,12 +279,19 @@ class VoiceProfiler: def identify_speakers(self, audio_path: Path, window_s: float = 10.0, hop_s: float = 5.0, - threshold: float = 0.70) -> list[VoiceSegment]: + threshold: float = 0.70, + skip_ranges: list[tuple[float, float]] | None = 
None + ) -> list[VoiceSegment]: """Identify speakers throughout an audio file using sliding window. Loads the full audio once then slices in memory — avoids spawning hundreds of ffmpeg subprocesses. Returns timestamped segments with speaker labels and embeddings. + + skip_ranges: list of (start, end) seconds. Windows whose midpoint + falls inside any of these ranges are labeled "[bumper]" and the + speaker cosine match is skipped — used to suppress music/promo + from being matched against speaker profiles. """ console.print(f"[bold]Identifying speakers:[/bold] {audio_path.name}") @@ -293,6 +300,8 @@ class VoiceProfiler: audio = self._load_full_audio(audio_path) # float32 mono array self._get_model() # ensure model is warm before the loop + skip_ranges = skip_ranges or [] + segments = [] window_samples = int(window_s * SAMPLE_RATE) hop_samples = int(hop_s * SAMPLE_RATE) @@ -306,6 +315,16 @@ class VoiceProfiler: s = int(start * SAMPLE_RATE) e = min(s + window_samples, total_samples) + mid = (start + end) / 2 + in_bumper = any(rs <= mid <= re for rs, re in skip_ranges) + + if in_bumper: + segments.append(VoiceSegment( + start=start, end=end, + speaker_label="[bumper] (1.00)", + )) + continue + try: emb = self._embed_audio_np(audio[s:e]) diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2011-03-12-hr1/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2011-03-12-hr1/diarization.json index 63cb04b..7d90060 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2011-03-12-hr1/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2011-03-12-hr1/diarization.json @@ -1,26 +1,21 @@ { - "num_speakers": 3, + "num_speakers": 4, "speaker_map": { - "CALLER": "CALLER", "HOST": "HOST", - "CO-HOST": "CO-HOST" + "CO-HOST": "CO-HOST", + "BUMPER": "BUMPER", + "CALLER": "CALLER" }, "turns": [ { - "speaker": "HOST", + "speaker": "BUMPER", "start": 0.0, - "end": 20.0, - "confidence": 0.89 - }, - { - 
"speaker": "CO-HOST", - "start": 15.0, - "end": 25.0, - "confidence": 0.87 + "end": 35.0, + "confidence": 1.0 }, { "speaker": "HOST", - "start": 20.0, + "start": 30.0, "end": 40.0, "confidence": 0.88 }, diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2012-03-10-hr1/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2012-03-10-hr1/diarization.json index b69e922..8e28edb 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2012-03-10-hr1/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2012-03-10-hr1/diarization.json @@ -1,34 +1,23 @@ { - "num_speakers": 3, + "num_speakers": 4, "speaker_map": { - "CALLER": "CALLER", "HOST": "HOST", - "CO-HOST": "CO-HOST" + "CO-HOST": "CO-HOST", + "BUMPER": "BUMPER", + "CALLER": "CALLER" }, "turns": [ { - "speaker": "HOST", + "speaker": "BUMPER", "start": 0.0, - "end": 20.0, - "confidence": 0.88 - }, - { - "speaker": "CO-HOST", - "start": 15.0, - "end": 25.0, - "confidence": 0.87 - }, - { - "speaker": "CALLER", - "start": 20.0, - "end": 30.0, - "confidence": 0.84 + "end": 35.0, + "confidence": 1.0 }, { "speaker": "HOST", - "start": 25.0, + "start": 30.0, "end": 430.0, - "confidence": 0.86 + "confidence": 0.96 }, { "speaker": "CALLER", diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2012-06-09-hr1/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2012-06-09-hr1/diarization.json index f5ea9df..88b2fcd 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2012-06-09-hr1/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2012-06-09-hr1/diarization.json @@ -1,28 +1,23 @@ { - "num_speakers": 3, + "num_speakers": 4, "speaker_map": { - "CALLER": "CALLER", "HOST": "HOST", - "CO-HOST": "CO-HOST" + "CO-HOST": "CO-HOST", + "BUMPER": "BUMPER", + "CALLER": "CALLER" }, "turns": [ { - "speaker": "HOST", + "speaker": "BUMPER", "start": 0.0, - "end": 20.0, - 
"confidence": 0.9 - }, - { - "speaker": "CO-HOST", - "start": 15.0, - "end": 25.0, - "confidence": 0.87 + "end": 35.0, + "confidence": 1.0 }, { "speaker": "HOST", - "start": 20.0, + "start": 30.0, "end": 690.0, - "confidence": 0.86 + "confidence": 0.97 }, { "speaker": "CALLER", @@ -33,14 +28,20 @@ { "speaker": "HOST", "start": 690.0, - "end": 1350.0, + "end": 1330.0, "confidence": 0.92 }, + { + "speaker": "BUMPER", + "start": 1325.0, + "end": 1355.0, + "confidence": 1.0 + }, { "speaker": "CO-HOST", - "start": 1345.0, + "start": 1350.0, "end": 1470.0, - "confidence": 0.92 + "confidence": 0.93 }, { "speaker": "HOST", diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2014-s6e19/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2014-s6e19/diarization.json index 023b81b..4458c75 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2014-s6e19/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2014-s6e19/diarization.json @@ -1,16 +1,23 @@ { - "num_speakers": 3, + "num_speakers": 4, "speaker_map": { - "CALLER": "CALLER", "HOST": "HOST", + "CALLER": "CALLER", + "BUMPER": "BUMPER", "CO-HOST": "CO-HOST" }, "turns": [ { - "speaker": "CO-HOST", + "speaker": "BUMPER", "start": 0.0, + "end": 35.0, + "confidence": 1.0 + }, + { + "speaker": "CO-HOST", + "start": 30.0, "end": 40.0, - "confidence": 0.96 + "confidence": 0.93 }, { "speaker": "HOST", @@ -61,22 +68,28 @@ "confidence": 0.96 }, { - "speaker": "HOST", + "speaker": "BUMPER", "start": 660.0, - "end": 680.0, - "confidence": 0.98 + "end": 695.0, + "confidence": 1.0 }, { "speaker": "CO-HOST", - "start": 675.0, - "end": 710.0, - "confidence": 0.94 + "start": 690.0, + "end": 700.0, + "confidence": 0.95 + }, + { + "speaker": "BUMPER", + "start": 695.0, + "end": 740.0, + "confidence": 1.0 }, { "speaker": "HOST", - "start": 705.0, + "start": 735.0, "end": 985.0, - "confidence": 0.9 + "confidence": 0.87 }, { "speaker": "CO-HOST", @@ -159,18 
+172,18 @@ { "speaker": "HOST", "start": 2055.0, - "end": 2155.0, + "end": 2120.0, "confidence": 0.94 }, { - "speaker": "CALLER", - "start": 2150.0, - "end": 2160.0, - "confidence": 0.83 + "speaker": "BUMPER", + "start": 2115.0, + "end": 2165.0, + "confidence": 1.0 }, { "speaker": "CO-HOST", - "start": 2155.0, + "start": 2160.0, "end": 2170.0, "confidence": 0.97 }, diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2016-s8e43/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2016-s8e43/diarization.json index 4fdcac2..d1d2bc4 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2016-s8e43/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2016-s8e43/diarization.json @@ -1,14 +1,21 @@ { - "num_speakers": 3, + "num_speakers": 4, "speaker_map": { - "CALLER": "CALLER", "HOST": "HOST", + "CALLER": "CALLER", + "BUMPER": "BUMPER", "CO-HOST": "CO-HOST" }, "turns": [ { - "speaker": "CO-HOST", + "speaker": "BUMPER", "start": 0.0, + "end": 35.0, + "confidence": 1.0 + }, + { + "speaker": "CO-HOST", + "start": 30.0, "end": 40.0, "confidence": 0.96 }, @@ -123,14 +130,20 @@ { "speaker": "HOST", "start": 550.0, - "end": 580.0, + "end": 565.0, "confidence": 0.98 }, + { + "speaker": "BUMPER", + "start": 560.0, + "end": 595.0, + "confidence": 1.0 + }, { "speaker": "CO-HOST", - "start": 575.0, + "start": 590.0, "end": 600.0, - "confidence": 0.96 + "confidence": 0.93 }, { "speaker": "CALLER", @@ -153,12 +166,18 @@ { "speaker": "HOST", "start": 1055.0, - "end": 1190.0, + "end": 1160.0, "confidence": 0.99 }, + { + "speaker": "BUMPER", + "start": 1155.0, + "end": 1205.0, + "confidence": 1.0 + }, { "speaker": "CO-HOST", - "start": 1185.0, + "start": 1200.0, "end": 1215.0, "confidence": 0.98 }, @@ -255,12 +274,18 @@ { "speaker": "CO-HOST", "start": 2020.0, - "end": 2055.0, + "end": 2030.0, "confidence": 0.92 }, + { + "speaker": "BUMPER", + "start": 2025.0, + "end": 2060.0, + "confidence": 1.0 + }, 
{ "speaker": "HOST", - "start": 2050.0, + "start": 2055.0, "end": 2105.0, "confidence": 0.98 }, @@ -549,14 +574,14 @@ { "speaker": "HOST", "start": 3370.0, - "end": 3395.0, + "end": 3390.0, "confidence": 0.94 }, { - "speaker": "CO-HOST", - "start": 3390.0, + "speaker": "BUMPER", + "start": 3385.0, "end": 3435.0, - "confidence": 0.85 + "confidence": 1.0 }, { "speaker": "HOST", @@ -565,22 +590,16 @@ "confidence": 0.98 }, { - "speaker": "CO-HOST", + "speaker": "BUMPER", "start": 3965.0, - "end": 3980.0, - "confidence": 0.96 - }, - { - "speaker": "HOST", - "start": 3975.0, - "end": 3990.0, - "confidence": 0.97 + "end": 4020.0, + "confidence": 1.0 }, { "speaker": "CO-HOST", - "start": 3985.0, + "start": 4015.0, "end": 4025.0, - "confidence": 0.86 + "confidence": 0.95 }, { "speaker": "CALLER", @@ -723,14 +742,20 @@ { "speaker": "HOST", "start": 4575.0, - "end": 4680.0, + "end": 4655.0, "confidence": 0.97 }, + { + "speaker": "BUMPER", + "start": 4650.0, + "end": 4695.0, + "confidence": 1.0 + }, { "speaker": "CO-HOST", - "start": 4675.0, + "start": 4690.0, "end": 4715.0, - "confidence": 0.92 + "confidence": 0.94 }, { "speaker": "HOST", diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2017-s9e30/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2017-s9e30/diarization.json index b655ed2..d9e0bce 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2017-s9e30/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2017-s9e30/diarization.json @@ -1,22 +1,17 @@ { - "num_speakers": 3, + "num_speakers": 4, "speaker_map": { - "CALLER": "CALLER", "HOST": "HOST", - "CO-HOST": "CO-HOST" + "CO-HOST": "CO-HOST", + "BUMPER": "BUMPER", + "CALLER": "CALLER" }, "turns": [ { - "speaker": "HOST", + "speaker": "BUMPER", "start": 0.0, - "end": 20.0, - "confidence": 0.88 - }, - { - "speaker": "CO-HOST", - "start": 15.0, "end": 25.0, - "confidence": 0.92 + "confidence": 1.0 }, { "speaker": "HOST", @@ 
-69,14 +64,20 @@ { "speaker": "HOST", "start": 615.0, - "end": 730.0, + "end": 710.0, "confidence": 0.89 }, + { + "speaker": "BUMPER", + "start": 705.0, + "end": 750.0, + "confidence": 1.0 + }, { "speaker": "CO-HOST", - "start": 725.0, + "start": 745.0, "end": 770.0, - "confidence": 0.91 + "confidence": 0.96 }, { "speaker": "HOST", @@ -117,9 +118,21 @@ { "speaker": "CO-HOST", "start": 1310.0, - "end": 1355.0, + "end": 1320.0, "confidence": 0.98 }, + { + "speaker": "BUMPER", + "start": 1315.0, + "end": 1350.0, + "confidence": 1.0 + }, + { + "speaker": "CO-HOST", + "start": 1345.0, + "end": 1355.0, + "confidence": 0.97 + }, { "speaker": "HOST", "start": 1350.0, @@ -189,20 +202,20 @@ { "speaker": "HOST", "start": 1460.0, - "end": 2130.0, + "end": 2110.0, "confidence": 0.88 }, { - "speaker": "CALLER", - "start": 2125.0, - "end": 2135.0, - "confidence": 0.78 + "speaker": "BUMPER", + "start": 2105.0, + "end": 2155.0, + "confidence": 1.0 }, { "speaker": "CO-HOST", - "start": 2130.0, + "start": 2150.0, "end": 2175.0, - "confidence": 0.86 + "confidence": 0.89 }, { "speaker": "HOST", @@ -219,20 +232,20 @@ { "speaker": "HOST", "start": 2650.0, - "end": 2725.0, + "end": 2715.0, "confidence": 0.97 }, { - "speaker": "CO-HOST", - "start": 2720.0, - "end": 2730.0, - "confidence": 0.89 + "speaker": "BUMPER", + "start": 2710.0, + "end": 2745.0, + "confidence": 1.0 }, { "speaker": "HOST", - "start": 2725.0, + "start": 2740.0, "end": 2995.0, - "confidence": 0.91 + "confidence": 0.99 }, { "speaker": "CO-HOST", @@ -273,20 +286,20 @@ { "speaker": "CO-HOST", "start": 3375.0, - "end": 3410.0, + "end": 3390.0, "confidence": 0.91 }, { - "speaker": "CALLER", - "start": 3405.0, - "end": 3415.0, - "confidence": 0.84 + "speaker": "BUMPER", + "start": 3385.0, + "end": 3425.0, + "confidence": 1.0 }, { "speaker": "HOST", - "start": 3410.0, + "start": 3420.0, "end": 4185.0, - "confidence": 0.96 + "confidence": 0.98 }, { "speaker": "CALLER", @@ -387,14 +400,20 @@ { "speaker": "CO-HOST", "start": 
4550.0, - "end": 4595.0, + "end": 4565.0, "confidence": 0.89 }, + { + "speaker": "BUMPER", + "start": 4560.0, + "end": 4605.0, + "confidence": 1.0 + }, { "speaker": "HOST", - "start": 4590.0, + "start": 4600.0, "end": 5285.0, - "confidence": 0.95 + "confidence": 0.94 }, { "speaker": "CO-HOST",