- Fix voice_profiler threshold bug (HOST label overwrote Unknown unconditionally)
- Audio preload optimization: single ffmpeg per episode, 149.5x realtime on 5070 Ti
- WavLM threshold raised to 0.85 (Mike 0.90-0.99, callers 0.46-0.83)
- Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs
- Text-only Q&A fallback for episodes with no CALLER diarization labels
- TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks
- Add diarize_2018.py for targeted re-run + FTS5 rebuild
- Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison
- Commit 9-episode training diarization.json outputs
- Session log: 2026-04-27-diarization-pipeline.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
85 lines
2.8 KiB
Python
85 lines
2.8 KiB
Python
"""
Quick diagnostic: print per-window WavLM similarity scores for one episode.

Run before diarize_training.py to understand score distribution.

The script takes the first ``*.mp3`` (sorted by name) found under
``training-data/episodes`` next to this file, samples one 10-second window
every 30 seconds across the scan range, prints the best-matching voice
profile's similarity for each window, and finishes with summary statistics
and a text histogram of the scores.

Exits with status 1 when no voice profiles are loaded or no episode is found.
"""
import sys
import os

# NOTE: the statement order in this preamble is deliberate. Environment
# variables are set before the heavy imports below so that they are already
# in place when those libraries are first loaded.
# PYTHONIOENCODING is only read at interpreter start-up, so this assignment
# matters for child processes; the current stdout is reconfigured explicitly.
os.environ["PYTHONIOENCODING"] = "utf-8"
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
# Skip HuggingFace freshness checks (models must already be cached locally).
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from pathlib import Path

import numpy as np

from src.gpu import ensure_cuda_libs

# Called before the profiler import on purpose -- presumably it makes the
# CUDA shared libraries loadable first (TODO confirm against src/gpu.py).
ensure_cuda_libs()

from src.voice_profiler import VoiceProfiler
from src.config import load_config
from rich.console import Console

# Similarity at or above this is reported as the host.
HOST_THRESHOLD = 0.85
# Similarity at or above this (but below HOST_THRESHOLD) is reported as a caller.
CALLER_FLOOR = 0.70

console = Console()

BASE = Path(__file__).parent
config = load_config()
profiles_dir = config.resolve_path(config.diarization.voice_profiles_dir)

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
console.print(f"Device: {device}")

profiler = VoiceProfiler(profiles_dir, device=device)

if not profiler.profiles:
    console.print("[red]No voice profiles loaded[/red]")
    sys.exit(1)

# Use the first available episode
episodes = sorted((BASE / "training-data" / "episodes").glob("*.mp3"))
if not episodes:
    console.print("[red]No episodes found[/red]")
    sys.exit(1)

ep = episodes[0]

duration = profiler._get_duration(ep)
# Scan 10-40 minutes — intro monologue usually ends before 10 min, callers appear after
scan_start = min(600.0, duration * 0.15)  # 10 min in, or 15% of the episode if that is earlier
scan_end = min(duration, 2400.0)  # up to 40 min, or the end of the episode

window_s = 10.0
hop_s = 30.0  # coarse pass — one window per 30s for speed

# markup=False on every line that carries literal brackets or external text:
# with Rich markup enabled, "[time]" is parsed as a style tag and silently
# dropped, and bracketed text in a file name or exception message can be
# swallowed or raise a MarkupError.
console.print(
    f"\nAnalyzing {scan_start:.0f}s-{scan_end:.0f}s of: {ep.name}",
    markup=False,
)
console.print("Format: [time] similarity_score label\n", markup=False)

scores = []
for start in np.arange(scan_start, scan_end - window_s, hop_s):
    end = start + window_s

    # Best-effort per window: a failure on one window is reported and the scan
    # moves on, so the try covers only the calls that can actually fail.
    try:
        emb = profiler.extract_embedding(ep, start, end)
        best_score = 0.0
        best_name = ""
        for name, profile in profiler.profiles.items():
            # Coerce to a plain float so formatting and the NumPy statistics
            # below do not depend on the return type of similarity(), which is
            # not visible from this script.
            s = float(profile.similarity(emb))
            if s > best_score:
                best_score = s
                best_name = name
    except Exception as e:
        console.print(f" [{start:6.0f}s] ERROR: {e}", markup=False)
        continue

    if best_score >= HOST_THRESHOLD:
        label = f"HOST ({best_name})"
    elif best_score >= CALLER_FLOOR:
        label = f"CALLER (below {HOST_THRESHOLD:.2f})"
    else:
        label = "UNKNOWN"

    console.print(
        f" [{start:6.0f}s-{end:.0f}s] {best_score:.4f} {label}",
        markup=False,
    )
    scores.append(best_score)

if scores:
    console.print(
        f"\nScore distribution over {len(scores)} windows "
        f"({scan_start:.0f}s-{scan_end:.0f}s):",
        markup=False,
    )
    console.print(
        f" min={min(scores):.4f} max={max(scores):.4f} "
        f"mean={np.mean(scores):.4f} median={np.median(scores):.4f}",
        markup=False,
    )
    # Half-open buckets [lo, hi); the 1.01 upper edge keeps a score of exactly
    # 1.0 inside the last bucket.
    buckets = [0.0, 0.6, 0.7, 0.75, 0.80, 0.85, 0.90, 0.95, 1.01]
    for lo, hi in zip(buckets, buckets[1:]):
        count = sum(1 for s in scores if lo <= s < hi)
        bar = "#" * count
        console.print(f" [{lo:.2f}-{hi:.2f}): {count:3d} {bar}", markup=False)
else:
    # Either the scan range was empty (very short episode) or every window failed.
    console.print("[yellow]No windows were scored[/yellow]")