# claudetools/projects/radio-show/audio-processor/check_scores.py

"""
Quick diagnostic: print per-window WavLM similarity scores for one episode.
Run before diarize_training.py to understand score distribution.
"""
import sys
import os

# Export both settings in one step. They are set before the project imports
# further down so that anything imported later sees them.
# NOTE(review): TRANSFORMERS_OFFLINE is presumably read by the Hugging Face
# stack pulled in by the voice profiler — confirm.
os.environ.update(PYTHONIOENCODING="utf-8", TRANSFORMERS_OFFLINE="1")

# Switch the current stdout to UTF-8 when the stream object supports it.
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if _reconfigure_stdout is not None:
    _reconfigure_stdout(encoding="utf-8")

from pathlib import Path
import numpy as np
from src.gpu import ensure_cuda_libs
# Invoked immediately after its import, ahead of the remaining project imports
# and the later `import torch`.
# NOTE(review): presumably this must run before those imports so the CUDA
# libraries can be found — confirm against src.gpu before reordering.
ensure_cuda_libs()

from src.voice_profiler import VoiceProfiler
from src.config import load_config
from rich.console import Console

# Single Rich console shared by every print in this script.
console = Console()

# Directory holding this script; the episode folder is resolved relative to it.
BASE = Path(__file__).parent
config = load_config()
profiles_dir = config.resolve_path(config.diarization.voice_profiles_dir)

import torch
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
console.print(f"Device: {device}")

profiler = VoiceProfiler(profiles_dir, device=device)

# Without at least one loaded profile there is nothing to compare against.
if not profiler.profiles:
    console.print("[red]No voice profiles loaded[/red]")
    raise SystemExit(1)

# Collect the *.mp3 files under training-data/episodes next to this script.
# Sorting makes the choice of the "first" episode deterministic.
episodes_dir = BASE / "training-data" / "episodes"
episodes = sorted(episodes_dir.glob("*.mp3"))
if not episodes:
    console.print("[red]No episodes found[/red]")
    raise SystemExit(1)

# Only the first episode (in sorted order) is analysed.
ep = episodes[0]

# Similarity cut-offs used solely to label each printed window.
HOST_THRESHOLD = 0.85
CALLER_THRESHOLD = 0.70

duration = profiler._get_duration(ep)
# Scan 10-40 minutes — intro monologue usually ends before 10 min, callers appear after.
# Short episodes start at 15% of their length instead of the 10-minute mark.
scan_start = min(600.0, duration * 0.15)   # ~10 min in or 15%
scan_end = min(duration, 2400.0)           # up to 40 min

window_s = 10.0
hop_s = 30.0  # coarse pass — one window per 30s for speed

# Report the range that is actually scanned. markup=False keeps Rich from
# treating bracketed text (the literal "[time]", or brackets in a file name)
# as style tags and silently dropping it from the output.
console.print(
    f"\nAnalyzing {scan_start:.0f}s-{scan_end:.0f}s of: {ep.name}",
    markup=False,
)
console.print("Format: [time] similarity_score  label\n", markup=False)

# Window start times; the last window must still fit before scan_end.
window_starts = np.arange(scan_start, scan_end - window_s, hop_s)
if window_starts.size == 0:
    console.print(
        "[yellow]Episode too short: no windows fall inside the scan range[/yellow]"
    )

# Best similarity per successfully processed window, in scan order.
scores = []
for start in window_starts:
    end = start + window_s
    try:
        emb = profiler.extract_embedding(ep, start, end)
        # Keep the highest-scoring profile. The running best starts at 0.0, so a
        # window where no profile scores above zero is reported as 0.0 with an
        # empty name.
        best_score = 0.0
        best_name = ""
        for name, profile in profiler.profiles.items():
            s = profile.similarity(emb)
            if s > best_score:
                best_score = s
                best_name = name
    except Exception as e:
        # Best-effort diagnostic: report the failing window and keep scanning.
        # markup=False so bracketed text inside the error message is shown verbatim.
        console.print(f"  [{start:6.0f}s]  ERROR: {e}", markup=False)
        continue

    if best_score >= HOST_THRESHOLD:
        label = f"HOST ({best_name})"
    elif best_score >= CALLER_THRESHOLD:
        label = f"CALLER (below {HOST_THRESHOLD:.2f})"
    else:
        label = "UNKNOWN"
    console.print(
        f"  [{start:6.0f}s-{end:.0f}s]  {best_score:.4f}  {label}",
        markup=False,
    )
    scores.append(best_score)

# Summarise the collected per-window best scores: basic statistics followed by
# a text histogram over fixed similarity buckets.
if scores:
    # The header states how many windows were scored rather than a fixed time
    # span, which did not match the range the scan covers.
    console.print(f"\nScore distribution over {len(scores)} scanned windows:")
    console.print(f"  min={min(scores):.4f}  max={max(scores):.4f}  mean={np.mean(scores):.4f}  median={np.median(scores):.4f}")
    # Bucket edges; each row counts scores in the half-open interval [lo, hi).
    # The top edge is 1.01 so that a score of exactly 1.0 lands in the last bucket.
    buckets = [0.0, 0.6, 0.7, 0.75, 0.80, 0.85, 0.90, 0.95, 1.01]
    for lo, hi in zip(buckets, buckets[1:]):
        count = sum(1 for s in scores if lo <= s < hi)
        bar = "#" * count
        console.print(f"  [{lo:.2f}-{hi:.2f}): {count:3d}  {bar}")
else:
    # Make the empty case visible instead of ending without any summary.
    console.print("[yellow]No windows were scored; nothing to summarise[/yellow]")