""" Batch-process the full archive: transcribe, diarize (with bumper filter + current profiles), extract intros and Q&A pairs. Resumable — skips any episode whose outputs already exist. Output layout mirrors archive-data/episodes/ tree: archive-data/episodes//.../.mp3 archive-data/transcripts//...//{transcript,diarization,intros,qa}.json Skips in-progress download files (modified within last 60s). """ import os import sys import time import json from pathlib import Path os.environ["PYTHONIOENCODING"] = "utf-8" os.environ["TRANSFORMERS_OFFLINE"] = "1" if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(encoding="utf-8") from src.gpu import ensure_cuda_libs ensure_cuda_libs() import torch from src.config import load_config from src.diarizer import diarize, VoiceProfileStore from src.qa_extractor import ( load_diarized_transcript, extract_qa_pairs, attach_caller_names, ) from src.speaker_oracle import extract_intros from src.transcriber import transcribe as _transcribe from rich.console import Console console = Console() BASE = Path(__file__).parent EPISODES_ROOT = BASE / "archive-data" / "episodes" TRANSCRIPTS_ROOT = BASE / "archive-data" / "transcripts" INPROGRESS_GRACE_S = 60.0 # skip files modified within this many seconds if not EPISODES_ROOT.exists(): console.print(f"[red]No episodes at {EPISODES_ROOT}[/red]") sys.exit(1) config = load_config() device = "cuda" if torch.cuda.is_available() else "cpu" console.print(f"[bold]Batch process[/bold] device={device} ({torch.cuda.get_device_name(0) if device=='cuda' else 'CPU'})") voice_profiles = VoiceProfileStore( config.resolve_path(config.diarization.voice_profiles_dir) ) def out_dir_for(mp3: Path) -> Path: rel = mp3.relative_to(EPISODES_ROOT) return TRANSCRIPTS_ROOT / rel.parent / rel.stem def is_inprogress(mp3: Path) -> bool: try: mtime = mp3.stat().st_mtime except FileNotFoundError: return True return (time.time() - mtime) < INPROGRESS_GRACE_S # Collect all MP3s all_mp3s = sorted(p for p in EPISODES_ROOT.rglob("*.mp3") if p.is_file()) all_mp3s += sorted(p for p in EPISODES_ROOT.rglob("*.MP3") if p.is_file()) all_mp3s = sorted(set(all_mp3s)) console.print(f"Found {len(all_mp3s)} MP3 files in {EPISODES_ROOT}") t0_total = time.monotonic() processed = 0 skipped_done = 0 skipped_inprogress = 0 errors: list[str] = [] # Aggregate intros across all transcribed episodes all_intros: dict[str, dict] = {} # name -> {count, role_hints, episodes} for idx, mp3 in enumerate(all_mp3s, 1): rel = mp3.relative_to(EPISODES_ROOT) if is_inprogress(mp3): skipped_inprogress += 1 continue out_dir = out_dir_for(mp3) transcript_path = out_dir / "transcript.json" diarization_path = out_dir / "diarization.json" intros_path = out_dir / "intros.json" qa_path = out_dir / "qa.json" needs_transcribe = not transcript_path.exists() needs_diarize = not diarization_path.exists() needs_intros = not intros_path.exists() needs_qa = not qa_path.exists() if not (needs_transcribe or needs_diarize or needs_intros or needs_qa): skipped_done += 1 continue out_dir.mkdir(parents=True, exist_ok=True) console.print(f"\n[{idx}/{len(all_mp3s)}] {rel}") t0_ep = time.monotonic() try: if needs_transcribe: t0 = time.monotonic() console.print(" transcribing...") transcript = _transcribe(mp3, model_size="large-v3", device=device, batch_size=16) transcript.save(out_dir) wall = time.monotonic() - t0 rtf = transcript.duration / wall if wall > 0 else 0 console.print(f" [green]transcribed: {transcript.duration:.0f}s in {wall:.1f}s ({rtf:.1f}x)[/green]") if needs_diarize: t0 = time.monotonic() console.print(" diarizing...") result = diarize(mp3, voice_profiles=voice_profiles, host_match_threshold=0.85, transcript_path=transcript_path) result.save(out_dir) wall = time.monotonic() - t0 audio_dur = result.turns[-1].end if result.turns else 0 rtf = audio_dur / wall if wall > 0 else 0 console.print(f" [green]diarized: {len(result.turns)} turns in {wall:.1f}s ({rtf:.1f}x)[/green]") if needs_intros: with open(transcript_path) as f: tdata = json.load(f) intros = extract_intros(tdata.get("segments", [])) with open(intros_path, "w") as f: json.dump([ { "name": i.name, "role_hint": i.role_hint, "intro_time": i.intro_time, "affiliation": i.affiliation, "fillin_for": i.fillin_for, "source_text": i.source_text[:200], } for i in intros ], f, indent=2) for intro in intros: rec = all_intros.setdefault(intro.name, { "count": 0, "roles": set(), "episodes": set(), }) rec["count"] += 1 rec["roles"].add(intro.role_hint) rec["episodes"].add(str(rel)) console.print(f" [green]intros: {len(intros)} extracted[/green]") if needs_qa: with open(transcript_path) as f: tdata = json.load(f) segments = load_diarized_transcript(transcript_path, diarization_path) pairs = extract_qa_pairs(segments) attach_caller_names(pairs, tdata.get("segments", [])) with open(qa_path, "w") as f: json.dump([p.to_dict() for p in pairs], f, indent=2) named = sum(1 for p in pairs if p.caller_name) console.print(f" [green]Q&A: {len(pairs)} pairs ({named} named)[/green]") processed += 1 except Exception as e: errors.append(f"{rel}: {e}") console.print(f" [red]ERROR: {e}[/red]") continue if idx % 20 == 0: elapsed = time.monotonic() - t0_total console.print(f"\n[bold]progress[/bold] {idx}/{len(all_mp3s)} " f"({processed} processed, {skipped_done} cached, {skipped_inprogress} in-progress, " f"{len(errors)} errors) — {elapsed/60:.1f} min elapsed\n") # Persist aggregated intro roster roster_path = TRANSCRIPTS_ROOT / "intro_roster.json" roster = {} for name, rec in all_intros.items(): roster[name] = { "count": rec["count"], "roles": sorted(rec["roles"]), "episode_count": len(rec["episodes"]), "episodes": sorted(rec["episodes"])[:20], # cap to first 20 for readability } roster = dict(sorted(roster.items(), key=lambda x: -x[1]["count"])) roster_path.parent.mkdir(parents=True, exist_ok=True) with open(roster_path, "w") as f: json.dump(roster, f, indent=2) elapsed = time.monotonic() - t0_total console.print(f"\n[bold green]=== Done ===[/bold green]") console.print(f" processed : {processed}") console.print(f" cached (skipped): {skipped_done}") console.print(f" in-progress : {skipped_inprogress}") console.print(f" errors : {len(errors)}") console.print(f" wall time : {elapsed/60:.1f} min") console.print(f" roster written : {roster_path} ({len(roster)} unique names)") if errors: console.print("\n[yellow]Errors:[/yellow]") for e in errors[:20]: console.print(f" {e}")