""" Re-diarize the two 2018 episodes that had stale diarization, then patch them into the existing archive DB. Also times the run to validate the audio-preload optimization. """ import sys import os import time os.environ["PYTHONIOENCODING"] = "utf-8" if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(encoding="utf-8") if hasattr(sys.stderr, "reconfigure"): sys.stderr.reconfigure(encoding="utf-8") os.environ["TRANSFORMERS_OFFLINE"] = "1" from pathlib import Path from src.gpu import ensure_cuda_libs ensure_cuda_libs() from src.config import load_config from src.diarizer import diarize, VoiceProfileStore from src.indexer import ArchiveIndex from src.qa_extractor import load_diarized_transcript, extract_qa_pairs, tag_qa_pairs_with_ollama from rich.console import Console import re, json console = Console() BASE = Path(__file__).parent EPISODES_DIR = BASE / "training-data" / "episodes" TRANSCRIPTS_DIR = BASE / "training-data" / "transcripts" DB_PATH = BASE / "archive" / "archive.db" config = load_config() voice_profiles = VoiceProfileStore( config.resolve_path(config.diarization.voice_profiles_dir) ) targets = ["2018-s10e17", "2018-s10e21"] episodes = [EPISODES_DIR / f"{stem}.mp3" for stem in targets] console.print("[bold]Re-diarizing 2018 episodes (optimized audio preload)[/bold]\n") total_audio_s = 0 total_wall_s = 0 for ep_path in episodes: if not ep_path.exists(): console.print(f"[red]Missing: {ep_path.name}[/red]") continue stem = ep_path.stem transcript_dir = TRANSCRIPTS_DIR / stem t0 = time.monotonic() result = diarize(ep_path, voice_profiles=voice_profiles, host_match_threshold=0.85) wall = time.monotonic() - t0 result.save(transcript_dir) audio_dur = result.turns[-1].end if result.turns else 0 rtf = audio_dur / wall if wall > 0 else 0 total_audio_s += audio_dur total_wall_s += wall speakers = result.speakers_ranked() console.print( f" {stem}: {len(result.turns)} turns | " + ", ".join(f"{s} ({t:.0f}s)" for s, t in speakers[:3]) + f" [{wall:.1f}s wall / {rtf:.1f}x realtime]" ) if total_wall_s > 0: console.print( f"\n[bold]Speed:[/bold] {total_audio_s:.0f}s audio in {total_wall_s:.1f}s " f"= {total_audio_s/total_wall_s:.1f}x realtime" ) # Patch just these two episodes into the existing DB console.print("\n[bold]Patching DB...[/bold]") def episode_id(stem): return re.sub(r"-hr\d$", "", stem, flags=re.IGNORECASE) with ArchiveIndex(DB_PATH) as idx: for ep_path in episodes: if not ep_path.exists(): continue stem = ep_path.stem transcript_dir = TRANSCRIPTS_DIR / stem transcript_path = transcript_dir / "transcript.json" diarization_path = transcript_dir / "diarization.json" if not transcript_path.exists(): console.print(f"[yellow]No transcript: {stem}[/yellow]") continue ep_id = episode_id(stem) with open(transcript_path) as f: td = json.load(f) duration = td.get("duration") date_m = re.search(r"(\d{4}-\d{2}-\d{2})", stem) date = date_m.group(1) if date_m else None segments = load_diarized_transcript(transcript_path, diarization_path) # Remove old rows then re-add. FTS5 content tables are rebuilt at the end. idx._conn.execute("DELETE FROM segments WHERE episode_id = ?", (ep_id,)) idx._conn.execute("DELETE FROM qa_pairs WHERE episode_id = ?", (ep_id,)) idx._conn.commit() idx.add_episode(ep_id, ep_path, date=date, duration=duration) # Bypass add_segments guard (it skips if rows already exist) idx._conn.executemany( "INSERT INTO segments (episode_id, seg_index, start, end, speaker, text) " "VALUES (?, ?, ?, ?, ?, ?)", [ (ep_id, i, s["start"], s["end"], s.get("speaker", "UNKNOWN"), s["text"]) for i, s in enumerate(segments) ] ) idx._conn.commit() host_segs = sum(1 for s in segments if s["speaker"] == "HOST") other_segs = len(segments) - host_segs console.print(f" {ep_id}: {len(segments)} segs (HOST={host_segs}, other={other_segs})") pairs = extract_qa_pairs(segments) console.print(f" {len(pairs)} Q&A pairs", end="") if pairs: console.print(f" — tagging with Ollama...", end="") pairs = tag_qa_pairs_with_ollama( pairs, ollama_host=config.llm.ollama_host, model=config.llm.model ) for pair in pairs: idx.add_qa_pair( ep_id, pair.question_start, pair.question_end, pair.answer_start, pair.answer_end, pair.question_text, pair.answer_text, topic=pair.topic, tags=pair.topic_tags, ) console.print() # Rebuild FTS indexes — required after manual DELETE/re-INSERT on content tables idx._conn.execute("INSERT INTO segments_fts(segments_fts) VALUES('rebuild')") idx._conn.execute("INSERT INTO qa_fts(qa_fts) VALUES('rebuild')") idx._conn.commit() stats = idx.stats() console.print(f"\n[bold green]Done.[/bold green] DB now: " f"{stats['episodes']} episodes | " f"{stats['segments']} segments | " f"{stats['qa_pairs']} Q&A pairs")