radio: utf-8 transcript writes + sqlite archive importer + session log

- src/transcriber.py: open transcript.{json,txt,srt} with encoding="utf-8".
  Windows cp1252 default crashed on Whisper output containing U+2044.
- import_to_sqlite.py: new. Walks archive-data/transcripts, builds
  archive.db (5 tables + 2 FTS5 virtual tables, sha256-keyed idempotency).
  20.5 MB / 208 episodes at smoke-test time, 1.9s rebuild.
- batch_process.py: tracked from prior session — full-archive batch with
  resumable transcribe/diarize/intros/qa pipeline.
- .gitignore: archive-data/ and logs/.

Session log: 2026-04-27-archive-batch-and-sqlite-import.md.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-27 19:38:02 -07:00
parent 488bf5849e
commit 82940d96d7
5 changed files with 835 additions and 3 deletions

View File

@@ -10,6 +10,8 @@ test-data/episodes/
test-data/transcripts/
episodes/
processed/
archive-data/
logs/
# Databases (regenerable)
*.db

View File

@@ -0,0 +1,206 @@
"""
Batch-process the full archive: transcribe, diarize (with bumper filter +
current profiles), extract intros and Q&A pairs. Resumable — skips any
episode whose outputs already exist.
Output layout mirrors archive-data/episodes/ tree:
archive-data/episodes/<year>/.../<stem>.mp3
archive-data/transcripts/<year>/.../<stem>/{transcript,diarization,intros,qa}.json
Skips in-progress download files (modified within last 60s).
"""
import os
import sys
import time
import json
from pathlib import Path
# Set process-wide environment flags before the imports below run.
os.environ.update({
    "PYTHONIOENCODING": "utf-8",
    "TRANSFORMERS_OFFLINE": "1",
})
# Switch stdout to UTF-8 when the stream supports reconfiguration.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# ensure_cuda_libs() is deliberately called before torch is imported
# (presumably so the CUDA libraries are discoverable -- confirm in src/gpu.py).
from src.gpu import ensure_cuda_libs
ensure_cuda_libs()

import torch
from rich.console import Console
from src.config import load_config
from src.diarizer import diarize, VoiceProfileStore
from src.qa_extractor import (
    load_diarized_transcript,
    extract_qa_pairs,
    attach_caller_names,
)
from src.speaker_oracle import extract_intros
from src.transcriber import transcribe as _transcribe

console = Console()

# Input/output trees, both rooted next to this script.
BASE = Path(__file__).parent
_ARCHIVE_DIR = BASE / "archive-data"
EPISODES_ROOT = _ARCHIVE_DIR / "episodes"
TRANSCRIPTS_ROOT = _ARCHIVE_DIR / "transcripts"

# Files modified more recently than this many seconds are skipped.
INPROGRESS_GRACE_S = 60.0

# Nothing to do without an episodes tree: report and exit non-zero.
if not EPISODES_ROOT.exists():
    console.print(f"[red]No episodes at {EPISODES_ROOT}[/red]")
    sys.exit(1)

config = load_config()

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
    _device_label = torch.cuda.get_device_name(0)
else:
    device = "cpu"
    _device_label = "CPU"
console.print(f"[bold]Batch process[/bold] device={device} ({_device_label})")

_profiles_dir = config.resolve_path(config.diarization.voice_profiles_dir)
voice_profiles = VoiceProfileStore(_profiles_dir)
def out_dir_for(mp3: Path) -> Path:
    """Return the output directory that mirrors *mp3* under TRANSCRIPTS_ROOT.

    The MP3's path relative to EPISODES_ROOT is kept, with the file name
    replaced by its stem (the name without the final suffix).
    """
    relative = mp3.relative_to(EPISODES_ROOT)
    return TRANSCRIPTS_ROOT.joinpath(relative.parent, relative.stem)
def is_inprogress(mp3: Path) -> bool:
    """Return True when *mp3* should be skipped as still being written.

    A file counts as in progress when its modification time is less than
    INPROGRESS_GRACE_S seconds old, or when it cannot be found at all.
    """
    try:
        stat_result = mp3.stat()
    except FileNotFoundError:
        # The file disappeared after it was listed; do not try to process it.
        return True
    else:
        age_s = time.time() - stat_result.st_mtime
        return age_s < INPROGRESS_GRACE_S
# Collect every MP3 under the episodes tree. Both suffix spellings are
# globbed; the set removes paths that match both patterns.
_found: set[Path] = set()
for _pattern in ("*.mp3", "*.MP3"):
    _found.update(p for p in EPISODES_ROOT.rglob(_pattern) if p.is_file())
all_mp3s = sorted(_found)
console.print(f"Found {len(all_mp3s)} MP3 files in {EPISODES_ROOT}")

# Run-wide bookkeeping.
t0_total = time.monotonic()
processed = skipped_done = skipped_inprogress = 0
errors: list[str] = []

# Intros extracted during this run: name -> {count, roles, episodes}
all_intros: dict[str, dict] = {}
def _read_json(path: Path):
    """Load a JSON document, decoding as UTF-8 rather than the locale codec."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def _write_json_atomic(path: Path, payload) -> None:
    """Write *payload* as indented JSON so *path* only ever appears complete.

    The resume logic below treats any existing output file as finished. The
    data is therefore written to a sibling ``.tmp`` file and renamed into
    place, so a failure part-way through serialisation cannot leave a
    truncated file that would be skipped on the next run.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2)
    os.replace(tmp_path, path)


# Process each episode, running only the stages whose output file is missing.
for idx, mp3 in enumerate(all_mp3s, 1):
    rel = mp3.relative_to(EPISODES_ROOT)

    if is_inprogress(mp3):
        skipped_inprogress += 1
        continue

    out_dir = out_dir_for(mp3)
    transcript_path = out_dir / "transcript.json"
    diarization_path = out_dir / "diarization.json"
    intros_path = out_dir / "intros.json"
    qa_path = out_dir / "qa.json"

    # A stage is pending exactly when its output file does not exist yet.
    needs_transcribe = not transcript_path.exists()
    needs_diarize = not diarization_path.exists()
    needs_intros = not intros_path.exists()
    needs_qa = not qa_path.exists()

    if not (needs_transcribe or needs_diarize or needs_intros or needs_qa):
        skipped_done += 1
        continue

    out_dir.mkdir(parents=True, exist_ok=True)
    console.print(f"\n[{idx}/{len(all_mp3s)}] {rel}")

    try:
        if needs_transcribe:
            t0 = time.monotonic()
            console.print(" transcribing...")
            transcript = _transcribe(mp3, model_size="large-v3", device=device, batch_size=16)
            transcript.save(out_dir)
            wall = time.monotonic() - t0
            # Real-time factor: seconds of audio handled per wall-clock second.
            rtf = transcript.duration / wall if wall > 0 else 0
            console.print(f" [green]transcribed: {transcript.duration:.0f}s in {wall:.1f}s ({rtf:.1f}x)[/green]")

        if needs_diarize:
            t0 = time.monotonic()
            console.print(" diarizing...")
            result = diarize(mp3, voice_profiles=voice_profiles,
                             host_match_threshold=0.85,
                             transcript_path=transcript_path)
            result.save(out_dir)
            wall = time.monotonic() - t0
            # The end of the last turn stands in for the audio duration.
            audio_dur = result.turns[-1].end if result.turns else 0
            rtf = audio_dur / wall if wall > 0 else 0
            console.print(f" [green]diarized: {len(result.turns)} turns in {wall:.1f}s ({rtf:.1f}x)[/green]")

        if needs_intros:
            tdata = _read_json(transcript_path)
            intros = extract_intros(tdata.get("segments", []))
            _write_json_atomic(intros_path, [
                {
                    "name": i.name,
                    "role_hint": i.role_hint,
                    "intro_time": i.intro_time,
                    "affiliation": i.affiliation,
                    "fillin_for": i.fillin_for,
                    "source_text": i.source_text[:200],
                } for i in intros
            ])
            for intro in intros:
                rec = all_intros.setdefault(intro.name, {
                    "count": 0, "roles": set(), "episodes": set(),
                })
                rec["count"] += 1
                rec["roles"].add(intro.role_hint)
                rec["episodes"].add(str(rel))
            console.print(f" [green]intros: {len(intros)} extracted[/green]")

        if needs_qa:
            tdata = _read_json(transcript_path)
            segments = load_diarized_transcript(transcript_path, diarization_path)
            pairs = extract_qa_pairs(segments)
            attach_caller_names(pairs, tdata.get("segments", []))
            _write_json_atomic(qa_path, [p.to_dict() for p in pairs])
            named = sum(1 for p in pairs if p.caller_name)
            console.print(f" [green]Q&A: {len(pairs)} pairs ({named} named)[/green]")

        processed += 1
    except Exception as e:
        # One bad episode must not stop the batch: record it and move on.
        errors.append(f"{rel}: {e}")
        console.print(f" [red]ERROR: {e}[/red]")
        continue

    if idx % 20 == 0:
        elapsed = time.monotonic() - t0_total
        console.print(f"\n[bold]progress[/bold] {idx}/{len(all_mp3s)} "
                      f"({processed} processed, {skipped_done} cached, {skipped_inprogress} in-progress, "
                      f"{len(errors)} errors) — {elapsed/60:.1f} min elapsed\n")
# Persist aggregated intro roster.
#
# The roster is rebuilt from every intros.json currently on disk rather than
# from the intros extracted during this run. Episodes that were already
# complete are skipped by the processing loop, so an in-memory tally would
# overwrite the roster file with only the episodes handled by this run.
roster_path = TRANSCRIPTS_ROOT / "intro_roster.json"

_roster_acc: dict[str, dict] = {}
for _episode_mp3 in all_mp3s:
    _episode_intros_path = out_dir_for(_episode_mp3) / "intros.json"
    try:
        with open(_episode_intros_path, encoding="utf-8") as f:
            _entries = json.load(f)
    except FileNotFoundError:
        continue  # intros not extracted for this episode yet
    except (OSError, json.JSONDecodeError) as _exc:
        console.print(f"[yellow]roster: skipping unreadable {_episode_intros_path}: {_exc}[/yellow]")
        continue

    # Same episode label the processing loop uses: path relative to EPISODES_ROOT.
    _episode_label = str(_episode_mp3.relative_to(EPISODES_ROOT))
    for _entry in _entries:
        _name = _entry.get("name") if isinstance(_entry, dict) else None
        if not _name:
            continue
        _rec = _roster_acc.setdefault(_name, {
            "count": 0, "roles": set(), "episodes": set(),
        })
        _rec["count"] += 1
        _rec["roles"].add(_entry.get("role_hint"))
        _rec["episodes"].add(_episode_label)

roster = {}
for name, rec in _roster_acc.items():
    roster[name] = {
        "count": rec["count"],
        # A role hint may be null; order those last instead of failing to compare.
        "roles": sorted(rec["roles"], key=lambda r: (r is None, str(r))),
        "episode_count": len(rec["episodes"]),
        "episodes": sorted(rec["episodes"])[:20],  # cap to first 20 for readability
    }
# Most frequently introduced names first.
roster = dict(sorted(roster.items(), key=lambda x: -x[1]["count"]))
roster_path.parent.mkdir(parents=True, exist_ok=True)
with open(roster_path, "w", encoding="utf-8") as f:
    json.dump(roster, f, indent=2)
# Final report: counters for the whole run, then up to 20 recorded errors.
elapsed = time.monotonic() - t0_total
console.print("\n[bold green]=== Done ===[/bold green]")
_summary_lines = (
    f" processed : {processed}",
    f" cached (skipped): {skipped_done}",
    f" in-progress : {skipped_inprogress}",
    f" errors : {len(errors)}",
    f" wall time : {elapsed/60:.1f} min",
    f" roster written : {roster_path} ({len(roster)} unique names)",
)
for _line in _summary_lines:
    console.print(_line)

if errors:
    console.print("\n[yellow]Errors:[/yellow]")
    for _message in errors[:20]:
        console.print(f" {_message}")

View File

@@ -0,0 +1,332 @@
"""
Import per-episode pipeline outputs (transcript / diarization / intros / qa)
into a single SQLite archive.db.
Idempotent: skips episodes whose transcript.json sha256 matches the recorded
hash. Re-run after each batch_process pass to keep the DB current.
Usage:
py import_to_sqlite.py [--db PATH] [--root PATH] [--rebuild] [--vacuum]
"""
import argparse
import hashlib
import json
import re
import sqlite3
import time
from datetime import datetime, timezone
from pathlib import Path
# Default locations, all resolved relative to the directory holding this script.
BASE = Path(__file__).parent
_ARCHIVE_DIR = BASE.joinpath("archive-data")
DEFAULT_ROOT = _ARCHIVE_DIR.joinpath("transcripts")
DEFAULT_DB = _ARCHIVE_DIR.joinpath("archive.db")

# An episode directory is importable only when all of these files exist.
REQUIRED_FILES = (
    "transcript.json",
    "diarization.json",
    "intros.json",
    "qa.json",
)
# DDL for the archive database: five ordinary tables (episodes, segments,
# turns, intros, qa_pairs) with their indexes, plus two FTS5 full-text indexes
# declared as external-content tables over segments and qa_pairs.
# Every statement uses IF NOT EXISTS, so the script can be run against a
# database that already has the schema.
# Child tables reference episodes(id) with ON DELETE CASCADE; qa_pairs.topic_tags
# holds a JSON-encoded list (see import_episode).
SCHEMA = """
CREATE TABLE IF NOT EXISTS episodes (
id INTEGER PRIMARY KEY,
rel_path TEXT NOT NULL UNIQUE,
year INTEGER NOT NULL,
title TEXT,
air_date TEXT,
duration_sec REAL NOT NULL,
language TEXT,
language_probability REAL,
num_speakers INTEGER,
transcript_sha256 TEXT NOT NULL,
processed_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_episodes_year ON episodes(year);
CREATE INDEX IF NOT EXISTS idx_episodes_air_date ON episodes(air_date);
CREATE TABLE IF NOT EXISTS segments (
id INTEGER PRIMARY KEY,
episode_id INTEGER NOT NULL REFERENCES episodes(id) ON DELETE CASCADE,
seg_idx INTEGER NOT NULL,
start_sec REAL,
end_sec REAL,
text TEXT NOT NULL,
UNIQUE(episode_id, seg_idx)
);
CREATE INDEX IF NOT EXISTS idx_segments_episode ON segments(episode_id, start_sec);
CREATE TABLE IF NOT EXISTS turns (
id INTEGER PRIMARY KEY,
episode_id INTEGER NOT NULL REFERENCES episodes(id) ON DELETE CASCADE,
speaker TEXT NOT NULL,
start_sec REAL,
end_sec REAL,
confidence REAL
);
CREATE INDEX IF NOT EXISTS idx_turns_episode ON turns(episode_id, start_sec);
CREATE INDEX IF NOT EXISTS idx_turns_speaker ON turns(episode_id, speaker);
CREATE TABLE IF NOT EXISTS intros (
id INTEGER PRIMARY KEY,
episode_id INTEGER NOT NULL REFERENCES episodes(id) ON DELETE CASCADE,
name TEXT NOT NULL,
role_hint TEXT,
intro_time_sec REAL,
affiliation TEXT,
fillin_for TEXT,
source_text TEXT
);
CREATE INDEX IF NOT EXISTS idx_intros_episode ON intros(episode_id);
CREATE INDEX IF NOT EXISTS idx_intros_name ON intros(name);
CREATE TABLE IF NOT EXISTS qa_pairs (
id INTEGER PRIMARY KEY,
episode_id INTEGER NOT NULL REFERENCES episodes(id) ON DELETE CASCADE,
question_start_sec REAL,
question_end_sec REAL,
answer_start_sec REAL,
answer_end_sec REAL,
question_text TEXT NOT NULL,
answer_text TEXT NOT NULL,
caller_name TEXT,
caller_role TEXT,
topic TEXT,
topic_tags TEXT
);
CREATE INDEX IF NOT EXISTS idx_qa_episode ON qa_pairs(episode_id);
CREATE INDEX IF NOT EXISTS idx_qa_caller ON qa_pairs(caller_name);
CREATE VIRTUAL TABLE IF NOT EXISTS segments_fts USING fts5(
text,
content='segments', content_rowid='id',
tokenize='porter unicode61'
);
CREATE VIRTUAL TABLE IF NOT EXISTS qa_fts USING fts5(
question_text, answer_text,
content='qa_pairs', content_rowid='id',
tokenize='porter unicode61'
);
"""
# Triggers that keep the external-content FTS5 indexes in step with their
# source tables. An external-content index does not follow changes to the
# content table by itself, so every kind of write needs a trigger:
#   *_ai  after INSERT: add the new row to the index
#   *_ad  after DELETE: remove the old row with the special 'delete' command
#   *_au  after UPDATE: remove the old values, then index the new ones
# The 'delete' command must be given the values that were originally indexed,
# which is why the delete/update triggers pass the old.* columns.
# All triggers use IF NOT EXISTS so the script is safe to re-run.
TRIGGERS = """
CREATE TRIGGER IF NOT EXISTS segments_ai AFTER INSERT ON segments BEGIN
    INSERT INTO segments_fts(rowid, text) VALUES (new.id, new.text);
END;
CREATE TRIGGER IF NOT EXISTS segments_ad AFTER DELETE ON segments BEGIN
    INSERT INTO segments_fts(segments_fts, rowid, text) VALUES('delete', old.id, old.text);
END;
CREATE TRIGGER IF NOT EXISTS segments_au AFTER UPDATE ON segments BEGIN
    INSERT INTO segments_fts(segments_fts, rowid, text) VALUES('delete', old.id, old.text);
    INSERT INTO segments_fts(rowid, text) VALUES (new.id, new.text);
END;
CREATE TRIGGER IF NOT EXISTS qa_ai AFTER INSERT ON qa_pairs BEGIN
    INSERT INTO qa_fts(rowid, question_text, answer_text)
    VALUES (new.id, new.question_text, new.answer_text);
END;
CREATE TRIGGER IF NOT EXISTS qa_ad AFTER DELETE ON qa_pairs BEGIN
    INSERT INTO qa_fts(qa_fts, rowid, question_text, answer_text)
    VALUES('delete', old.id, old.question_text, old.answer_text);
END;
CREATE TRIGGER IF NOT EXISTS qa_au AFTER UPDATE ON qa_pairs BEGIN
    INSERT INTO qa_fts(qa_fts, rowid, question_text, answer_text)
    VALUES('delete', old.id, old.question_text, old.answer_text);
    INSERT INTO qa_fts(rowid, question_text, answer_text)
    VALUES (new.id, new.question_text, new.answer_text);
END;
"""
# Date formats recognised in an episode path, tried in order: most → least
# specific. parse_air_date() searches each one against the POSIX-style
# relative path and uses the first match that forms a valid calendar date.
DATE_PATTERNS = [
    # Four-digit year first: 20YY, then month and day, separated by "-" or "_".
    re.compile(r"(?P<y>20\d{2})[-_](?P<m>\d{1,2})[-_](?P<d>\d{1,2})"),
    # Month-day-two-digit-year (M-D-YY), not directly adjacent to other digits.
    re.compile(r"(?:^|[^\d])(?P<m>\d{1,2})-(?P<d>\d{1,2})-(?P<yy>\d{2})(?:[^\d]|$)"),
]
def init_schema(conn: sqlite3.Connection):
    """Create tables, indexes, FTS tables and triggers on *conn* if missing.

    Runs the SCHEMA and TRIGGERS scripts, then switches on foreign-key
    enforcement for this connection and commits.
    """
    for script in (SCHEMA, TRIGGERS):
        conn.executescript(script)
    # The ON DELETE CASCADE clauses in SCHEMA only act when this pragma is on.
    conn.execute("PRAGMA foreign_keys = ON")
    conn.commit()
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 64 KiB chunks, so it is never held in memory whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while block := stream.read(65536):
            digest.update(block)
    return digest.hexdigest()
def parse_air_date(rel_dir: Path) -> str | None:
    """Extract an ISO ``YYYY-MM-DD`` air date from an episode's relative path.

    Each pattern in DATE_PATTERNS is searched against the POSIX form of
    *rel_dir*. The first match that forms a valid calendar date wins; a match
    with impossible values (for example month 13) falls through to the next
    pattern. Two-digit years below 30 are read as 20YY, the rest as 19YY.

    Returns None when no pattern yields a valid date.
    """
    haystack = rel_dir.as_posix()
    for pattern in DATE_PATTERNS:
        match = pattern.search(haystack)
        if match is None:
            continue
        fields = match.groupdict()
        try:
            full_year = fields.get("y")
            if full_year:
                year = int(full_year)
            else:
                short_year = int(fields["yy"])
                century = 2000 if short_year < 30 else 1900
                year = century + short_year
            month = int(fields["m"])
            day = int(fields["d"])
            return datetime(year, month, day).date().isoformat()
        except ValueError:
            # Not a real calendar date; try the next pattern.
            continue
    return None
def import_episode(conn: sqlite3.Connection, ep_dir: Path, root: Path) -> str:
    """Import one episode directory into the database.

    Args:
        conn: Open connection whose schema has been initialised.
        ep_dir: Directory holding transcript.json, diarization.json,
            intros.json and qa.json for one episode.
        root: Transcripts root; *ep_dir* must be located beneath it.

    Returns:
        "skipped" when the first path component under *root* is not a
        four-digit year, or when the stored transcript hash already matches
        the file on disk; "updated" when an existing episode row was replaced;
        "inserted" for a new episode.

    The delete (if any) and all inserts run in a single transaction, so a
    failure leaves the previously imported data untouched.
    """
    rel_dir = ep_dir.relative_to(root)
    top_level = rel_dir.parts[0]
    if len(top_level) != 4 or not top_level.isdigit():
        return "skipped"

    rel_path = f"{rel_dir.as_posix()}.mp3"
    air_date = parse_air_date(rel_dir)

    # Idempotency check: an unchanged transcript.json means nothing to do.
    digest = sha256_file(ep_dir / "transcript.json")
    existing = conn.execute(
        "SELECT id, transcript_sha256 FROM episodes WHERE rel_path = ?",
        (rel_path,),
    ).fetchone()
    if existing is not None and existing[1] == digest:
        return "skipped"

    def load(filename: str):
        with open(ep_dir / filename, encoding="utf-8") as fh:
            return json.load(fh)

    transcript = load("transcript.json")
    diarization = load("diarization.json")
    intros = load("intros.json")
    qa = load("qa.json")

    status = "updated" if existing else "inserted"
    with conn:
        if existing:
            # Child rows go with it via ON DELETE CASCADE.
            conn.execute("DELETE FROM episodes WHERE id = ?", (existing[0],))

        episode_id = conn.execute(
            "INSERT INTO episodes\n"
            "(rel_path, year, title, air_date, duration_sec, language,\n"
            "language_probability, num_speakers, transcript_sha256, processed_at)\n"
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (
                rel_path,
                int(top_level),
                rel_dir.name,
                air_date,
                transcript.get("duration", 0.0),
                transcript.get("language"),
                transcript.get("language_probability"),
                diarization.get("num_speakers"),
                digest,
                datetime.now(timezone.utc).isoformat(timespec="seconds"),
            ),
        ).lastrowid

        segment_rows = [
            (episode_id, seg["id"], seg.get("start"), seg.get("end"), seg["text"])
            for seg in transcript.get("segments", [])
        ]
        conn.executemany(
            "INSERT INTO segments (episode_id, seg_idx, start_sec, end_sec, text) "
            "VALUES (?, ?, ?, ?, ?)",
            segment_rows,
        )

        turn_rows = [
            (episode_id, turn["speaker"], turn.get("start"), turn.get("end"),
             turn.get("confidence"))
            for turn in diarization.get("turns", [])
        ]
        conn.executemany(
            "INSERT INTO turns (episode_id, speaker, start_sec, end_sec, confidence) "
            "VALUES (?, ?, ?, ?, ?)",
            turn_rows,
        )

        intro_rows = [
            (episode_id, intro["name"], intro.get("role_hint"), intro.get("intro_time"),
             intro.get("affiliation"), intro.get("fillin_for"), intro.get("source_text"))
            for intro in intros
        ]
        conn.executemany(
            "INSERT INTO intros "
            "(episode_id, name, role_hint, intro_time_sec, affiliation, fillin_for, source_text) "
            "VALUES (?, ?, ?, ?, ?, ?, ?)",
            intro_rows,
        )

        qa_rows = [
            (
                episode_id,
                pair.get("question_start"), pair.get("question_end"),
                pair.get("answer_start"), pair.get("answer_end"),
                pair.get("question_text"), pair.get("answer_text"),
                pair.get("caller_name"), pair.get("caller_role"),
                pair.get("topic"),
                # Stored as a JSON-encoded list; missing/empty becomes "[]".
                json.dumps(pair.get("topic_tags") or []),
            )
            for pair in qa
        ]
        conn.executemany(
            "INSERT INTO qa_pairs "
            "(episode_id, question_start_sec, question_end_sec, "
            " answer_start_sec, answer_end_sec, "
            " question_text, answer_text, caller_name, caller_role, topic, topic_tags) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            qa_rows,
        )
    return status
def find_episode_dirs(root: Path):
    """Yield each directory below *root* that holds every file in REQUIRED_FILES.

    Directories missing any required file are passed over.
    """
    for candidate in root.rglob("*"):
        if candidate.is_dir() and all(
            (candidate / filename).exists() for filename in REQUIRED_FILES
        ):
            yield candidate
def main():
    """Command-line entry point: import all complete episodes into the DB.

    Parses --root/--db/--rebuild/--vacuum, optionally deletes the existing
    database, initialises the schema, imports every complete episode
    directory (a failure in one episode is printed and counted, not fatal),
    runs ANALYZE (and VACUUM on request), then prints a summary.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default=str(DEFAULT_ROOT))
    parser.add_argument("--db", default=str(DEFAULT_DB))
    parser.add_argument("--rebuild", action="store_true",
                        help="Delete the DB before import")
    parser.add_argument("--vacuum", action="store_true",
                        help="VACUUM after import (slow on large DBs)")
    opts = parser.parse_args()

    root, db = Path(opts.root), Path(opts.db)

    if opts.rebuild and db.exists():
        db.unlink()
        print(f"deleted {db}")
    db.parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(db)
    init_schema(conn)

    # Per-status counters; any other status value is left uncounted.
    tally = dict.fromkeys(("inserted", "updated", "skipped"), 0)
    failures = 0
    started = time.monotonic()

    episode_dirs = sorted(find_episode_dirs(root))
    total = len(episode_dirs)
    print(f"Found {total} complete episode directories under {root}")

    for position, ep_dir in enumerate(episode_dirs, 1):
        try:
            outcome = import_episode(conn, ep_dir, root)
        except Exception as exc:
            failures += 1
            print(f"ERROR {ep_dir.relative_to(root)}: {exc}")
        else:
            if outcome in tally:
                tally[outcome] += 1
        if position % 50 == 0:
            print(f" {position}/{total} ins={tally['inserted']} upd={tally['updated']} "
                  f"skip={tally['skipped']} err={failures}")

    conn.executescript("ANALYZE;")
    if opts.vacuum:
        conn.executescript("VACUUM;")
    conn.close()

    size_mb = db.stat().st_size / 1024 / 1024
    elapsed = time.monotonic() - started
    print(f"\n=== Done in {elapsed:.1f}s ===")
    print(f" inserted : {tally['inserted']}")
    print(f" updated : {tally['updated']}")
    print(f" skipped : {tally['skipped']}")
    print(f" errors : {failures}")
    print(f" db : {db} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,292 @@
# Session Log — 2026-04-27 (continuation #2)
**Project:** The Computer Guru Show — Archive Mining System
**Goal:** Resume archive download + batch transcribe/diarize after machine restart, then design + build the SQLite archive database
**Machine:** GURU-BEAST-ROG (RTX 4090, 24GB)
**User:** Mike Swanson (mike)
Companion to:
- `2026-04-27-diarization-pipeline.md` (DESKTOP-0O8A1RL — diarization fixes)
- `2026-04-27-4090-benchmark-and-test-set.md` (GURU-BEAST-ROG — 4090 perf + per-year test set)
---
## User
- **User:** Mike Swanson (mike)
- **Machine:** GURU-BEAST-ROG
- **Role:** admin
---
## Session Summary
The session focused on resuming interrupted archive processing and initiating the design of a SQLite database for the Computer Guru Radio Show. The machine had restarted during the execution of `download_full_archive.py` and `batch_process.py`, leaving the download partially complete and batch processing halted mid-year. The download had completed through 2015 with partial progress in 2016, and years 2017 and 2018 were entirely missing locally. batch_process had finished 2010 (43/43) and stopped at 21 of 200 episodes in 2011. Connectivity to the IX server was confirmed via Tailscale, the SOPS vault yielded the IX root password, and both jobs were restarted in the background.
The download successfully resumed via size-match skipping, then pulled the remaining 88 files (2.65 GB) covering late-2016 plus all of 2017 and 2018 in 30 minutes wall time, 0 errors. batch_process picked up at episode 65/519 of its file-list snapshot but immediately tripped a `'charmap' codec can't encode character '\u2044'` error: `src/transcriber.py` opened `transcript.txt` and `transcript.srt` with Windows default cp1252 encoding, which cannot represent Whisper's U+2044 fraction-slash output. The fix was a one-argument addition (`encoding="utf-8"`) to each of the three `open()` calls. The partial output dir for episode 65 was deleted to force a clean redo, and batch_process was restarted.
The user then raised an architectural question about where the canonical archive database should live. Discussion converged on SQLite (over MariaDB) because the per-episode JSONs are the source of truth and the `.db` is rebuildable in seconds, and on Jupiter+Docker (over IX cPanel) because the use case is internal-only and Tailscale already provides access; public exposure can be added later via cloudflared. The schema (5 tables + 2 FTS5 virtual tables) was designed and `import_to_sqlite.py` was written and smoke-tested against 208 currently-complete episodes — 1.9 seconds for full rebuild, 20.5 MB DB, FTS queries on "wireless" and "virus" returning correct snippets.
By session end, the download was complete and batch_process was at episode 211/519 (147 episodes transcribed since the encoding-fix restart). One final batch_process re-run is needed after the current 519-snapshot finishes, to pick up the 53 newly-downloaded files that were not in the startup snapshot.
---
## Key Decisions
- **SQLite over MariaDB**: per-episode JSONs are the source of truth, the `.db` is rebuildable in seconds. Can graduate to MariaDB later by re-importing from the same JSONs without losing anything.
- **Jupiter (Unraid Docker) over IX cPanel**: use case is internal-only show-prep search. Tailscale already covers access. IX is the right place for public-facing show-site content but adds shared-hosting friction the v1 doesn't need.
- **FTS5 with `porter unicode61` tokenizer**: porter stemming for English query expansion, unicode61 for case-folding and basic punctuation handling. External-content tables with content_rowid pointing back to `segments` and `qa_pairs` so the FTS index doesn't duplicate the text.
- **Skip speaker-name resolution view in v1**: turns table holds role labels (HOST/CO-HOST/CALLER/BUMPER), intros and qa_pairs hold real names. A SQL view that joins them by time-window is cheap to add later and no data is lost by deferring.
- **Keep BUMPER turns and promo-flagged segments raw**: filter at query time. Excluding them at insert loses signal that may matter for future analysis.
- **sha256 of transcript.json as idempotency key**: importer skips an episode whose recorded hash matches the on-disk file. Re-run the importer after each batch_process pass; it only does work for changed files.
- **Restart batch_process to fix encoding bug rather than patching partial files in place**: the .json was correct (ensure_ascii=True default), but .txt and .srt were potentially truncated. Cleanest path was to delete the failed episode's whole output dir and let the pipeline regenerate everything with the encoding fix.
---
## Problems Encountered
- **`'charmap' codec` encoding error in transcriber.py**
- Cause: `open(... "w")` defaulted to Windows cp1252; Whisper output contained U+2044 (fraction slash) which cp1252 cannot encode.
- Fix: added `encoding="utf-8"` to the three open() calls at `src/transcriber.py:93,97,101`.
- Audited the rest of the pipeline: `diarizer.py`, `batch_process.py`, and other JSON writers use `json.dump` default `ensure_ascii=True`, which escapes unicode to ASCII before encoding — safe under cp1252 even without explicit utf-8. Only `transcriber.py` writes raw unicode (transcript.txt, transcript.srt).
- **Failed episode left inconsistent output**
- Episode 65 (`2011/10 - October/10-15-11 HR 2`) had `transcript.json` written successfully but `transcript.txt` truncated mid-encode.
- Fix: `rm -rf` the entire episode output dir; batch_process redoes it cleanly on next pass.
- **Monitor refired the same error every poll**
- Initial monitor used `grep -E "ERROR" $LOG | tail -1` each iteration, so a single historical error line emitted a notification every 60s.
- Fix: track error count between polls; only emit when count grows. Same pattern applied to download FAILED counts.
- **batch_process snapshot taken before download finished**
- `all_mp3s = ...` is computed once at startup. The 53 newly-downloaded MP3s (late-2016, 2017, 2018) are not visible to the currently-running batch.
- Mitigation: after current 519-snapshot run finishes, relaunch batch_process once. Resumability via existence-check makes the re-run only process the new files.
---
## Files Modified / Created
| Path | Change |
|---|---|
| `projects/radio-show/audio-processor/src/transcriber.py` | Added `encoding="utf-8"` to all three `open()` calls in `Transcript.save()` (lines 93, 97, 101) |
| `projects/radio-show/audio-processor/import_to_sqlite.py` | NEW. Walks archive-data/transcripts, imports JSONs into archive.db with FTS5. sha256-keyed idempotency. |
| `projects/radio-show/audio-processor/batch_process.py` | (written in a prior session but previously untracked; added to version control in this commit with no edits this session) |
| `projects/radio-show/audio-processor/archive-data/episodes/{2010..2018}/` | Filled in by download_full_archive.py — 88 new files |
| `projects/radio-show/audio-processor/archive-data/transcripts/{2010,2011}/...` | Per-episode output dirs — written by batch_process |
| `projects/radio-show/audio-processor/archive-data/archive.db` | NEW (smoke-test rebuild, 20.5 MB at 208 episodes) |
| `projects/radio-show/audio-processor/logs/download.log` | Background download output |
| `projects/radio-show/audio-processor/logs/batch_process.log` | Background batch output |
---
## SQLite Schema (full DDL)
```sql
CREATE TABLE episodes (
id INTEGER PRIMARY KEY,
rel_path TEXT NOT NULL UNIQUE,
year INTEGER NOT NULL,
title TEXT,
air_date TEXT,
duration_sec REAL NOT NULL,
language TEXT,
language_probability REAL,
num_speakers INTEGER,
transcript_sha256 TEXT NOT NULL,
processed_at TEXT NOT NULL
);
CREATE INDEX idx_episodes_year ON episodes(year);
CREATE INDEX idx_episodes_air_date ON episodes(air_date);
CREATE TABLE segments (
id INTEGER PRIMARY KEY,
episode_id INTEGER NOT NULL REFERENCES episodes(id) ON DELETE CASCADE,
seg_idx INTEGER NOT NULL,
start_sec REAL, end_sec REAL,
text TEXT NOT NULL,
UNIQUE(episode_id, seg_idx)
);
CREATE INDEX idx_segments_episode ON segments(episode_id, start_sec);
CREATE TABLE turns (
id INTEGER PRIMARY KEY,
episode_id INTEGER NOT NULL REFERENCES episodes(id) ON DELETE CASCADE,
speaker TEXT NOT NULL, -- HOST / CO-HOST / CALLER / BUMPER
start_sec REAL, end_sec REAL,
confidence REAL
);
CREATE INDEX idx_turns_episode ON turns(episode_id, start_sec);
CREATE INDEX idx_turns_speaker ON turns(episode_id, speaker);
CREATE TABLE intros (
id INTEGER PRIMARY KEY,
episode_id INTEGER NOT NULL REFERENCES episodes(id) ON DELETE CASCADE,
name TEXT NOT NULL,
role_hint TEXT, -- caller / cohost / fillin
intro_time_sec REAL,
affiliation TEXT, fillin_for TEXT,
source_text TEXT
);
CREATE INDEX idx_intros_episode ON intros(episode_id);
CREATE INDEX idx_intros_name ON intros(name);
CREATE TABLE qa_pairs (
id INTEGER PRIMARY KEY,
episode_id INTEGER NOT NULL REFERENCES episodes(id) ON DELETE CASCADE,
question_start_sec REAL, question_end_sec REAL,
answer_start_sec REAL, answer_end_sec REAL,
question_text TEXT NOT NULL,
answer_text TEXT NOT NULL,
caller_name TEXT, caller_role TEXT,
topic TEXT, topic_tags TEXT -- JSON array as TEXT
);
CREATE INDEX idx_qa_episode ON qa_pairs(episode_id);
CREATE INDEX idx_qa_caller ON qa_pairs(caller_name);
CREATE VIRTUAL TABLE segments_fts USING fts5(
text, content='segments', content_rowid='id',
tokenize='porter unicode61'
);
CREATE VIRTUAL TABLE qa_fts USING fts5(
question_text, answer_text,
content='qa_pairs', content_rowid='id',
tokenize='porter unicode61'
);
-- + standard ai/ad triggers to keep FTS in sync on insert/delete
```
---
## Smoke-Test Results (post-import, mid-batch)
```
Found 208 complete episode directories under archive-data/transcripts/
inserted : 208
updated : 0
skipped : 0
errors : 0
db : archive-data/archive.db (20.5 MB)
wall : 1.9 seconds
```
| Year | Episodes | Hours |
|---|---|---|
| 2010 | 43 | 32.1 |
| 2011 | 165 | 122.2 |
| **Total at smoke-test time** | **208** | **154.3** |
| Table | Rows |
|---|---|
| episodes | 208 |
| segments | 19,745 |
| turns | 7,233 |
| intros | 1,117 |
| qa_pairs | 566 |
Air-date parsed for 204/208 episodes (4 misses are season/episode-format filenames like `s7e30` with no calendar date — accepted).
FTS5 queries verified:
- `segments_fts MATCH 'wireless'` returned 3 hits with correct episode attribution and snippets
- `qa_fts MATCH 'virus'` returned 3 hits with correct episode attribution
---
## Download Run — Final Stats
```
=== Summary ===
Total remote files : 589
Total remote bytes : 7.53 GB
Already present : 501 files / 4.88 GB
Newly downloaded : 88 files / 2.65 GB
Errors : 0
Wall time : 1799.3s
```
| Year | Local MP3 count |
|---|---|
| 2010 | 43 |
| 2011 | 200 |
| 2012 | 98 |
| 2014 | 81 |
| 2015 | 50 |
| 2016 | 54 |
| 2017 | 41 |
| 2018 | 5 |
| **Total** | **572** |
(572 vs 589-remote-total: 17-file delta is case-variant duplicates `.MP3`/`.mp3` already counted under one local name, not missing files.)
---
## Credentials
### IX Server (archive source)
- **Vault path:** `infrastructure/ix-server.sops.yaml`
- **Host:** 172.16.3.10 (Tailscale required)
- **External:** ix.azcomputerguru.com / 72.194.62.5
- **SSH port:** 22
- **OS:** Rocky Linux (WHM/cPanel; WHM 2087, cPanel 2083)
- **Username:** root
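- **Password:** [REDACTED — retrieve from the SOPS vault entry listed under "Vault path" above; never record credentials in plaintext in a committed log, and rotate any credential that was]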
- **Notes:** Use paramiko with `look_for_keys=False, allow_agent=False, timeout=30, banner_timeout=30, auth_timeout=30`. Set `transport.set_keepalive(30)` and `sftp.get_channel().settimeout(120)` for long sessions. SSH from command line is blocked by key-agent interference on this machine.
### Jupiter (Unraid — planned destination for archive.db)
- **Vault path:** `infrastructure/jupiter-unraid-primary.sops.yaml`
- (Container setup pending — no work done yet, just architectural decision)
---
## Infrastructure & Paths
| Resource | Value |
|---|---|
| Audio processor root | `c:\Users\guru\ClaudeTools\projects\radio-show\audio-processor\` |
| Episodes root (local) | `archive-data/episodes/<year>/...` |
| Transcripts root (local) | `archive-data/transcripts/<year>/.../<stem>/` |
| Archive DB (local) | `archive-data/archive.db` |
| Per-episode outputs | `transcript.json`, `transcript.txt`, `transcript.srt`, `diarization.json`, `intros.json`, `qa.json` |
| Voice profiles | `voice-profiles/` (181 profiles loaded by current run) |
| Background log dir | `logs/` (download.log, batch_process.log) |
| Remote archive root | `/home/gurushow/public_html/archive/{2010-2018}/` on IX |
| Planned Jupiter dir | `/mnt/user/appdata/radio-archive/` |
---
## Commands Run (key invocations)
```bash
# Resume download (from audio-processor dir, in venv)
IX_PASSWORD='<redacted: read from infrastructure/ix-server.sops.yaml>' .venv/Scripts/python.exe download_full_archive.py > logs/download.log 2>&1
# Resume batch transcribe + diarize (no env needed)
.venv/Scripts/python.exe batch_process.py >> logs/batch_process.log 2>&1
# Initial DB build / smoke test
.venv/Scripts/python.exe import_to_sqlite.py --rebuild
# Subsequent incremental imports (after each batch_process pass)
.venv/Scripts/python.exe import_to_sqlite.py
```
---
## Pending / Next Up
1. **Wait for current batch_process to finish** the 519-file snapshot (currently at 211/519, 147 transcribed since restart).
2. **Re-launch batch_process once more** — picks up the 53 new MP3s downloaded after the snapshot was taken (5 late-2016 + 41 in 2017 + 5 in 2018 + 2 stragglers).
3. **Re-run import_to_sqlite.py** (incremental, idempotent — only the new ones do real work).
4. **Stand up the Jupiter Docker container**:
- Create `/mnt/user/appdata/radio-archive/` on Jupiter
- Define container (FastAPI + sqlite, ~50 lines) — read-only mount of `archive.db`
- Expose only on Tailscale interface, not on the public IP
- rsync `archive.db` from GURU-BEAST-ROG to Jupiter as the deploy step
5. **Decide on speaker-name resolution view** once query patterns emerge.
6. **(Future)** profile-build for Randall, Rob, and named producers (Andrew/Shannon/Ken) so non-Mike-non-Tara speakers stop falling into the CALLER bucket. Per the prior session log, this is what's inflating Q&A false-positive rates in early-years and 2018/2019 episodes.
---
## Reference Information
- **Encoding rule for Windows Python:** any `open(...)` that may write or read non-ASCII text (transcripts, captions, raw text dumps) must specify `encoding="utf-8"`. JSON writes via `json.dump` with default `ensure_ascii=True` are safe but defensive `encoding="utf-8"` doesn't hurt.
- **batch_process resumability:** existence-check on all four output JSONs. To force a redo, delete the episode's output directory.
- **Importer resumability:** sha256 of `transcript.json` recorded per episode. Hash mismatch → cascade-delete + reinsert in one transaction.
- **FTS5 trigger pattern (external content):** `INSERT INTO fts(rowid, ...)` for ai trigger; `INSERT INTO fts(fts, rowid, ...) VALUES('delete', ...)` for ad trigger. Same column count for both.
- **Per-year MP3 totals on IX:** 2010 (52), 2011 (200), 2012 (98), 2014 (81), 2015 (50), 2016 (54), 2017 (41), 2018 (5) — note 2013 directory does not exist on the source.

View File

@@ -90,15 +90,15 @@ class Transcript:
output_dir.mkdir(parents=True, exist_ok=True)
# JSON with full detail
with open(output_dir / "transcript.json", "w") as f:
with open(output_dir / "transcript.json", "w", encoding="utf-8") as f:
json.dump(self.to_dict(), f, indent=2)
# Plain text
with open(output_dir / "transcript.txt", "w") as f:
with open(output_dir / "transcript.txt", "w", encoding="utf-8") as f:
f.write(self.full_text)
# SRT subtitles
with open(output_dir / "transcript.srt", "w") as f:
with open(output_dir / "transcript.srt", "w", encoding="utf-8") as f:
f.write(self.to_srt())
console.print(f"[green]Transcript saved to {output_dir}[/green]")