Files
claudetools/projects/radio-show/audio-processor/download_full_archive.py
Mike Swanson 488bf5849e radio: attach caller names to Q&A pairs from transcript intros
QAPair gets caller_name and caller_role fields populated by a new
attach_caller_names(pairs, transcript_segments) helper. For each pair,
finds the active opening intro at the question_start time (8s forward
tolerance, no backward limit — a caller's call can run for 10+ minutes
and the intro happens once at the start) and attaches the speaker name.

Validation on 9-episode test set:
  19/19 Q&A pairs (100%) now have caller names attached.

Examples of corrections from oracle attribution:
  2018-s10e18 @ 73:36  Christopher (was misattributed to "Tara")
  2015-s7e19 @ 35:45   William     (was misattributed to "Tara")
  2010-05-08-hr1       Jackie x3, Bruce
  2012-03-10-hr1       Adam x2
  2016-s8e43           John, Doug
  2017-s9e30           Tom, Denise x3, Charlie

speaker_oracle.py: adds speaker_at(time, intros) helper used both by the
existing resolve_speakers() and the new caller-name attachment. Also
adds the "let's fit/bring/put X in/on" intro pattern variant (caught
Charlie at 70:21 in 2017-s9e30 that "talk to X" missed).

download_full_archive.py: SSH keepalive every 30s + per-file retry-on-
failure (up to 3 attempts with reconnect). Earlier run hung on a dead
connection at file 109 of 589 with no recovery; restarted run is now
running at ~10 MB/s vs ~2-3 MB/s before.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 16:55:31 -07:00

130 lines
4.3 KiB
Python

"""
Download the full Computer Guru Show archive from IX server (172.16.3.10).
Mirrors the year-based directory structure as-is to archive-data/episodes/.
Resumable: skips files already present with matching size.
Requires Tailscale.
"""
import os
import sys
import time
import paramiko
from pathlib import Path
# The SSH password is read from the environment; abort early when it is
# missing or empty so nothing else (directory creation, network) happens.
password = os.environ.get("IX_PASSWORD")
if not password:
    print("IX_PASSWORD env var not set", file=sys.stderr)
    raise SystemExit(1)

# Local mirror root lives next to this script; created on first run.
LOCAL_ROOT = Path(__file__).parent.joinpath("archive-data", "episodes")
LOCAL_ROOT.mkdir(parents=True, exist_ok=True)

# Remote archive root; one sub-directory per year is scanned.
REMOTE_ROOT = "/home/gurushow/public_html/archive"

# Year directories to mirror (note: 2013 is not in this list).
YEARS = [
    "2010",
    "2011",
    "2012",
    "2014",
    "2015",
    "2016",
    "2017",
    "2018",
]
def connect():
    """Open an SSH connection to the IX server and an SFTP session on it.

    Authenticates as ``root`` with the module-level ``password`` only
    (key lookup and the SSH agent are disabled). The TCP connect, banner
    and auth phases each get a 30-second timeout, a keepalive is sent every
    30 seconds, and the SFTP channel gets a 120-second timeout.

    Returns:
        A ``(client, sftp)`` tuple: the connected ``paramiko.SSHClient`` and
        the SFTP client opened on it.

    Raises:
        Whatever paramiko/socket raises when connecting or opening SFTP
        fails. The SSH client is closed before the exception propagates so
        a half-open connection is not leaked.
    """
    c = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification. Kept as-is to preserve behavior; consider loading known
    # host keys instead if the network path is not fully trusted.
    c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        c.connect(
            "172.16.3.10",
            username="root",
            password=password,
            look_for_keys=False,
            allow_agent=False,
            timeout=30,
            banner_timeout=30,
            auth_timeout=30,
        )
        transport = c.get_transport()
        if transport is not None:
            transport.set_keepalive(30)  # send keepalive every 30s
        s = c.open_sftp()
        s.get_channel().settimeout(120)  # per-operation timeout
    except Exception:
        # Do not leak the client (and its transport) when setup fails
        # part-way, e.g. SSH connected but the SFTP subsystem did not open.
        c.close()
        raise
    return c, s
# Establish the initial SSH/SFTP session; `client` and `sftp` are module-level
# names used by the rest of the script.
print("Connecting to 172.16.3.10...", flush=True)
client, sftp = connect()
print("Connected.", flush=True)
def list_remote_mp3s(year: str) -> list[str]:
    """Return the remote paths of all MP3 files under ``REMOTE_ROOT/<year>``.

    Runs ``find ... -iname '*.mp3'`` over the module-level SSH ``client``
    (stderr is discarded on the remote side) and returns one stripped path
    per non-blank output line.
    """
    command = f"find '{REMOTE_ROOT}/{year}' -iname '*.mp3' 2>/dev/null"
    _stdin, out, _stderr = client.exec_command(command)
    listing = out.read().decode()

    paths = []
    for raw_line in listing.splitlines():
        cleaned = raw_line.strip()
        if cleaned:
            paths.append(cleaned)
    return paths
# ---------------------------------------------------------------------------
# Main download loop.
#
# For every year in YEARS: list the remote MP3s, skip files already present
# locally with the same size, and download the rest with per-file retries
# (reconnecting between attempts). A summary is always printed and the
# connection is always closed, even if the run is interrupted or a reconnect
# fails.
# ---------------------------------------------------------------------------
MAX_ATTEMPTS = 3     # download attempts per file before recording an error
RETRY_DELAY_S = 5    # pause before reconnecting after a failed attempt

total_files = 0
total_bytes = 0
skipped_files = 0
skipped_bytes = 0
downloaded_files = 0
downloaded_bytes = 0
errors = []
t_start = time.monotonic()

try:
    for year in YEARS:
        print(f"\n=== {year} ===", flush=True)
        remote_paths = list_remote_mp3s(year)
        print(f" {len(remote_paths)} MP3 files found on remote", flush=True)

        for remote in remote_paths:
            # Path relative to REMOTE_ROOT, mirrored as-is under LOCAL_ROOT.
            rel = remote[len(REMOTE_ROOT) + 1:]
            local = LOCAL_ROOT / rel
            local.parent.mkdir(parents=True, exist_ok=True)

            # NOTE(review): a stat failure is only recorded; it does not
            # trigger a reconnect the way a failed download does.
            try:
                remote_size = sftp.stat(remote).st_size
            except Exception as e:
                errors.append(f"stat {remote}: {e}")
                continue

            total_files += 1
            total_bytes += remote_size

            # Resume support: a local file of identical size counts as done.
            if local.exists() and local.stat().st_size == remote_size:
                skipped_files += 1
                skipped_bytes += remote_size
                continue

            size_mb = remote_size / 1024 / 1024
            print(f" [{downloaded_files + 1:3d}] {rel} ({size_mb:.1f} MB)...", end="", flush=True)

            attempt = 0
            while True:
                attempt += 1
                # Time each attempt separately so the reported duration and
                # throughput exclude earlier failed attempts and the sleep.
                t0 = time.monotonic()
                try:
                    sftp.get(remote, str(local))
                except Exception as e:
                    if attempt >= MAX_ATTEMPTS:
                        print(f" FAILED after {attempt} attempts: {e}", flush=True)
                        errors.append(f"get {remote}: {e}")
                        break
                    print(f" retry {attempt} ({e})...", end="", flush=True)
                    # Reconnect on failure. Closing is best-effort because
                    # the old connection may already be dead.
                    try:
                        sftp.close()
                        client.close()
                    except Exception:
                        pass
                    time.sleep(RETRY_DELAY_S)
                    # If this raises, the `finally` below still prints the
                    # summary before the exception ends the script.
                    client, sftp = connect()
                else:
                    elapsed = time.monotonic() - t0
                    mbps = size_mb / elapsed if elapsed > 0 else 0
                    print(f" done ({elapsed:.1f}s, {mbps:.1f} MB/s)", flush=True)
                    downloaded_files += 1
                    downloaded_bytes += remote_size
                    break
finally:
    elapsed_total = time.monotonic() - t_start
    print("\n=== Summary ===", flush=True)
    print(f" Total remote files : {total_files}", flush=True)
    print(f" Total remote bytes : {total_bytes / 1024 / 1024 / 1024:.2f} GB", flush=True)
    print(f" Already present : {skipped_files} files / {skipped_bytes / 1024 / 1024 / 1024:.2f} GB", flush=True)
    print(f" Newly downloaded : {downloaded_files} files / {downloaded_bytes / 1024 / 1024 / 1024:.2f} GB", flush=True)
    print(f" Errors : {len(errors)}", flush=True)
    print(f" Wall time : {elapsed_total:.1f}s", flush=True)
    if errors:
        print("\n=== Errors ===", flush=True)
        # Only the first 20 are listed; the full count is in the summary.
        for e in errors[:20]:
            print(f" {e}", flush=True)
    sftp.close()
    client.close()