QAPair gets caller_name and caller_role fields, populated by a new attach_caller_names(pairs, transcript_segments) helper. For each pair, it finds the opening intro that is active at the question_start time (8s forward tolerance, no backward limit — a caller's call can run for 10+ minutes and the intro happens once at the start) and attaches the speaker name.

Validation on the 9-episode test set: 19/19 Q&A pairs (100%) now have caller names attached.

Examples of corrections from oracle attribution:
  2018-s10e18 @ 73:36  Christopher (was misattributed to "Tara")
  2015-s7e19 @ 35:45   William (was misattributed to "Tara")
  2010-05-08-hr1       Jackie x3, Bruce
  2012-03-10-hr1       Adam x2
  2016-s8e43           John, Doug
  2017-s9e30           Tom, Denise x3, Charlie

speaker_oracle.py: adds a speaker_at(time, intros) helper used both by the existing resolve_speakers() and by the new caller-name attachment. Also adds the "let's fit/bring/put X in/on" intro pattern variant (it caught Charlie at 70:21 in 2017-s9e30, which "talk to X" missed).

download_full_archive.py: SSH keepalive every 30s plus per-file retry-on-failure (up to 3 attempts with reconnect). An earlier run hung on a dead connection at file 109 of 589 with no recovery; the restarted run is now running at ~10 MB/s vs ~2-3 MB/s before.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
130 lines
4.3 KiB
Python
130 lines
4.3 KiB
Python
"""
Download the full Computer Guru Show archive from the IX server (172.16.3.10).

Mirrors the year-based directory structure as-is to archive-data/episodes/.
Resumable: skips files already present with matching size.
Requires Tailscale.
"""
|
|
import os
|
|
import sys
|
|
import time
|
|
import paramiko
|
|
from pathlib import Path
|
|
|
|
# The SSH password is read from the environment; without it the script
# cannot authenticate, so bail out immediately with a non-zero status.
password = os.getenv("IX_PASSWORD")
if not password:
    sys.stderr.write("IX_PASSWORD env var not set\n")
    raise SystemExit(1)

# Remote archive root; each year lives in its own sub-directory beneath it.
REMOTE_ROOT = "/home/gurushow/public_html/archive"

# Years to mirror (2013 is not in the list).
YEARS = [str(y) for y in (2010, 2011, 2012, 2014, 2015, 2016, 2017, 2018)]

# Local mirror root, next to this script; created up front if missing.
LOCAL_ROOT = Path(__file__).parent.joinpath("archive-data", "episodes")
LOCAL_ROOT.mkdir(parents=True, exist_ok=True)
|
|
|
|
def connect():
    """Open an SSH connection to 172.16.3.10 and start an SFTP session on it.

    Authenticates as ``root`` with the module-level ``password``; local key
    files and the SSH agent are not consulted. A keepalive is sent every
    30 seconds on the transport and SFTP channel operations time out after
    120 seconds.

    Returns:
        A ``(client, sftp)`` tuple: the connected ``paramiko.SSHClient`` and
        the SFTP client opened on it.

    Raises:
        Any exception raised while connecting or opening the SFTP session.
        The half-open client is closed before the exception propagates, so
        repeated reconnect attempts do not leak connections.
    """
    c = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification — confirm this is acceptable for this network.
    c.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        c.connect("172.16.3.10", username="root", password=password,
                  look_for_keys=False, allow_agent=False,
                  timeout=30, banner_timeout=30, auth_timeout=30)
        transport = c.get_transport()
        if transport is not None:
            transport.set_keepalive(30)  # send keepalive every 30s
        s = c.open_sftp()
        s.get_channel().settimeout(120)  # per-operation timeout
    except Exception:
        # Do not leave a half-open client behind when setup fails part-way.
        c.close()
        raise
    return c, s
|
|
|
|
|
|
# Open the shared SSH/SFTP session used by the rest of the script.
print("Connecting to 172.16.3.10...", flush=True)
session = connect()
client, sftp = session
print("Connected.", flush=True)
|
|
|
|
|
|
def list_remote_mp3s(year: str) -> list[str]:
    """Return the remote paths of all MP3 files under ``REMOTE_ROOT/<year>``.

    Runs ``find`` (case-insensitive ``*.mp3`` match) over the module-level
    SSH ``client``. The command's stderr is discarded on the remote side, and
    blank output lines are dropped.
    """
    find_cmd = f"find '{REMOTE_ROOT}/{year}' -iname '*.mp3' 2>/dev/null"
    _, out, _ = client.exec_command(find_cmd)
    listing = out.read().decode()

    paths = []
    for raw in listing.splitlines():
        cleaned = raw.strip()
        if cleaned:
            paths.append(cleaned)
    return paths
|
|
|
|
|
|
# Tunables for the transfer loop below.
MAX_ATTEMPTS = 3        # download tries per file before it is recorded as failed
RECONNECT_DELAY = 5     # seconds to pause before opening a replacement session
MAX_ERRORS_SHOWN = 20   # cap on the error lines printed in the final report

# Running totals for the summary.
total_files = 0
total_bytes = 0
skipped_files = 0
skipped_bytes = 0
downloaded_files = 0
downloaded_bytes = 0
errors = []


def _close_session():
    """Best-effort close of the current SFTP and SSH handles."""
    for handle in (sftp, client):
        try:
            handle.close()
        except Exception:
            # The session is being discarded; a failed close changes nothing.
            pass


def _reconnect():
    """Replace the module-level session; return True if a new one was opened.

    A failed reconnect is reported through the return value instead of an
    exception so one unreachable moment cannot abort the whole run.
    """
    global client, sftp
    _close_session()
    time.sleep(RECONNECT_DELAY)
    try:
        client, sftp = connect()
    except Exception:
        return False
    return True


def _remote_size(path):
    """Return the size in bytes of *path* as reported by the current session."""
    return sftp.stat(path).st_size


def _retry_after_reconnect(action, *args):
    """Call ``action(*args)``; on failure reconnect once and call it again.

    The original exception is re-raised when the reconnect itself fails; the
    second call's exception propagates when the retry fails.
    """
    try:
        return action(*args)
    except Exception:
        if not _reconnect():
            raise
        return action(*args)


t_start = time.monotonic()

for year in YEARS:
    print(f"\n=== {year} ===", flush=True)
    try:
        remote_paths = _retry_after_reconnect(list_remote_mp3s, year)
    except Exception as e:
        # Record the failure and move on so the other years are still mirrored.
        print(f" listing failed: {e}", flush=True)
        errors.append(f"list {year}: {e}")
        continue
    print(f" {len(remote_paths)} MP3 files found on remote", flush=True)

    for remote in remote_paths:
        # Path relative to the remote root, mirrored as-is under LOCAL_ROOT.
        rel = remote[len(REMOTE_ROOT) + 1:]
        local = LOCAL_ROOT / rel
        local.parent.mkdir(parents=True, exist_ok=True)

        try:
            remote_size = _retry_after_reconnect(_remote_size, remote)
        except Exception as e:
            errors.append(f"stat {remote}: {e}")
            continue

        total_files += 1
        total_bytes += remote_size

        # Resume support: a local file of identical size counts as complete.
        if local.exists() and local.stat().st_size == remote_size:
            skipped_files += 1
            skipped_bytes += remote_size
            continue

        size_mb = remote_size / 1024 / 1024
        print(f" [{downloaded_files + 1:3d}] {rel} ({size_mb:.1f} MB)...", end="", flush=True)
        t0 = time.monotonic()

        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                sftp.get(remote, str(local))
            except Exception as e:
                if attempt < MAX_ATTEMPTS:
                    print(f" retry {attempt} ({e})...", end="", flush=True)
                else:
                    print(f" FAILED after {attempt} attempts: {e}", flush=True)
                    errors.append(f"get {remote}: {e}")
                # Refresh the session after every failure, including the last,
                # so the next file does not start on a dead connection. Any
                # partial local file is left in place; its size will not match
                # the remote, so it is re-fetched on the next attempt or run.
                _reconnect()
            else:
                elapsed = time.monotonic() - t0
                mbps = size_mb / elapsed if elapsed > 0 else 0
                print(f" done ({elapsed:.1f}s, {mbps:.1f} MB/s)", flush=True)
                downloaded_files += 1
                downloaded_bytes += remote_size
                break

elapsed_total = time.monotonic() - t_start
print("\n=== Summary ===", flush=True)
print(f" Total remote files : {total_files}", flush=True)
print(f" Total remote bytes : {total_bytes / 1024 / 1024 / 1024:.2f} GB", flush=True)
print(f" Already present : {skipped_files} files / {skipped_bytes / 1024 / 1024 / 1024:.2f} GB", flush=True)
print(f" Newly downloaded : {downloaded_files} files / {downloaded_bytes / 1024 / 1024 / 1024:.2f} GB", flush=True)
print(f" Errors : {len(errors)}", flush=True)
print(f" Wall time : {elapsed_total:.1f}s", flush=True)
if errors:
    print("\n=== Errors ===", flush=True)
    for e in errors[:MAX_ERRORS_SHOWN]:
        print(f" {e}", flush=True)
    hidden = len(errors) - MAX_ERRORS_SHOWN
    if hidden > 0:
        # Make the truncation visible instead of silently dropping entries.
        print(f" ... and {hidden} more", flush=True)

_close_session()
|