Adds quality-filter controls to the search UI: a "min score" select (any/2+/3+/4+/5) and a "hide banter" checkbox. Q/A hits gain a small color-coded usefulness badge (1-5, red->green) and a topic_class tag (computer-help, banter, off-topic, promo). Low-score and banter rows render dimmed by default so they're visible but de-emphasized. Defaults to "any" + banter visible to preserve existing search habits. Mike toggles up when he wants quality. URL-encoded params built via URLSearchParams so empty values don't leak into requests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
389 lines
14 KiB
Python
389 lines
14 KiB
Python
"""
|
|
Radio archive query server. Read-only FastAPI over the SQLite archive.db.
|
|
|
|
Endpoints:
|
|
GET / Landing page with search UI
|
|
GET /api/episodes List all episodes (year, title, duration)
|
|
GET /api/episodes/{id} Episode detail: intros + qa_pairs
|
|
GET /api/episodes/{id}/transcript Segments + speaker turns (two separate lists, each in order)
|
|
GET /api/search?q=...&kind=... FTS over segments and/or qa_pairs (optional limit, min_score, exclude_banter)
|
|
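GET /api/callers Top recurring caller_names
GET /api/stats Row counts per table + episodes/hours by year
GET /api/db.sqlite Download the archive.db file itself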
|
|
|
|
Config via env:
|
|
ARCHIVE_DB path to archive.db (default /data/archive.db)
|
|
PORT listen port (default 8765)
|
|
"""
|
|
import json
|
|
import os
|
|
import sqlite3
|
|
from contextlib import asynccontextmanager
|
|
from pathlib import Path
|
|
|
|
from fastapi import FastAPI, HTTPException, Query
|
|
from fastapi.responses import FileResponse, HTMLResponse
|
|
|
|
# Runtime configuration, read from the environment once at import time.
# ARCHIVE_DB: filesystem path of the SQLite archive.
DB_PATH = os.getenv("ARCHIVE_DB", "/data/archive.db")
# PORT: TCP port used when this module is run as a script.
PORT = int(os.getenv("PORT", "8765"))
|
|
|
|
|
|
def _connect() -> sqlite3.Connection:
    """Open the archive database read-only and return the connection.

    Rows are returned as ``sqlite3.Row`` so callers can convert them with
    ``dict(row)``. The connection is created with ``check_same_thread=False``
    so it may be used from threads other than the one that opened it.

    Raises:
        RuntimeError: if no file exists at ``DB_PATH``.
    """
    db_file = Path(DB_PATH)
    if not db_file.exists():
        raise RuntimeError(f"Archive DB not found at {DB_PATH}")
    # Build the URI with as_uri() rather than raw interpolation: it
    # percent-encodes characters such as "?", "#" and "%" that SQLite's URI
    # parser would otherwise read as query/fragment delimiters or escapes.
    uri = f"{db_file.resolve().as_uri()}?mode=ro"
    conn = sqlite3.connect(uri, uri=True, check_same_thread=False)
    conn.row_factory = sqlite3.Row
    return conn
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Open the archive connection for the app's lifetime and close it after.

    The connection is stored on ``app.state.db``, which the request handlers
    in this module read.
    """
    app.state.db = _connect()
    try:
        yield
    finally:
        # Close even when an exception is thrown into the generator, so the
        # connection is never leaked on an abnormal shutdown.
        app.state.db.close()
|
|
|
|
|
|
# ASGI application object; `lifespan` opens the shared DB connection before
# serving starts and closes it once serving ends.
app = FastAPI(title="Computer Guru Radio Archive", lifespan=lifespan)
|
|
|
|
|
|
def fts_escape(q: str) -> str:
    """Turn free-form user text into a safe FTS5 MATCH expression.

    The input is split on whitespace and every token is emitted as an FTS5
    string literal, so reserved characters and keywords are matched literally.
    Double quotes inside a token are doubled, which is how FTS5 escapes a
    quote within a string; without that, a stray ``"`` in the query produces
    a malformed expression.

    Args:
        q: Raw search text.

    Returns:
        Space-separated quoted tokens, or ``""`` when ``q`` has no tokens.
    """
    quoted = ('"' + tok.replace('"', '""') + '"' for tok in q.split())
    return " ".join(quoted)
|
|
|
|
|
|
@app.get("/api/episodes")
def list_episodes(year: int | None = None, limit: int = 1000):
    """List episodes with their length in minutes and Q&A / intro counts.

    Args:
        year: When given, only episodes whose ``year`` equals this value.
        limit: Maximum number of rows, passed straight to SQL ``LIMIT``.

    Returns:
        A list of dicts, ordered by ``air_date`` then ``title``; rows with a
        NULL ``air_date`` are ordered as if their date were ``'9999'``.
    """
    year_filter = "" if year is None else " WHERE year = ?"
    bind_values: list = [] if year is None else [year]
    bind_values.append(limit)

    query = (
        """
        SELECT id, year, title, air_date, ROUND(duration_sec/60.0,1) AS minutes,
               (SELECT COUNT(*) FROM qa_pairs q WHERE q.episode_id = e.id) AS qa_count,
               (SELECT COUNT(*) FROM intros i WHERE i.episode_id = e.id) AS intro_count
        FROM episodes e
    """
        + year_filter
        + " ORDER BY COALESCE(air_date, '9999') ASC, title ASC LIMIT ?"
    )

    conn: sqlite3.Connection = app.state.db
    return [dict(row) for row in conn.execute(query, bind_values).fetchall()]
|
|
|
|
|
|
@app.get("/api/episodes/{episode_id}")
def episode_detail(episode_id: int):
    """Return one episode together with its intros and Q&A pairs.

    Returns:
        ``{"episode": ..., "intros": [...], "qa_pairs": [...]}``. Each Q&A
        pair's ``topic_tags`` is decoded from its stored JSON text; a NULL or
        empty value becomes ``[]``.

    Raises:
        HTTPException: 404 when no episode has this id.
    """
    conn: sqlite3.Connection = app.state.db
    key = (episode_id,)

    episode_row = conn.execute("SELECT * FROM episodes WHERE id = ?", key).fetchone()
    if episode_row is None:
        raise HTTPException(404, "episode not found")

    intro_sql = (
        "SELECT name, role_hint, intro_time_sec, affiliation, fillin_for, source_text "
        "FROM intros WHERE episode_id = ? ORDER BY intro_time_sec"
    )
    qa_sql = (
        "SELECT id, question_start_sec, question_end_sec, "
        "answer_start_sec, answer_end_sec, "
        "question_text, answer_text, caller_name, caller_role, topic, topic_tags, "
        "usefulness_score, topic_class, is_banter "
        "FROM qa_pairs WHERE episode_id = ? ORDER BY question_start_sec"
    )
    intro_rows = conn.execute(intro_sql, key).fetchall()
    qa_rows = conn.execute(qa_sql, key).fetchall()

    pairs = []
    for row in qa_rows:
        pair = dict(row)
        # topic_tags is stored as JSON text; hand clients a real list.
        pair["topic_tags"] = json.loads(row["topic_tags"] or "[]")
        pairs.append(pair)

    return {
        "episode": dict(episode_row),
        "intros": [dict(row) for row in intro_rows],
        "qa_pairs": pairs,
    }
|
|
|
|
|
|
@app.get("/api/episodes/{episode_id}/transcript")
def episode_transcript(episode_id: int):
    """Return an episode's transcript segments and speaker turns.

    Returns:
        ``{"episode": ..., "segments": [...], "turns": [...]}`` where segments
        are ordered by ``seg_idx`` and turns by ``start_sec``. The two lists
        are returned separately, not interleaved.

    Raises:
        HTTPException: 404 when no episode has this id.
    """
    conn: sqlite3.Connection = app.state.db
    key = (episode_id,)

    header = conn.execute(
        "SELECT id, title, year FROM episodes WHERE id = ?", key
    ).fetchone()
    if header is None:
        raise HTTPException(404, "episode not found")

    segment_sql = (
        "SELECT seg_idx, start_sec, end_sec, text FROM segments "
        "WHERE episode_id = ? ORDER BY seg_idx"
    )
    turn_sql = (
        "SELECT speaker, start_sec, end_sec, confidence FROM turns "
        "WHERE episode_id = ? ORDER BY start_sec"
    )
    segment_rows = conn.execute(segment_sql, key).fetchall()
    turn_rows = conn.execute(turn_sql, key).fetchall()

    return {
        "episode": dict(header),
        "segments": [dict(row) for row in segment_rows],
        "turns": [dict(row) for row in turn_rows],
    }
|
|
|
|
|
|
@app.get("/api/search")
def search(
    q: str = Query(..., min_length=2),
    kind: str = Query("both", pattern="^(both|segments|qa)$"),
    limit: int = Query(50, ge=1, le=500),
    min_score: int = Query(0, ge=0, le=5,
                           description="Minimum usefulness_score for Q&A hits (0=no filter)"),
    exclude_banter: bool = Query(False,
                                 description="Drop Q&A rows where is_banter=1"),
):
    """Full-text search over transcript segments and/or Q&A pairs.

    Args:
        q: User search text; escaped with ``fts_escape`` before use in MATCH.
        kind: ``"segments"``, ``"qa"`` or ``"both"``; selects which indexes run.
        limit: Maximum hits returned per result list.
        min_score: When above 0, keep only Q&A rows whose ``usefulness_score``
            is NULL or at least this value.
        exclude_banter: When true, keep only Q&A rows whose ``is_banter`` is
            NULL or 0.

    Returns:
        ``{"q": q, "segments": [...], "qa": [...]}``. Each list is ordered by
        bm25 rank, and snippets wrap matches in ``<mark>`` tags. Both lists
        are empty when the query contains no tokens.
    """
    match_expr = fts_escape(q)
    if not match_expr:
        return {"q": q, "segments": [], "qa": []}

    conn: sqlite3.Connection = app.state.db
    segment_hits = []
    qa_hits = []

    if kind in ("both", "segments"):
        segment_sql = """
            SELECT e.id AS episode_id, e.year, e.title, e.air_date,
                   s.start_sec, s.end_sec,
                   snippet(segments_fts, 0, '<mark>', '</mark>', '...', 16) AS snippet,
                   bm25(segments_fts) AS rank
            FROM segments_fts
            JOIN segments s ON s.id = segments_fts.rowid
            JOIN episodes e ON e.id = s.episode_id
            WHERE segments_fts MATCH ?
            ORDER BY rank LIMIT ?
            """
        segment_rows = conn.execute(segment_sql, (match_expr, limit)).fetchall()
        segment_hits = [dict(row) for row in segment_rows]

    if kind in ("both", "qa"):
        # A NULL score / banter flag means "not classified yet"; such rows are
        # kept so unprocessed Q&A pairs and old saved URLs keep working while
        # the classifier rolls out. The quality filters are extra WHERE
        # clauses layered on top of the FTS MATCH.
        bind: dict[str, object] = {"q": match_expr, "limit": limit}
        conditions = ["qa_fts MATCH :q"]
        if min_score > 0:
            bind["min_score"] = min_score
            conditions.append(
                "(p.usefulness_score IS NULL OR p.usefulness_score >= :min_score)"
            )
        if exclude_banter:
            conditions.append("(p.is_banter IS NULL OR p.is_banter = 0)")
        where_sql = " AND ".join(conditions)

        qa_sql = f"""
            SELECT e.id AS episode_id, e.year, e.title, e.air_date,
                   p.id AS qa_id, p.caller_name,
                   p.question_start_sec, p.answer_start_sec,
                   p.usefulness_score, p.topic_class, p.is_banter,
                   snippet(qa_fts, 0, '<mark>', '</mark>', '...', 16) AS q_snippet,
                   snippet(qa_fts, 1, '<mark>', '</mark>', '...', 16) AS a_snippet,
                   bm25(qa_fts) AS rank
            FROM qa_fts
            JOIN qa_pairs p ON p.id = qa_fts.rowid
            JOIN episodes e ON e.id = p.episode_id
            WHERE {where_sql}
            ORDER BY rank LIMIT :limit
            """
        qa_rows = conn.execute(qa_sql, bind).fetchall()
        qa_hits = [dict(row) for row in qa_rows]

    return {"q": q, "segments": segment_hits, "qa": qa_hits}
|
|
|
|
|
|
@app.get("/api/callers")
def top_callers(limit: int = 50):
    """Return caller names ranked by how many Q&A pairs carry that name.

    Rows with a NULL ``caller_name`` are ignored.

    Args:
        limit: Maximum number of callers returned.

    Returns:
        A list of ``{"caller_name": ..., "pairs": ...}`` dicts, most frequent
        first.
    """
    query = (
        "SELECT caller_name, COUNT(*) AS pairs FROM qa_pairs "
        "WHERE caller_name IS NOT NULL "
        "GROUP BY caller_name ORDER BY pairs DESC LIMIT ?"
    )
    conn: sqlite3.Connection = app.state.db
    caller_rows = conn.execute(query, (limit,)).fetchall()
    return [dict(row) for row in caller_rows]
|
|
|
|
|
|
@app.get("/api/stats")
def stats():
    """Return row counts per table plus episode count and hours per year.

    Returns:
        ``{"counts": {table: n, ...}, "by_year": [{"year", "episodes",
        "hours"}, ...]}`` with ``by_year`` ordered by year.
    """
    conn: sqlite3.Connection = app.state.db

    # Table names come from this fixed tuple, never from the request, so
    # interpolating them into the SQL text is safe.
    counts = {}
    for table in ("episodes", "segments", "turns", "intros", "qa_pairs"):
        counts[table] = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]

    yearly_sql = (
        "SELECT year, COUNT(*) AS episodes, "
        "ROUND(SUM(duration_sec)/3600.0, 1) AS hours "
        "FROM episodes GROUP BY year ORDER BY year"
    )
    by_year = [dict(row) for row in conn.execute(yearly_sql).fetchall()]

    return {"counts": counts, "by_year": by_year}
|
|
|
|
|
|
@app.get("/api/db.sqlite")
def download_db():
    """Serve the archive.db file itself, for offline laptop sync.

    Every transcript is already readable through /api/search, so handing out
    the underlying SQLite file discloses nothing that is not already exposed.

    Sync side: curl -o archive.db <host>:<port>/api/db.sqlite

    Raises:
        HTTPException: 404 when the database file does not exist.
    """
    archive = Path(DB_PATH)
    if archive.exists():
        return FileResponse(
            DB_PATH,
            media_type="application/vnd.sqlite3",
            filename="archive.db",
        )
    raise HTTPException(404, "archive db not present")
|
|
|
|
|
|
@app.get("/", response_class=HTMLResponse)
def index():
    """Serve the landing page: the single-page search UI held in INDEX_HTML."""
    landing_page = INDEX_HTML
    return landing_page
|
|
|
|
|
|
# Self-contained landing page (HTML + CSS + JS) served at "/".
#
# The script calls /api/stats once for the header line and /api/search on
# every (debounced) keystroke or control change. Request parameters are built
# with URLSearchParams, and min_score / exclude_banter are only sent when they
# differ from their defaults.
#
# Escaping: database text (title, caller name, topic class) goes through
# escapeHtml() before being placed in innerHTML. The snippet fields are
# inserted as-is because they carry the <mark> highlighting produced by the
# search query.
INDEX_HTML = """<!doctype html>
<html lang=en>
<meta charset=utf-8>
<title>Computer Guru Radio Archive</title>
<style>
  body { font: 14px/1.45 ui-sans-serif, system-ui; max-width: 920px; margin: 2em auto; padding: 0 1em; color: #222; }
  h1 { margin: 0 0 .25em; }
  .sub { color:#666; margin-bottom: 1.5em; }
  input[type=search] { width: 100%; padding: .6em .8em; font-size: 16px; box-sizing: border-box; }
  .controls { display:flex; gap:.5em; align-items:center; margin: .5em 0 1em; flex-wrap: wrap; }
  .controls label { font-size: 13px; color:#555; }
  .group { border-bottom: 1px solid #eee; padding: 1em 0; }
  .group h3 { margin: 0 0 .25em; font-size: 13px; color:#666; text-transform: uppercase; letter-spacing: .04em; }
  .hit { padding: .5em 0; }
  .hit .meta { font-size: 12px; color: #888; }
  .hit a { color: #06c; text-decoration: none; }
  .hit a:hover { text-decoration: underline; }
  mark { background: #ffec99; padding: 0 .15em; }
  .stats { font-size: 12px; color:#666; margin-top: 2em; }
  .empty { color:#999; padding: 1em 0; }
  .controls select, .controls input[type=checkbox] { margin-right: .35em; }
  .badge { display: inline-block; min-width: 1.6em; padding: 0 .35em; margin-right: .35em;
           font-size: 11px; font-weight: 600; text-align: center; border-radius: 3px;
           color: #fff; background: #999; vertical-align: 1px; }
  .badge.s5 { background: #2a8f43; }
  .badge.s4 { background: #5aa54b; }
  .badge.s3 { background: #999; }
  .badge.s2 { background: #c08a3a; }
  .badge.s1 { background: #b85a4a; }
  .topic { font-size: 11px; color: #888; padding: 0 .35em; border-radius: 3px;
           background: #f0f0f0; }
  .hit.dim { opacity: .55; }
</style>
<h1>Computer Guru Radio Archive</h1>
<div class=sub id=sub>...</div>
<input type=search id=q autofocus placeholder="search transcripts and Q&A — e.g. wireless, virus, BIOS">
<div class=controls>
  <label><input type=radio name=kind value=both checked> both</label>
  <label><input type=radio name=kind value=qa> Q&A only</label>
  <label><input type=radio name=kind value=segments> transcript only</label>
  <span style="border-left:1px solid #ddd; padding-left:.6em">
    <label>min score
      <select id=min_score>
        <option value=0>any</option>
        <option value=2>2+</option>
        <option value=3>3+</option>
        <option value=4>4+</option>
        <option value=5>5</option>
      </select>
    </label>
    <label><input type=checkbox id=exclude_banter> hide banter</label>
  </span>
</div>
<div id=results></div>
<div class=stats id=stats></div>
<script>
const q = document.getElementById('q');
const results = document.getElementById('results');
const sub = document.getElementById('sub');
const stats = document.getElementById('stats');

fetch('/api/stats').then(r => r.json()).then(s => {
  const c = s.counts;
  sub.textContent = `${c.episodes} episodes / ${c.qa_pairs} Q&A pairs / ${c.intros} intros / ${c.segments.toLocaleString()} segments`;
});

let timer;
q.addEventListener('input', () => {
  clearTimeout(timer);
  timer = setTimeout(runSearch, 250);
});
document.querySelectorAll('input[name=kind]').forEach(el => el.addEventListener('change', runSearch));
document.getElementById('min_score').addEventListener('change', runSearch);
document.getElementById('exclude_banter').addEventListener('change', runSearch);

function fmtTime(s) {
  if (s == null) return '';
  const m = Math.floor(s/60), sec = Math.floor(s%60);
  return `${m}:${sec.toString().padStart(2,'0')}`;
}

function escapeHtml(s) {
  return (s ?? '').replace(/[&<>"']/g, c => ({
    '&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'
  }[c]));
}

async function runSearch() {
  const term = q.value.trim();
  if (term.length < 2) { results.innerHTML = ''; return; }
  const kind = document.querySelector('input[name=kind]:checked').value;
  const minScore = document.getElementById('min_score').value;
  const excludeBanter = document.getElementById('exclude_banter').checked;
  const params = new URLSearchParams({ q: term, kind, limit: '40' });
  if (minScore !== '0') params.set('min_score', minScore);
  if (excludeBanter) params.set('exclude_banter', 'true');
  const r = await fetch(`/api/search?${params}`);
  const j = await r.json();
  let html = '';
  if (j.qa.length) {
    html += '<div class=group><h3>Q&A Pairs</h3>';
    for (const h of j.qa) {
      const ad = h.air_date ? ` (${h.air_date})` : '';
      const cn = h.caller_name ? ` — ${escapeHtml(h.caller_name)}` : '';
      const score = h.usefulness_score;
      const topic = h.topic_class;
      const banter = h.is_banter === 1;
      const badge = score != null
        ? `<span class="badge s${score}" title="usefulness ${score}/5">${score}</span>`
        : '';
      const topicTag = topic
        ? `<span class=topic>${escapeHtml(topic)}</span> `
        : '';
      const dim = (score != null && score <= 2) || banter ? ' dim' : '';
      html += `<div class="hit${dim}">
        <div class=meta>${badge}${topicTag}${h.year} · ${escapeHtml(h.title)}${ad}${cn} · @ ${fmtTime(h.question_start_sec)}</div>
        <div><b>Q:</b> ${h.q_snippet}</div>
        <div><b>A:</b> ${h.a_snippet}</div>
      </div>`;
    }
    html += '</div>';
  }
  if (j.segments.length) {
    html += '<div class=group><h3>Transcript Segments</h3>';
    for (const h of j.segments) {
      const ad = h.air_date ? ` (${h.air_date})` : '';
      html += `<div class=hit>
        <div class=meta>${h.year} · ${escapeHtml(h.title)}${ad} · @ ${fmtTime(h.start_sec)}</div>
        <div>${h.snippet}</div>
      </div>`;
    }
    html += '</div>';
  }
  if (!j.qa.length && !j.segments.length) {
    html = '<div class=empty>no hits</div>';
  }
  results.innerHTML = html;
}
</script>
</html>
"""
|
|
|
|
|
|
# Script entry point: serve the app directly with uvicorn.
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when this file is run as a script.
    import uvicorn

    # Listen on all interfaces, on the port taken from the PORT setting.
    uvicorn.run(app, host="0.0.0.0", port=PORT)
|