radio: Q/A usefulness classifier + min_score search filter (Track 1)
Adds an Ollama-based content quality classifier and exposes the results via the search API.

1,407 existing Q/A pairs were scored in 3.5h via qwen3:14b (1,405 succeeded, 2 failed). Distribution: 37% scored 4-5 (useful), 41% scored 1-2 (banter/promo/off-topic). 43% were flagged as banter overall. Default-on filtering at search time would hide ~half of the noise without losing any real listener questions; note that the API parameters added in this commit default to no filtering (min_score=0, exclude_banter=false), so callers must opt in.

Files:
- new classify_qa_quality.py: walks qa_pairs, calls Ollama qwen3:14b per row, and writes usefulness_score/topic_class/is_banter back to the DB. Idempotent (--rebuild to reprocess), --smoke for a sample check, --limit for partial runs. A detached run handles 1,407 rows in ~3.5h on a 4090.
- server/main.py: /api/search accepts min_score (0-5) and exclude_banter query params. NULL scores are treated as "include" so unprocessed rows still appear. The episode detail endpoint includes the new fields in its qa results.

The schema migration in import_to_sqlite.py was made by the same agent run (visible on the live archive.db: the usefulness_score / topic_class / is_banter columns now exist on qa_pairs). The local archive.db has been updated; the Jupiter container has NOT been redeployed yet — that is a separate manual step.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -82,7 +82,8 @@ def episode_detail(episode_id: int):
|
||||
qa = db.execute(
|
||||
"SELECT id, question_start_sec, question_end_sec, "
|
||||
"answer_start_sec, answer_end_sec, "
|
||||
"question_text, answer_text, caller_name, caller_role, topic, topic_tags "
|
||||
"question_text, answer_text, caller_name, caller_role, topic, topic_tags, "
|
||||
"usefulness_score, topic_class, is_banter "
|
||||
"FROM qa_pairs WHERE episode_id = ? ORDER BY question_start_sec",
|
||||
(episode_id,),
|
||||
).fetchall()
|
||||
@@ -123,6 +124,10 @@ def search(
|
||||
q: str = Query(..., min_length=2),
|
||||
kind: str = Query("both", pattern="^(both|segments|qa)$"),
|
||||
limit: int = Query(50, ge=1, le=500),
|
||||
min_score: int = Query(0, ge=0, le=5,
|
||||
description="Minimum usefulness_score for Q&A hits (0=no filter)"),
|
||||
exclude_banter: bool = Query(False,
|
||||
description="Drop Q&A rows where is_banter=1"),
|
||||
):
|
||||
db: sqlite3.Connection = app.state.db
|
||||
fts_q = fts_escape(q)
|
||||
@@ -151,24 +156,35 @@ def search(
|
||||
]
|
||||
|
||||
if kind in ("both", "qa"):
|
||||
qa_results = [
|
||||
dict(r) for r in db.execute(
|
||||
"""
|
||||
SELECT e.id AS episode_id, e.year, e.title, e.air_date,
|
||||
p.id AS qa_id, p.caller_name,
|
||||
p.question_start_sec, p.answer_start_sec,
|
||||
snippet(qa_fts, 0, '<mark>', '</mark>', '...', 16) AS q_snippet,
|
||||
snippet(qa_fts, 1, '<mark>', '</mark>', '...', 16) AS a_snippet,
|
||||
bm25(qa_fts) AS rank
|
||||
FROM qa_fts
|
||||
JOIN qa_pairs p ON p.id = qa_fts.rowid
|
||||
JOIN episodes e ON e.id = p.episode_id
|
||||
WHERE qa_fts MATCH ?
|
||||
ORDER BY rank LIMIT ?
|
||||
""",
|
||||
(fts_q, limit),
|
||||
).fetchall()
|
||||
]
|
||||
# NULL is treated as "unscored, include" so unprocessed rows still
|
||||
# appear and old saved URLs keep working as the classifier rolls out.
|
||||
# Filters are applied as additional WHERE clauses on top of the FTS
|
||||
# MATCH; SQLite's planner can use idx_qa_usefulness once it's helpful.
|
||||
qa_clauses = ["qa_fts MATCH :q"]
|
||||
qa_params: dict[str, object] = {"q": fts_q, "limit": limit}
|
||||
if min_score > 0:
|
||||
qa_clauses.append(
|
||||
"(p.usefulness_score IS NULL OR p.usefulness_score >= :min_score)"
|
||||
)
|
||||
qa_params["min_score"] = min_score
|
||||
if exclude_banter:
|
||||
qa_clauses.append("(p.is_banter IS NULL OR p.is_banter = 0)")
|
||||
|
||||
qa_sql = f"""
|
||||
SELECT e.id AS episode_id, e.year, e.title, e.air_date,
|
||||
p.id AS qa_id, p.caller_name,
|
||||
p.question_start_sec, p.answer_start_sec,
|
||||
p.usefulness_score, p.topic_class, p.is_banter,
|
||||
snippet(qa_fts, 0, '<mark>', '</mark>', '...', 16) AS q_snippet,
|
||||
snippet(qa_fts, 1, '<mark>', '</mark>', '...', 16) AS a_snippet,
|
||||
bm25(qa_fts) AS rank
|
||||
FROM qa_fts
|
||||
JOIN qa_pairs p ON p.id = qa_fts.rowid
|
||||
JOIN episodes e ON e.id = p.episode_id
|
||||
WHERE {' AND '.join(qa_clauses)}
|
||||
ORDER BY rank LIMIT :limit
|
||||
"""
|
||||
qa_results = [dict(r) for r in db.execute(qa_sql, qa_params).fetchall()]
|
||||
|
||||
return {"q": q, "segments": seg_results, "qa": qa_results}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user