Add radio show audio processor and post-show workflow

- Audio processor CLI tool with 6-stage pipeline: transcribe (faster-whisper GPU),
  diarize (pyannote), detect segments (multi-signal classifier), remove commercials,
  split segments, analyze content (Ollama)
- Post-show workflow doc for episode posts, forum threads, deep-dive blog posts
- Training plan for using 579-episode archive for voice profiles and commercial detection
- Successful test: 45min episode transcribed in 2:37 on RTX 5070 Ti
- Sample transcript output from S7E30 (March 2015)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-21 11:51:59 -07:00
parent a8c8c6b7b6
commit a1e0442d8b
17 changed files with 58344 additions and 0 deletions

View File

@@ -0,0 +1,419 @@
"""Stage 3: Segment detection — multi-signal commercial/show content classifier."""
import json
from dataclasses import dataclass
from pathlib import Path
from enum import Enum
import numpy as np
from rich.console import Console
from rich.table import Table
# Module-wide rich console; the summary table and progress lines in this file
# are all printed through it.
console = Console()
class SegmentType(Enum):
    """Category a stretch of audio can be assigned by the detector.

    The string values are what ends up in the serialized detection report.
    """

    SHOW_CONTENT = "show_content"
    COMMERCIAL = "commercial"
    SHOW_ELEMENT = "show_element"  # intro, outro, bumper
    SILENCE = "silence"
    UNKNOWN = "unknown"
@dataclass
class DetectedSegment:
    """A time span of the recording together with its classification.

    ``confidence`` stores the score the detector assigned to the span and
    ``signals`` the individual per-signal scores that went into it.
    """

    start: float
    end: float
    segment_type: SegmentType
    confidence: float
    label: str = ""  # "Segment 1: The Week That Was", "Commercial Break 1", etc.
    signals: dict = None  # Individual signal scores; None is swapped for {}

    def __post_init__(self):
        """Give the instance its own empty ``signals`` dict when none was passed."""
        self.signals = {} if self.signals is None else self.signals

    @property
    def duration(self) -> float:
        """Length of the span, i.e. ``end - start``."""
        span = self.end - self.start
        return span
@dataclass
class SegmentDetectionResult:
    """Outcome of one detection run: all segments plus per-type views and totals."""

    segments: list[DetectedSegment]
    show_segments: list[DetectedSegment]
    commercial_segments: list[DetectedSegment]
    element_segments: list[DetectedSegment]
    total_show_time: float
    total_commercial_time: float

    def to_dict(self) -> dict:
        """Return a plain-dict form of the result.

        Only the two totals and the full ``segments`` list are included; the
        per-type lists are not serialized separately.
        """
        serialized = []
        for seg in self.segments:
            serialized.append({
                "start": seg.start,
                "end": seg.end,
                "type": seg.segment_type.value,
                "confidence": seg.confidence,
                "label": seg.label,
                "signals": seg.signals,
            })
        return {
            "total_show_time": self.total_show_time,
            "total_commercial_time": self.total_commercial_time,
            "segments": serialized,
        }

    def save(self, output_dir: Path):
        """Write ``detection-report.json`` into ``output_dir``, creating it if needed."""
        output_dir.mkdir(parents=True, exist_ok=True)
        report_path = output_dir / "detection-report.json"
        with report_path.open("w") as handle:
            handle.write(json.dumps(self.to_dict(), indent=2))

    def print_summary(self):
        """Print one table row per segment, followed by the two totals in minutes."""
        # Rich markup for each segment type; anything else falls back to str().
        type_markup = {
            SegmentType.SHOW_CONTENT: "[green]SHOW[/green]",
            SegmentType.COMMERCIAL: "[red]COMMERCIAL[/red]",
            SegmentType.SHOW_ELEMENT: "[blue]ELEMENT[/blue]",
            SegmentType.SILENCE: "[dim]SILENCE[/dim]",
            SegmentType.UNKNOWN: "[yellow]UNKNOWN[/yellow]",
        }

        table = Table(title="Segment Detection Results")
        for heading, colour in (
            ("Time", "cyan"),
            ("Duration", "magenta"),
            ("Type", "green"),
            ("Confidence", "yellow"),
        ):
            table.add_column(heading, style=colour)
        table.add_column("Label")

        for seg in self.segments:
            table.add_row(
                _format_time(seg.start),
                f"{seg.duration:.0f}s",
                type_markup.get(seg.segment_type, str(seg.segment_type)),
                f"{seg.confidence:.2f}",
                seg.label,
            )

        console.print(table)
        console.print(f"\nShow content: {self.total_show_time / 60:.1f} min")
        console.print(f"Commercials: {self.total_commercial_time / 60:.1f} min")
def _format_time(seconds: float) -> str:
m = int(seconds // 60)
s = int(seconds % 60)
return f"{m:02d}:{s:02d}"
class SegmentDetector:
"""Multi-signal commercial/show content detector."""
def __init__(self, config):
self.config = config
self.weights = config.segment_detection.weights
def detect(self, audio_path: Path, transcript=None, diarization=None,
           show_prep=None) -> SegmentDetectionResult:
    """Classify an audio file into show and commercial segments.

    The file is decoded, cut at silence gaps, and each resulting candidate
    is scored by four signals whose weighted sum decides its type. Adjacent
    candidates of one type are then merged, duration constraints applied,
    and (when ``show_prep`` is given) labels assigned. The summary table is
    printed before returning.

    Args:
        audio_path: File to analyse.
        transcript: Optional transcript; without it the structural signal
            is the neutral 0.5.
        diarization: Optional diarization; without it the speaker signal
            is the neutral 0.5.
        show_prep: Optional show prep; labelling only runs when it is truthy.

    Returns:
        The detection result with per-type lists and duration totals.
    """
    console.print(f"[bold]Detecting segments:[/bold] {audio_path.name}")

    # Decode once; every audio-based signal works on these samples.
    samples, rate = self._load_audio(audio_path)
    total_seconds = len(samples) / rate

    # Silence gaps provide the candidate cut points.
    cut_points = self._detect_silence_boundaries(samples, rate)
    console.print(f"[dim]Found {len(cut_points)} silence boundaries[/dim]")

    candidates = self._create_candidate_segments(cut_points, total_seconds)

    for candidate in candidates:
        # Signals without their input fall back to the neutral 0.5.
        scores = {
            # Fingerprint matching (if library available)
            "fingerprint": self._score_fingerprint(samples, rate, candidate),
            # Speaker identity
            "speaker": (
                self._score_speaker_identity(diarization, candidate)
                if diarization else 0.5
            ),
            # Audio characteristics
            "audio_chars": self._score_audio_characteristics(
                samples, rate, candidate
            ),
            # Structural heuristics
            "structural": (
                self._score_structural(transcript, candidate)
                if transcript else 0.5
            ),
        }

        # Weighted combination (higher = more likely commercial).
        commercial_score = (
            self.weights.fingerprint_match * scores["fingerprint"] +
            self.weights.speaker_identity * scores["speaker"] +
            self.weights.audio_characteristics * scores["audio_chars"] +
            self.weights.structural_heuristic * scores["structural"]
        )

        candidate.signals = scores
        candidate.confidence = commercial_score
        reaches_threshold = (
            commercial_score >= self.config.segment_detection.confidence_threshold
        )
        candidate.segment_type = (
            SegmentType.COMMERCIAL if reaches_threshold
            else SegmentType.SHOW_CONTENT
        )

    # Merge neighbours of one type, then enforce duration constraints.
    final = self._apply_constraints(self._merge_adjacent(candidates))

    if show_prep:
        self._label_from_prep(final, transcript, show_prep)

    def of_type(kind):
        return [seg for seg in final if seg.segment_type == kind]

    show_segs = of_type(SegmentType.SHOW_CONTENT)
    comm_segs = of_type(SegmentType.COMMERCIAL)
    elem_segs = of_type(SegmentType.SHOW_ELEMENT)

    result = SegmentDetectionResult(
        segments=final,
        show_segments=show_segs,
        commercial_segments=comm_segs,
        element_segments=elem_segs,
        total_show_time=sum(seg.duration for seg in show_segs),
        total_commercial_time=sum(seg.duration for seg in comm_segs),
    )
    result.print_summary()
    return result
def _load_audio(self, audio_path: Path) -> tuple[np.ndarray, int]:
    """Decode an audio file to mono 16 kHz float32 samples using ffmpeg.

    ffmpeg converts the input to raw signed 16-bit little-endian PCM on
    stdout, which is then scaled to the range [-1.0, 1.0).

    Args:
        audio_path: File to decode.

    Returns:
        ``(samples, sample_rate)`` where ``sample_rate`` is always 16000.

    Raises:
        RuntimeError: ffmpeg exited with a non-zero status. Without this
            check a failed decode would come back as empty audio and
            produce an empty detection result.
        FileNotFoundError: The ``ffmpeg`` executable could not be started.
        subprocess.TimeoutExpired: Decoding took longer than 300 seconds.
    """
    import subprocess

    sample_rate = 16000
    command = [
        "ffmpeg",
        "-nostdin",            # never let ffmpeg read the caller's stdin
        "-loglevel", "error",  # keep stderr limited to real error messages
        "-i", str(audio_path),
        "-f", "s16le", "-ac", "1", "-ar", str(sample_rate),
        "-",
    ]
    result = subprocess.run(command, capture_output=True, timeout=300)
    if result.returncode != 0:
        detail = result.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"ffmpeg failed to decode {audio_path} "
            f"(exit code {result.returncode}): {detail}"
        )

    raw = result.stdout
    # "<i2" pins little-endian to match s16le on any host; ``count`` drops
    # a dangling odd byte instead of letting frombuffer raise on it.
    pcm = np.frombuffer(raw, dtype=np.dtype("<i2"), count=len(raw) // 2)
    audio = pcm.astype(np.float32) / 32768.0
    return audio, sample_rate
def _detect_silence_boundaries(self, audio: np.ndarray, sr: int,
min_silence_ms: int = 500) -> list[float]:
"""Detect silence gaps in audio that likely indicate segment boundaries."""
frame_size = int(sr * 0.025) # 25ms frames
hop_size = int(sr * 0.010) # 10ms hop
threshold_db = self.config.segment_detection.silence_threshold_db
threshold_amp = 10 ** (threshold_db / 20)
min_silence_frames = int(min_silence_ms / 10)
# Calculate frame energy
energies = []
for i in range(0, len(audio) - frame_size, hop_size):
frame = audio[i:i + frame_size]
rms = np.sqrt(np.mean(frame ** 2))
energies.append(rms)
# Find silence regions
is_silent = [e < threshold_amp for e in energies]
boundaries = []
silent_count = 0
for i, silent in enumerate(is_silent):
if silent:
silent_count += 1
else:
if silent_count >= min_silence_frames:
# Mark the midpoint of the silence as a boundary
mid_frame = i - silent_count // 2
boundary_time = mid_frame * 0.010
boundaries.append(boundary_time)
silent_count = 0
return boundaries
def _create_candidate_segments(self, boundaries: list[float],
                               total_duration: float) -> list[DetectedSegment]:
    """Turn silence boundaries into unclassified candidate segments.

    The timeline from 0.0 to ``total_duration`` is cut at every boundary.
    Pieces of one second or less are ignored, so small gaps can remain
    between consecutive candidates.

    Args:
        boundaries: Boundary times in ascending order.
        total_duration: Length of the audio, in the same unit.

    Returns:
        Candidates of type ``UNKNOWN`` with confidence 0.0, in time order.
    """
    # Consecutive edges delimit the pieces; the last one runs to the end.
    edges = [0.0, *boundaries, total_duration]
    return [
        DetectedSegment(
            start=piece_start,
            end=piece_end,
            segment_type=SegmentType.UNKNOWN,
            confidence=0.0,
        )
        for piece_start, piece_end in zip(edges, edges[1:])
        if piece_end - piece_start > 1.0  # Ignore segments < 1 second
    ]
def _score_fingerprint(self, audio: np.ndarray, sr: int,
segment: DetectedSegment) -> float:
"""Score based on audio fingerprint matching against element library.
Returns 0.0 (no match / definitely show) to 1.0 (definite commercial boundary).
"""
# TODO: Implement fingerprint matching against element-library/fingerprints.db
# For now, return neutral score
return 0.5
def _score_speaker_identity(self, diarization, segment: DetectedSegment) -> float:
"""Score based on whether the host is speaking.
Returns 0.0 (host definitely speaking = show content)
to 1.0 (host definitely absent = likely commercial).
"""
host_time = 0.0
total_time = segment.duration
for turn in diarization.turns:
if turn.end < segment.start or turn.start > segment.end:
continue
# Calculate overlap
overlap_start = max(turn.start, segment.start)
overlap_end = min(turn.end, segment.end)
overlap = max(0, overlap_end - overlap_start)
if "host" in turn.speaker.lower():
host_time += overlap
if total_time == 0:
return 0.5
host_fraction = host_time / total_time
# Invert: high host presence = low commercial score
return 1.0 - host_fraction
def _score_audio_characteristics(self, audio: np.ndarray, sr: int,
segment: DetectedSegment) -> float:
"""Score based on audio production characteristics.
Commercials tend to be louder, more compressed, different spectral profile.
Returns 0.0 (matches show characteristics) to 1.0 (matches commercial characteristics).
"""
start_sample = int(segment.start * sr)
end_sample = min(int(segment.end * sr), len(audio))
seg_audio = audio[start_sample:end_sample]
if len(seg_audio) < sr: # Less than 1 second
return 0.5
# RMS energy (commercials tend to be louder)
rms = np.sqrt(np.mean(seg_audio ** 2))
# Dynamic range (commercials tend to be more compressed)
frame_size = int(sr * 0.050) # 50ms frames
frame_rms = []
for i in range(0, len(seg_audio) - frame_size, frame_size):
frame = seg_audio[i:i + frame_size]
frame_rms.append(np.sqrt(np.mean(frame ** 2)))
if not frame_rms:
return 0.5
dynamic_range = max(frame_rms) / (min(frame_rms) + 1e-8)
# Simple heuristic scoring:
# High RMS + low dynamic range = compressed commercial audio
score = 0.5
if rms > 0.15: # Louder than typical speech
score += 0.15
if dynamic_range < 5.0: # Very compressed
score += 0.15
return min(1.0, max(0.0, score))
def _score_structural(self, transcript, segment: DetectedSegment) -> float:
"""Score based on transcript content structural cues.
Returns 0.0 (show content cues found) to 1.0 (commercial cues found).
"""
text = transcript.text_at(segment.start, segment.end).lower()
# Show content indicators
show_phrases = [
"welcome back", "let's move on", "next up", "our next topic",
"let's talk about", "as i mentioned", "the question is",
"caller", "what do you think", "here's the thing",
]
# Commercial/break indicators
break_phrases = [
"we'll be right back", "stay tuned", "don't go anywhere",
"after the break", "when we come back",
]
show_hits = sum(1 for p in show_phrases if p in text)
break_hits = sum(1 for p in break_phrases if p in text)
if show_hits > 0 and break_hits == 0:
return 0.2 # Likely show content
if break_hits > 0:
return 0.8 # Likely near a break
return 0.5 # Neutral
def _merge_adjacent(self, segments: list[DetectedSegment]) -> list[DetectedSegment]:
"""Merge adjacent segments of the same type."""
if not segments:
return []
merged = [segments[0]]
for seg in segments[1:]:
prev = merged[-1]
if (prev.segment_type == seg.segment_type and
abs(seg.start - prev.end) < 2.0): # Within 2 seconds
# Extend previous segment
prev.end = seg.end
prev.confidence = (prev.confidence + seg.confidence) / 2
else:
merged.append(seg)
return merged
def _apply_constraints(self, segments: list[DetectedSegment]) -> list[DetectedSegment]:
    """Reclassify commercial segments that are too short to be a real break.

    A commercial shorter than ``config.segment_detection.min_break_duration_s``
    is treated as a misclassification: its type becomes ``SHOW_CONTENT``
    and its label records the reason. Segments are modified in place and
    the same list is returned.
    """
    shortest_break = self.config.segment_detection.min_break_duration_s
    for seg in segments:
        if seg.segment_type != SegmentType.COMMERCIAL:
            continue
        if seg.duration < shortest_break:
            seg.segment_type = SegmentType.SHOW_CONTENT
            seg.label = "(reclassified: too short for commercial)"
    return segments
def _label_from_prep(self, segments: list[DetectedSegment],
                     transcript, show_prep: str):
    """Label show and commercial segments in place.

    Matching against show prep topics is not implemented yet, so
    ``transcript`` and ``show_prep`` are currently unused. Show segments
    and commercial breaks are simply numbered, each with its own counter
    starting at 1. Any label already on those segments is overwritten;
    segments of other types are left untouched.
    """
    # TODO: Use Ollama to match transcript sections against show prep segment titles
    label_prefixes = {
        SegmentType.SHOW_CONTENT: "Show Segment",
        SegmentType.COMMERCIAL: "Commercial Break",
    }
    tallies = dict.fromkeys(label_prefixes, 0)
    for seg in segments:
        kind = seg.segment_type
        if kind not in label_prefixes:
            continue
        tallies[kind] += 1
        seg.label = f"{label_prefixes[kind]} {tallies[kind]}"