"""Stage 3: Segment detection — multi-signal commercial/show content classifier.""" import json from dataclasses import dataclass from pathlib import Path from enum import Enum import numpy as np from rich.console import Console from rich.table import Table console = Console() class SegmentType(Enum): SHOW_CONTENT = "show_content" COMMERCIAL = "commercial" SHOW_ELEMENT = "show_element" # intro, outro, bumper SILENCE = "silence" UNKNOWN = "unknown" @dataclass class DetectedSegment: start: float end: float segment_type: SegmentType confidence: float label: str = "" # "Segment 1: The Week That Was", "Commercial Break 1", etc. signals: dict = None # Individual signal scores def __post_init__(self): if self.signals is None: self.signals = {} @property def duration(self) -> float: return self.end - self.start @dataclass class SegmentDetectionResult: segments: list[DetectedSegment] show_segments: list[DetectedSegment] commercial_segments: list[DetectedSegment] element_segments: list[DetectedSegment] total_show_time: float total_commercial_time: float def to_dict(self) -> dict: return { "total_show_time": self.total_show_time, "total_commercial_time": self.total_commercial_time, "segments": [ { "start": s.start, "end": s.end, "type": s.segment_type.value, "confidence": s.confidence, "label": s.label, "signals": s.signals, } for s in self.segments ], } def save(self, output_dir: Path): output_dir.mkdir(parents=True, exist_ok=True) with open(output_dir / "detection-report.json", "w") as f: json.dump(self.to_dict(), f, indent=2) def print_summary(self): table = Table(title="Segment Detection Results") table.add_column("Time", style="cyan") table.add_column("Duration", style="magenta") table.add_column("Type", style="green") table.add_column("Confidence", style="yellow") table.add_column("Label") for seg in self.segments: start = _format_time(seg.start) dur = f"{seg.duration:.0f}s" type_style = { SegmentType.SHOW_CONTENT: "[green]SHOW[/green]", SegmentType.COMMERCIAL: "[red]COMMERCIAL[/red]", SegmentType.SHOW_ELEMENT: "[blue]ELEMENT[/blue]", SegmentType.SILENCE: "[dim]SILENCE[/dim]", SegmentType.UNKNOWN: "[yellow]UNKNOWN[/yellow]", }.get(seg.segment_type, str(seg.segment_type)) table.add_row(start, dur, type_style, f"{seg.confidence:.2f}", seg.label) console.print(table) console.print(f"\nShow content: {self.total_show_time / 60:.1f} min") console.print(f"Commercials: {self.total_commercial_time / 60:.1f} min") def _format_time(seconds: float) -> str: m = int(seconds // 60) s = int(seconds % 60) return f"{m:02d}:{s:02d}" class SegmentDetector: """Multi-signal commercial/show content detector.""" def __init__(self, config): self.config = config self.weights = config.segment_detection.weights def detect(self, audio_path: Path, transcript=None, diarization=None, show_prep=None) -> SegmentDetectionResult: """Run all detection signals and combine scores.""" console.print(f"[bold]Detecting segments:[/bold] {audio_path.name}") # Load audio for analysis audio_data, sample_rate = self._load_audio(audio_path) duration = len(audio_data) / sample_rate # Step 1: Find candidate boundaries using silence detection boundaries = self._detect_silence_boundaries(audio_data, sample_rate) console.print(f"[dim]Found {len(boundaries)} silence boundaries[/dim]") # Step 2: Find hard break points from transcript (most reliable signal) transcript_breaks = [] if transcript: transcript_breaks = self._find_transcript_breaks(transcript) console.print(f"[dim]Found {len(transcript_breaks)} break cues in transcript[/dim]") # Step 3: Create segments using transcript breaks as primary boundaries, # with silence boundaries refining the exact cut points if transcript_breaks: candidates = self._create_segments_from_breaks( transcript_breaks, boundaries, audio_data, sample_rate, duration ) else: candidates = self._create_candidate_segments(boundaries, duration) # Step 4: Score each candidate with all available signals for candidate in candidates: scores = {} scores["fingerprint"] = self._score_fingerprint( audio_data, sample_rate, candidate ) if diarization: scores["speaker"] = self._score_speaker_identity( diarization, candidate ) else: scores["speaker"] = 0.5 scores["audio_chars"] = self._score_audio_characteristics( audio_data, sample_rate, candidate ) if transcript: scores["structural"] = self._score_structural( transcript, candidate ) else: scores["structural"] = 0.5 commercial_score = ( self.weights.fingerprint_match * scores.get("fingerprint", 0.5) + self.weights.speaker_identity * scores.get("speaker", 0.5) + self.weights.audio_characteristics * scores.get("audio_chars", 0.5) + self.weights.structural_heuristic * scores.get("structural", 0.5) ) candidate.signals = scores # If segment was already typed by transcript breaks, keep it if candidate.segment_type == SegmentType.UNKNOWN: candidate.confidence = commercial_score if commercial_score >= self.config.segment_detection.confidence_threshold: candidate.segment_type = SegmentType.COMMERCIAL else: candidate.segment_type = SegmentType.SHOW_CONTENT else: candidate.confidence = max(commercial_score, 0.80) # Step 5: Merge adjacent segments of same type merged = self._merge_adjacent(candidates) # Step 6: Apply duration constraints final = self._apply_constraints(merged) # Step 7: Label show segments using show prep if available if show_prep: self._label_from_prep(final, transcript, show_prep) # Build result show_segs = [s for s in final if s.segment_type == SegmentType.SHOW_CONTENT] comm_segs = [s for s in final if s.segment_type == SegmentType.COMMERCIAL] elem_segs = [s for s in final if s.segment_type == SegmentType.SHOW_ELEMENT] result = SegmentDetectionResult( segments=final, show_segments=show_segs, commercial_segments=comm_segs, element_segments=elem_segs, total_show_time=sum(s.duration for s in show_segs), total_commercial_time=sum(s.duration for s in comm_segs), ) result.print_summary() return result def _load_audio(self, audio_path: Path) -> tuple[np.ndarray, int]: """Load audio file as mono numpy array.""" import subprocess import io import struct # Use ffmpeg to decode to raw PCM result = subprocess.run( ["ffmpeg", "-i", str(audio_path), "-f", "s16le", "-ac", "1", "-ar", "16000", "-"], capture_output=True, timeout=300, ) audio = np.frombuffer(result.stdout, dtype=np.int16).astype(np.float32) / 32768.0 return audio, 16000 def _detect_silence_boundaries(self, audio: np.ndarray, sr: int, min_silence_ms: int = 500) -> list[float]: """Detect silence gaps in audio that likely indicate segment boundaries.""" frame_size = int(sr * 0.025) # 25ms frames hop_size = int(sr * 0.010) # 10ms hop threshold_db = self.config.segment_detection.silence_threshold_db threshold_amp = 10 ** (threshold_db / 20) min_silence_frames = int(min_silence_ms / 10) # Calculate frame energy energies = [] for i in range(0, len(audio) - frame_size, hop_size): frame = audio[i:i + frame_size] rms = np.sqrt(np.mean(frame ** 2)) energies.append(rms) # Find silence regions is_silent = [e < threshold_amp for e in energies] boundaries = [] silent_count = 0 for i, silent in enumerate(is_silent): if silent: silent_count += 1 else: if silent_count >= min_silence_frames: # Mark the midpoint of the silence as a boundary mid_frame = i - silent_count // 2 boundary_time = mid_frame * 0.010 boundaries.append(boundary_time) silent_count = 0 return boundaries def _find_transcript_breaks(self, transcript) -> list[dict]: """Find commercial break points from transcript content.""" break_cues = [] going_to_break = [ "take a quick break", "take a break", "go to commercial", "going to break", "let's go to break", "we'll be right back", "right back after", "news break coming up", "after the news", "be right back", "stay tuned", "don't go anywhere", ] coming_back = [ "welcome back", "we're back", "we are back", "back from the break", "back from break", "back on the", "back with you", ] for seg in transcript.segments: text = seg.text.lower().strip() for cue in going_to_break: if cue in text: break_cues.append({ "type": "break_start", "time": seg.end, "text": seg.text.strip(), "cue": cue, }) break for cue in coming_back: if cue in text: break_cues.append({ "type": "break_end", "time": seg.start, "text": seg.text.strip(), "cue": cue, }) break return break_cues def _create_segments_from_breaks(self, transcript_breaks: list[dict], silence_boundaries: list[float], audio: np.ndarray, sr: int, total_duration: float) -> list[DetectedSegment]: """Create segments using transcript break cues as primary boundaries. For each break_start, find the nearest silence boundary after it (exact cut point). For each break_end, find the nearest silence boundary before it. The gap between break_start and break_end = commercial break. """ segments = [] # Pair up break_start with the next break_end break_regions = [] i = 0 while i < len(transcript_breaks): cue = transcript_breaks[i] if cue["type"] == "break_start": # Find the matching break_end end_time = None for j in range(i + 1, len(transcript_breaks)): if transcript_breaks[j]["type"] == "break_end": end_time = transcript_breaks[j]["time"] i = j + 1 break if end_time is None: # No matching end — assume break lasts until a reasonable point # (5 minutes max, or until end of audio) end_time = min(cue["time"] + 300, total_duration) i += 1 # Snap to nearest silence boundaries for clean cuts start = self._nearest_silence(cue["time"], silence_boundaries, after=True) end = self._nearest_silence(end_time, silence_boundaries, after=False) if start and end and end > start: break_regions.append((start, end)) elif start: break_regions.append((start, end_time)) else: i += 1 if not break_regions: return self._create_candidate_segments(silence_boundaries, total_duration) # Build segments: show → commercial → show → commercial → ... prev_end = 0.0 for break_start, break_end in break_regions: # Show content before this break if break_start - prev_end > 1.0: segments.append(DetectedSegment( start=prev_end, end=break_start, segment_type=SegmentType.SHOW_CONTENT, confidence=0.85, label="", )) # Commercial break segments.append(DetectedSegment( start=break_start, end=break_end, segment_type=SegmentType.COMMERCIAL, confidence=0.85, label="", )) prev_end = break_end # Final show segment after last break if total_duration - prev_end > 1.0: segments.append(DetectedSegment( start=prev_end, end=total_duration, segment_type=SegmentType.SHOW_CONTENT, confidence=0.85, label="", )) return segments def _nearest_silence(self, time: float, boundaries: list[float], after: bool = True, max_distance: float = 10.0) -> float | None: """Find the nearest silence boundary to a given time.""" best = None best_dist = max_distance for b in boundaries: dist = abs(b - time) if dist > max_distance: continue if after and b >= time and dist < best_dist: best = b best_dist = dist elif not after and b <= time and dist < best_dist: best = b best_dist = dist return best def _create_candidate_segments(self, boundaries: list[float], total_duration: float) -> list[DetectedSegment]: """Create candidate segments from silence boundaries.""" candidates = [] prev = 0.0 for boundary in boundaries: if boundary - prev > 1.0: # Ignore segments < 1 second candidates.append(DetectedSegment( start=prev, end=boundary, segment_type=SegmentType.UNKNOWN, confidence=0.0, )) prev = boundary # Final segment if total_duration - prev > 1.0: candidates.append(DetectedSegment( start=prev, end=total_duration, segment_type=SegmentType.UNKNOWN, confidence=0.0, )) return candidates def _score_fingerprint(self, audio: np.ndarray, sr: int, segment: DetectedSegment) -> float: """Score based on audio fingerprint matching against element library. Returns 0.0 (no match / definitely show) to 1.0 (definite commercial boundary). """ # TODO: Implement fingerprint matching against element-library/fingerprints.db # For now, return neutral score return 0.5 def _score_speaker_identity(self, diarization, segment: DetectedSegment) -> float: """Score based on whether the host is speaking. Returns 0.0 (host definitely speaking = show content) to 1.0 (host definitely absent = likely commercial). """ host_time = 0.0 total_time = segment.duration for turn in diarization.turns: if turn.end < segment.start or turn.start > segment.end: continue # Calculate overlap overlap_start = max(turn.start, segment.start) overlap_end = min(turn.end, segment.end) overlap = max(0, overlap_end - overlap_start) if "host" in turn.speaker.lower(): host_time += overlap if total_time == 0: return 0.5 host_fraction = host_time / total_time # Invert: high host presence = low commercial score return 1.0 - host_fraction def _score_audio_characteristics(self, audio: np.ndarray, sr: int, segment: DetectedSegment) -> float: """Score based on audio production characteristics. Commercials tend to be louder, more compressed, different spectral profile. Returns 0.0 (matches show characteristics) to 1.0 (matches commercial characteristics). """ start_sample = int(segment.start * sr) end_sample = min(int(segment.end * sr), len(audio)) seg_audio = audio[start_sample:end_sample] if len(seg_audio) < sr: # Less than 1 second return 0.5 # RMS energy (commercials tend to be louder) rms = np.sqrt(np.mean(seg_audio ** 2)) # Dynamic range (commercials tend to be more compressed) frame_size = int(sr * 0.050) # 50ms frames frame_rms = [] for i in range(0, len(seg_audio) - frame_size, frame_size): frame = seg_audio[i:i + frame_size] frame_rms.append(np.sqrt(np.mean(frame ** 2))) if not frame_rms: return 0.5 dynamic_range = max(frame_rms) / (min(frame_rms) + 1e-8) # Simple heuristic scoring: # High RMS + low dynamic range = compressed commercial audio score = 0.5 if rms > 0.15: # Louder than typical speech score += 0.15 if dynamic_range < 5.0: # Very compressed score += 0.15 return min(1.0, max(0.0, score)) def _score_structural(self, transcript, segment: DetectedSegment) -> float: """Score based on transcript content structural cues. Returns 0.0 (show content cues found) to 1.0 (commercial cues found). """ text = transcript.text_at(segment.start, segment.end).lower() # Show content indicators show_phrases = [ "welcome back", "let's move on", "next up", "our next topic", "let's talk about", "as i mentioned", "the question is", "caller", "what do you think", "here's the thing", ] # Commercial/break indicators break_phrases = [ "we'll be right back", "stay tuned", "don't go anywhere", "after the break", "when we come back", ] show_hits = sum(1 for p in show_phrases if p in text) break_hits = sum(1 for p in break_phrases if p in text) if show_hits > 0 and break_hits == 0: return 0.2 # Likely show content if break_hits > 0: return 0.8 # Likely near a break return 0.5 # Neutral def _merge_adjacent(self, segments: list[DetectedSegment]) -> list[DetectedSegment]: """Merge adjacent and overlapping segments of the same type.""" if not segments: return [] # Sort by start time first segments.sort(key=lambda s: s.start) merged = [segments[0]] for seg in segments[1:]: prev = merged[-1] # Merge if same type AND (overlapping or within 2 seconds) if (prev.segment_type == seg.segment_type and seg.start <= prev.end + 2.0): prev.end = max(prev.end, seg.end) prev.confidence = (prev.confidence + seg.confidence) / 2 else: merged.append(seg) return merged def _apply_constraints(self, segments: list[DetectedSegment]) -> list[DetectedSegment]: """Apply duration constraints — short 'commercial' segments are likely misclassified.""" min_break = self.config.segment_detection.min_break_duration_s for seg in segments: if (seg.segment_type == SegmentType.COMMERCIAL and seg.duration < min_break): seg.segment_type = SegmentType.SHOW_CONTENT seg.label = "(reclassified: too short for commercial)" return segments def _label_from_prep(self, segments: list[DetectedSegment], transcript, show_prep: str): """Label show segments by matching transcript content to show prep topics.""" # TODO: Use Ollama to match transcript sections against show prep segment titles # For now, number them sequentially show_count = 0 comm_count = 0 for seg in segments: if seg.segment_type == SegmentType.SHOW_CONTENT: show_count += 1 seg.label = f"Show Segment {show_count}" elif seg.segment_type == SegmentType.COMMERCIAL: comm_count += 1 seg.label = f"Commercial Break {comm_count}"