From 87f5a9306a25d4f8f50df325c8d444f18191d294 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Sat, 21 Mar 2026 11:59:54 -0700 Subject: [PATCH] Audio processor: fix segment detection with transcript-driven breaks - Add transcript break phrase detection (going_to_break/coming_back cues) - Create segments from transcript breaks with silence boundary snapping - Fix segment dedup in merge_adjacent (handle overlapping segments) - Add CUDA 12 library path fix (gpu.py + venv activate hook) - Auto-load existing transcript in detect command - Tested on 2011-03-05 HR1: correctly identifies commercial break at 34:38 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../radio-show/audio-processor/src/cli.py | 29 ++- .../radio-show/audio-processor/src/gpu.py | 17 ++ .../audio-processor/src/segment_detector.py | 190 ++++++++++++++++-- 3 files changed, 215 insertions(+), 21 deletions(-) create mode 100644 projects/radio-show/audio-processor/src/gpu.py diff --git a/projects/radio-show/audio-processor/src/cli.py b/projects/radio-show/audio-processor/src/cli.py index 678c8d3..ca41eb9 100644 --- a/projects/radio-show/audio-processor/src/cli.py +++ b/projects/radio-show/audio-processor/src/cli.py @@ -1,5 +1,9 @@ """CLI entry point for the radio show audio processor.""" +# Must set CUDA paths before any torch/ctranslate2 imports +from .gpu import ensure_cuda_libs +ensure_cuda_libs() + import argparse import sys from pathlib import Path @@ -274,8 +278,31 @@ def _cmd_detect(args, config): if args.show_prep: show_prep = Path(args.show_prep).read_text() + # Load existing transcript if available + transcript = None + transcript_file = output_dir / "transcript.json" + if transcript_file.exists(): + from .transcriber import Transcript, TranscriptSegment, TranscriptWord + import json + console.print(f"[dim]Loading transcript from {transcript_file}[/dim]") + with open(transcript_file) as f: + data = json.load(f) + transcript = Transcript( + segments=[ + TranscriptSegment( + id=s["id"], text=s["text"], + start=s["start"], end=s["end"], + words=[TranscriptWord(**w) for w in s.get("words", [])], + ) + for s in data["segments"] + ], + language=data["language"], + language_probability=data["language_probability"], + duration=data["duration"], + ) + detector = SegmentDetector(config) - result = detector.detect(audio_path, show_prep=show_prep) + result = detector.detect(audio_path, transcript=transcript, show_prep=show_prep) result.save(output_dir) diff --git a/projects/radio-show/audio-processor/src/gpu.py b/projects/radio-show/audio-processor/src/gpu.py new file mode 100644 index 0000000..c1f0e95 --- /dev/null +++ b/projects/radio-show/audio-processor/src/gpu.py @@ -0,0 +1,17 @@ +"""GPU and CUDA library setup for the audio processor.""" + +import os +from pathlib import Path + + +def ensure_cuda_libs(): + """Ensure CUDA 12 libraries are on LD_LIBRARY_PATH. + + The system has CUDA 13.2 but faster-whisper's ctranslate2 needs CUDA 12. + Ollama ships CUDA 12 libs at /usr/local/lib/ollama/cuda_v12/. + """ + cuda12_path = "/usr/local/lib/ollama/cuda_v12" + if Path(cuda12_path).exists(): + current = os.environ.get("LD_LIBRARY_PATH", "") + if cuda12_path not in current: + os.environ["LD_LIBRARY_PATH"] = f"{cuda12_path}:{current}" if current else cuda12_path diff --git a/projects/radio-show/audio-processor/src/segment_detector.py b/projects/radio-show/audio-processor/src/segment_detector.py index 5f57e22..120fc15 100644 --- a/projects/radio-show/audio-processor/src/segment_detector.py +++ b/projects/radio-show/audio-processor/src/segment_detector.py @@ -121,32 +121,40 @@ class SegmentDetector: boundaries = self._detect_silence_boundaries(audio_data, sample_rate) console.print(f"[dim]Found {len(boundaries)} silence boundaries[/dim]") - # Step 2: Create candidate segments between boundaries - candidates = self._create_candidate_segments(boundaries, duration) + # Step 2: Find hard break points from transcript (most reliable signal) + transcript_breaks = [] + if transcript: + transcript_breaks = self._find_transcript_breaks(transcript) + console.print(f"[dim]Found {len(transcript_breaks)} break cues in transcript[/dim]") - # Step 3: Score each candidate with all available signals + # Step 3: Create segments using transcript breaks as primary boundaries, + # with silence boundaries refining the exact cut points + if transcript_breaks: + candidates = self._create_segments_from_breaks( + transcript_breaks, boundaries, audio_data, sample_rate, duration + ) + else: + candidates = self._create_candidate_segments(boundaries, duration) + + # Step 4: Score each candidate with all available signals for candidate in candidates: scores = {} - # Signal 1: Fingerprint matching (if library available) scores["fingerprint"] = self._score_fingerprint( audio_data, sample_rate, candidate ) - # Signal 2: Speaker identity if diarization: scores["speaker"] = self._score_speaker_identity( diarization, candidate ) else: - scores["speaker"] = 0.5 # neutral + scores["speaker"] = 0.5 - # Signal 3: Audio characteristics scores["audio_chars"] = self._score_audio_characteristics( audio_data, sample_rate, candidate ) - # Signal 4: Structural heuristics if transcript: scores["structural"] = self._score_structural( transcript, candidate @@ -154,7 +162,6 @@ class SegmentDetector: else: scores["structural"] = 0.5 - # Combined weighted score (higher = more likely commercial) commercial_score = ( self.weights.fingerprint_match * scores.get("fingerprint", 0.5) + self.weights.speaker_identity * scores.get("speaker", 0.5) + @@ -163,20 +170,24 @@ class SegmentDetector: ) candidate.signals = scores - candidate.confidence = commercial_score - if commercial_score >= self.config.segment_detection.confidence_threshold: - candidate.segment_type = SegmentType.COMMERCIAL + # If segment was already typed by transcript breaks, keep it + if candidate.segment_type == SegmentType.UNKNOWN: + candidate.confidence = commercial_score + if commercial_score >= self.config.segment_detection.confidence_threshold: + candidate.segment_type = SegmentType.COMMERCIAL + else: + candidate.segment_type = SegmentType.SHOW_CONTENT else: - candidate.segment_type = SegmentType.SHOW_CONTENT + candidate.confidence = max(commercial_score, 0.80) - # Step 4: Merge adjacent segments of same type + # Step 5: Merge adjacent segments of same type merged = self._merge_adjacent(candidates) - # Step 5: Apply duration constraints + # Step 6: Apply duration constraints final = self._apply_constraints(merged) - # Step 6: Label show segments using show prep if available + # Step 7: Label show segments using show prep if available if show_prep: self._label_from_prep(final, transcript, show_prep) @@ -246,6 +257,142 @@ class SegmentDetector: return boundaries + def _find_transcript_breaks(self, transcript) -> list[dict]: + """Find commercial break points from transcript content.""" + break_cues = [] + going_to_break = [ + "take a quick break", "take a break", "go to commercial", + "going to break", "let's go to break", "we'll be right back", + "right back after", "news break coming up", "after the news", + "be right back", "stay tuned", "don't go anywhere", + ] + coming_back = [ + "welcome back", "we're back", "we are back", "back from the break", + "back from break", "back on the", "back with you", + ] + + for seg in transcript.segments: + text = seg.text.lower().strip() + for cue in going_to_break: + if cue in text: + break_cues.append({ + "type": "break_start", + "time": seg.end, + "text": seg.text.strip(), + "cue": cue, + }) + break + for cue in coming_back: + if cue in text: + break_cues.append({ + "type": "break_end", + "time": seg.start, + "text": seg.text.strip(), + "cue": cue, + }) + break + + return break_cues + + def _create_segments_from_breaks(self, transcript_breaks: list[dict], + silence_boundaries: list[float], + audio: np.ndarray, sr: int, + total_duration: float) -> list[DetectedSegment]: + """Create segments using transcript break cues as primary boundaries. + + For each break_start, find the nearest silence boundary after it (exact cut point). + For each break_end, find the nearest silence boundary before it. + The gap between break_start and break_end = commercial break. + """ + segments = [] + + # Pair up break_start with the next break_end + break_regions = [] + i = 0 + while i < len(transcript_breaks): + cue = transcript_breaks[i] + if cue["type"] == "break_start": + # Find the matching break_end + end_time = None + for j in range(i + 1, len(transcript_breaks)): + if transcript_breaks[j]["type"] == "break_end": + end_time = transcript_breaks[j]["time"] + i = j + 1 + break + if end_time is None: + # No matching end — assume break lasts until a reasonable point + # (5 minutes max, or until end of audio) + end_time = min(cue["time"] + 300, total_duration) + i += 1 + + # Snap to nearest silence boundaries for clean cuts + start = self._nearest_silence(cue["time"], silence_boundaries, after=True) + end = self._nearest_silence(end_time, silence_boundaries, after=False) + + if start and end and end > start: + break_regions.append((start, end)) + elif start: + break_regions.append((start, end_time)) + else: + i += 1 + + if not break_regions: + return self._create_candidate_segments(silence_boundaries, total_duration) + + # Build segments: show → commercial → show → commercial → ... + prev_end = 0.0 + for break_start, break_end in break_regions: + # Show content before this break + if break_start - prev_end > 1.0: + segments.append(DetectedSegment( + start=prev_end, + end=break_start, + segment_type=SegmentType.SHOW_CONTENT, + confidence=0.85, + label="", + )) + + # Commercial break + segments.append(DetectedSegment( + start=break_start, + end=break_end, + segment_type=SegmentType.COMMERCIAL, + confidence=0.85, + label="", + )) + prev_end = break_end + + # Final show segment after last break + if total_duration - prev_end > 1.0: + segments.append(DetectedSegment( + start=prev_end, + end=total_duration, + segment_type=SegmentType.SHOW_CONTENT, + confidence=0.85, + label="", + )) + + return segments + + def _nearest_silence(self, time: float, boundaries: list[float], + after: bool = True, max_distance: float = 10.0) -> float | None: + """Find the nearest silence boundary to a given time.""" + best = None + best_dist = max_distance + + for b in boundaries: + dist = abs(b - time) + if dist > max_distance: + continue + if after and b >= time and dist < best_dist: + best = b + best_dist = dist + elif not after and b <= time and dist < best_dist: + best = b + best_dist = dist + + return best + def _create_candidate_segments(self, boundaries: list[float], total_duration: float) -> list[DetectedSegment]: """Create candidate segments from silence boundaries.""" @@ -374,17 +521,20 @@ class SegmentDetector: return 0.5 # Neutral def _merge_adjacent(self, segments: list[DetectedSegment]) -> list[DetectedSegment]: - """Merge adjacent segments of the same type.""" + """Merge adjacent and overlapping segments of the same type.""" if not segments: return [] + # Sort by start time first + segments.sort(key=lambda s: s.start) + merged = [segments[0]] for seg in segments[1:]: prev = merged[-1] + # Merge if same type AND (overlapping or within 2 seconds) if (prev.segment_type == seg.segment_type and - abs(seg.start - prev.end) < 2.0): # Within 2 seconds - # Extend previous segment - prev.end = seg.end + seg.start <= prev.end + 2.0): + prev.end = max(prev.end, seg.end) prev.confidence = (prev.confidence + seg.confidence) / 2 else: merged.append(seg)