From 87f5a9306a25d4f8f50df325c8d444f18191d294 Mon Sep 17 00:00:00 2001
From: Mike Swanson <mike@azcomputerguru.com>
Date: Sat, 21 Mar 2026 11:59:54 -0700
Subject: [PATCH] Audio processor: fix segment detection with transcript-driven
 breaks

- Add transcript break phrase detection (going_to_break/coming_back cues)
- Create segments from transcript breaks with silence boundary snapping
- Fix segment dedup in merge_adjacent (handle overlapping segments)
- Add CUDA 12 library path fix (gpu.py + venv activate hook)
- Auto-load existing transcript in detect command
- Tested on 2011-03-05 HR1: correctly identifies commercial break at 34:38

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../radio-show/audio-processor/src/cli.py     |  29 ++-
 .../radio-show/audio-processor/src/gpu.py     |  17 ++
 .../audio-processor/src/segment_detector.py   | 190 ++++++++++++++++--
 3 files changed, 215 insertions(+), 21 deletions(-)
 create mode 100644 projects/radio-show/audio-processor/src/gpu.py

diff --git a/projects/radio-show/audio-processor/src/cli.py b/projects/radio-show/audio-processor/src/cli.py
index 678c8d3..ca41eb9 100644
--- a/projects/radio-show/audio-processor/src/cli.py
+++ b/projects/radio-show/audio-processor/src/cli.py
@@ -1,5 +1,9 @@
 """CLI entry point for the radio show audio processor."""
 
+# Must set CUDA paths before any torch/ctranslate2 imports
+from .gpu import ensure_cuda_libs
+ensure_cuda_libs()
+
 import argparse
 import sys
 from pathlib import Path
@@ -274,8 +278,31 @@ def _cmd_detect(args, config):
     if args.show_prep:
         show_prep = Path(args.show_prep).read_text()
 
+    # Load existing transcript if available
+    transcript = None
+    transcript_file = output_dir / "transcript.json"
+    if transcript_file.exists():
+        from .transcriber import Transcript, TranscriptSegment, TranscriptWord
+        import json
+        console.print(f"[dim]Loading transcript from {transcript_file}[/dim]")
+        with open(transcript_file) as f:
+            data = json.load(f)
+        transcript = Transcript(
+            segments=[
+                TranscriptSegment(
+                    id=s["id"], text=s["text"],
+                    start=s["start"], end=s["end"],
+                    words=[TranscriptWord(**w) for w in s.get("words", [])],
+                )
+                for s in data["segments"]
+            ],
+            language=data["language"],
+            language_probability=data["language_probability"],
+            duration=data["duration"],
+        )
+
     detector = SegmentDetector(config)
-    result = detector.detect(audio_path, show_prep=show_prep)
+    result = detector.detect(audio_path, transcript=transcript, show_prep=show_prep)
     result.save(output_dir)
 
 
diff --git a/projects/radio-show/audio-processor/src/gpu.py b/projects/radio-show/audio-processor/src/gpu.py
new file mode 100644
index 0000000..c1f0e95
--- /dev/null
+++ b/projects/radio-show/audio-processor/src/gpu.py
@@ -0,0 +1,17 @@
+"""GPU and CUDA library setup for the audio processor."""
+
+import os
+from pathlib import Path
+
+
+def ensure_cuda_libs():
+    """Prepend Ollama's CUDA 12 library directory to LD_LIBRARY_PATH, if present.
+
+    The system has CUDA 13.2 but faster-whisper's ctranslate2 needs CUDA 12.
+    NOTE(review): the dynamic loader reads LD_LIBRARY_PATH at process start, so
+    this may only affect child processes -- confirm the in-process load works.
+    """
+    cuda12_path = "/usr/local/lib/ollama/cuda_v12"
+    current = os.environ.get("LD_LIBRARY_PATH", "")
+    if Path(cuda12_path).is_dir() and cuda12_path not in current.split(os.pathsep):
+        os.environ["LD_LIBRARY_PATH"] = f"{cuda12_path}{os.pathsep}{current}" if current else cuda12_path
diff --git a/projects/radio-show/audio-processor/src/segment_detector.py b/projects/radio-show/audio-processor/src/segment_detector.py
index 5f57e22..120fc15 100644
--- a/projects/radio-show/audio-processor/src/segment_detector.py
+++ b/projects/radio-show/audio-processor/src/segment_detector.py
@@ -121,32 +121,40 @@ class SegmentDetector:
         boundaries = self._detect_silence_boundaries(audio_data, sample_rate)
         console.print(f"[dim]Found {len(boundaries)} silence boundaries[/dim]")
 
-        # Step 2: Create candidate segments between boundaries
-        candidates = self._create_candidate_segments(boundaries, duration)
+        # Step 2: Find hard break points from transcript (most reliable signal)
+        transcript_breaks = []
+        if transcript:
+            transcript_breaks = self._find_transcript_breaks(transcript)
+            console.print(f"[dim]Found {len(transcript_breaks)} break cues in transcript[/dim]")
 
-        # Step 3: Score each candidate with all available signals
+        # Step 3: Create segments using transcript breaks as primary boundaries,
+        # with silence boundaries refining the exact cut points
+        if transcript_breaks:
+            candidates = self._create_segments_from_breaks(
+                transcript_breaks, boundaries, audio_data, sample_rate, duration
+            )
+        else:
+            candidates = self._create_candidate_segments(boundaries, duration)
+
+        # Step 4: Score each candidate with all available signals
         for candidate in candidates:
             scores = {}
 
-            # Signal 1: Fingerprint matching (if library available)
             scores["fingerprint"] = self._score_fingerprint(
                 audio_data, sample_rate, candidate
             )
 
-            # Signal 2: Speaker identity
             if diarization:
                 scores["speaker"] = self._score_speaker_identity(
                     diarization, candidate
                 )
             else:
-                scores["speaker"] = 0.5  # neutral
+                scores["speaker"] = 0.5
 
-            # Signal 3: Audio characteristics
             scores["audio_chars"] = self._score_audio_characteristics(
                 audio_data, sample_rate, candidate
             )
 
-            # Signal 4: Structural heuristics
             if transcript:
                 scores["structural"] = self._score_structural(
                     transcript, candidate
@@ -154,7 +162,6 @@ class SegmentDetector:
             else:
                 scores["structural"] = 0.5
 
-            # Combined weighted score (higher = more likely commercial)
             commercial_score = (
                 self.weights.fingerprint_match * scores.get("fingerprint", 0.5) +
                 self.weights.speaker_identity * scores.get("speaker", 0.5) +
@@ -163,20 +170,24 @@ class SegmentDetector:
             )
 
             candidate.signals = scores
-            candidate.confidence = commercial_score
 
-            if commercial_score >= self.config.segment_detection.confidence_threshold:
-                candidate.segment_type = SegmentType.COMMERCIAL
+            # If segment was already typed by transcript breaks, keep it
+            if candidate.segment_type == SegmentType.UNKNOWN:
+                candidate.confidence = commercial_score
+                if commercial_score >= self.config.segment_detection.confidence_threshold:
+                    candidate.segment_type = SegmentType.COMMERCIAL
+                else:
+                    candidate.segment_type = SegmentType.SHOW_CONTENT
             else:
-                candidate.segment_type = SegmentType.SHOW_CONTENT
+                candidate.confidence = max(commercial_score, 0.80)
 
-        # Step 4: Merge adjacent segments of same type
+        # Step 5: Merge adjacent segments of same type
         merged = self._merge_adjacent(candidates)
 
-        # Step 5: Apply duration constraints
+        # Step 6: Apply duration constraints
         final = self._apply_constraints(merged)
 
-        # Step 6: Label show segments using show prep if available
+        # Step 7: Label show segments using show prep if available
         if show_prep:
             self._label_from_prep(final, transcript, show_prep)
 
@@ -246,6 +257,142 @@ class SegmentDetector:
 
         return boundaries
 
+    def _find_transcript_breaks(self, transcript) -> list[dict]:
+        """Find commercial break cues in the transcript, in chronological order.
+
+        Segment text is matched (lower-cased, by substring) against the cue phrases.
+        Returns dicts with "type" ("break_end" stamped at seg.start or "break_start"
+        stamped at seg.end), "time", "text" and "cue"; one cue per type per segment.
+        """
+        going_to_break = [
+            "take a quick break", "take a break", "go to commercial",
+            "going to break", "let's go to break", "we'll be right back",
+            "right back after", "news break coming up", "after the news",
+            "be right back", "stay tuned", "don't go anywhere",
+        ]
+        coming_back = [
+            "welcome back", "we're back", "we are back", "back from the break",
+            "back from break", "back on the", "back with you",
+        ]
+        # break_end goes first: it is stamped with seg.start, so a segment that both
+        # returns from a break and announces the next one yields cues in time order.
+        cue_kinds = (("break_end", coming_back, "start"), ("break_start", going_to_break, "end"))
+
+        break_cues = []
+        for seg in transcript.segments:
+            text = seg.text.lower().strip()
+            for kind, phrases, edge in cue_kinds:
+                cue = next((p for p in phrases if p in text), None)
+                if cue is not None:
+                    break_cues.append({
+                        "type": kind,
+                        "time": getattr(seg, edge),
+                        "text": seg.text.strip(),
+                        "cue": cue,
+                    })
+        # Stable sort: guards against out-of-order or overlapping transcript segments.
+        break_cues.sort(key=lambda c: c["time"])
+        return break_cues
+
+    def _create_segments_from_breaks(self, transcript_breaks: list[dict],
+                                     silence_boundaries: list[float],
+                                     audio: np.ndarray, sr: int,
+                                     total_duration: float) -> list[DetectedSegment]:
+        """Create segments using transcript break cues as primary boundaries.
+
+        Each "break_start" cue is paired with the next "break_end" cue (or, if none
+        follows, with a point 5 minutes later, capped at the end of the audio). The
+        start snaps to the nearest silence boundary after the cue and the end to the
+        nearest one before it; when none is in range the raw cue time is used. The
+        span between the two is a commercial break; everything else is show content.
+
+        Args:
+            transcript_breaks: Cue dicts with "type" and "time" keys.
+            silence_boundaries: Candidate cut points, in seconds.
+            audio, sr: Unused here; kept for interface compatibility.
+            total_duration: Length of the audio, in seconds.
+
+        Returns:
+            Non-overlapping segments in time order; falls back to
+            ``_create_candidate_segments`` when no usable break region is found.
+        """
+        # Pair cues in time order so a break_end never precedes its break_start.
+        cues = sorted(transcript_breaks, key=lambda c: c["time"])
+
+        break_regions = []
+        i = 0
+        while i < len(cues):
+            cue = cues[i]
+            i += 1
+            if cue["type"] != "break_start":
+                continue
+
+            # Find the matching break_end and resume scanning after it.
+            end_time = None
+            for j in range(i, len(cues)):
+                if cues[j]["type"] == "break_end":
+                    end_time = cues[j]["time"]
+                    i = j + 1
+                    break
+            if end_time is None:
+                # No matching end: assume 5 minutes max, or until end of audio.
+                end_time = min(cue["time"] + 300, total_duration)
+
+            # Snap to silence for clean cuts. Compare against None explicitly:
+            # a boundary at 0.0 is valid but falsy.
+            start = self._nearest_silence(cue["time"], silence_boundaries, after=True)
+            end = self._nearest_silence(end_time, silence_boundaries, after=False)
+            if start is None:
+                start = cue["time"]
+            if end is None or end <= start:
+                end = end_time
+            if end > start:
+                break_regions.append((start, end))
+
+        if not break_regions:
+            return self._create_candidate_segments(silence_boundaries, total_duration)
+
+        def make(seg_start: float, seg_end: float, seg_type) -> DetectedSegment:
+            return DetectedSegment(start=seg_start, end=seg_end, segment_type=seg_type,
+                                   confidence=0.85, label="")
+
+        # Build segments: show -> commercial -> show -> commercial -> ...
+        segments = []
+        prev_end = 0.0
+        for break_start, break_end in break_regions:
+            break_start = max(break_start, prev_end)  # never overlap the previous break
+            if break_end <= break_start:
+                continue
+            if break_start - prev_end > 1.0:
+                segments.append(make(prev_end, break_start, SegmentType.SHOW_CONTENT))
+            segments.append(make(break_start, break_end, SegmentType.COMMERCIAL))
+            prev_end = break_end
+
+        # Final show segment after last break
+        if total_duration - prev_end > 1.0:
+            segments.append(make(prev_end, total_duration, SegmentType.SHOW_CONTENT))
+
+        return segments
+
+    def _nearest_silence(self, time: float, boundaries: list[float],
+                         after: bool = True, max_distance: float = 10.0) -> float | None:
+        """Return the silence boundary closest to ``time`` on one side of it.
+
+        Only boundaries at or after ``time`` (``after=True``) or at or before it
+        (``after=False``) are considered, and only those strictly closer than
+        ``max_distance`` seconds. Ties go to the earliest entry in ``boundaries``.
+        Returns None when nothing qualifies.
+        """
+        nearest = None
+        nearest_gap = max_distance
+        for candidate in boundaries:
+            # Signed gap in the search direction; negative means the wrong side.
+            gap = candidate - time if after else time - candidate
+            if 0 <= gap < nearest_gap:
+                nearest = candidate
+                nearest_gap = gap
+        return nearest
+
     def _create_candidate_segments(self, boundaries: list[float],
                                    total_duration: float) -> list[DetectedSegment]:
         """Create candidate segments from silence boundaries."""
@@ -374,17 +521,20 @@ class SegmentDetector:
         return 0.5  # Neutral
 
     def _merge_adjacent(self, segments: list[DetectedSegment]) -> list[DetectedSegment]:
-        """Merge adjacent segments of the same type."""
+        """Merge adjacent and overlapping segments of the same type."""
         if not segments:
             return []
 
+        # Sort by start time first
+        segments.sort(key=lambda s: s.start)
+
         merged = [segments[0]]
         for seg in segments[1:]:
             prev = merged[-1]
+            # Merge if same type AND (overlapping or within 2 seconds)
             if (prev.segment_type == seg.segment_type and
-                    abs(seg.start - prev.end) < 2.0):  # Within 2 seconds
-                # Extend previous segment
-                prev.end = seg.end
+                    seg.start <= prev.end + 2.0):
+                prev.end = max(prev.end, seg.end)
                 prev.confidence = (prev.confidence + seg.confidence) / 2
             else:
                 merged.append(seg)