Audio processor: fix segment detection with transcript-driven breaks
- Add transcript break phrase detection (going_to_break/coming_back cues)
- Create segments from transcript breaks with silence boundary snapping
- Fix segment dedup in merge_adjacent (handle overlapping segments)
- Add CUDA 12 library path fix (gpu.py + venv activate hook)
- Auto-load existing transcript in detect command
- Tested on 2011-03-05 HR1: correctly identifies commercial break at 34:38

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,9 @@
|
|||||||
"""CLI entry point for the radio show audio processor."""
|
"""CLI entry point for the radio show audio processor."""
|
||||||
|
|
||||||
|
# Must set CUDA paths before any torch/ctranslate2 imports
|
||||||
|
from .gpu import ensure_cuda_libs
|
||||||
|
ensure_cuda_libs()
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -274,8 +278,31 @@ def _cmd_detect(args, config):
|
|||||||
if args.show_prep:
|
if args.show_prep:
|
||||||
show_prep = Path(args.show_prep).read_text()
|
show_prep = Path(args.show_prep).read_text()
|
||||||
|
|
||||||
|
# Load existing transcript if available
|
||||||
|
transcript = None
|
||||||
|
transcript_file = output_dir / "transcript.json"
|
||||||
|
if transcript_file.exists():
|
||||||
|
from .transcriber import Transcript, TranscriptSegment, TranscriptWord
|
||||||
|
import json
|
||||||
|
console.print(f"[dim]Loading transcript from {transcript_file}[/dim]")
|
||||||
|
with open(transcript_file) as f:
|
||||||
|
data = json.load(f)
|
||||||
|
transcript = Transcript(
|
||||||
|
segments=[
|
||||||
|
TranscriptSegment(
|
||||||
|
id=s["id"], text=s["text"],
|
||||||
|
start=s["start"], end=s["end"],
|
||||||
|
words=[TranscriptWord(**w) for w in s.get("words", [])],
|
||||||
|
)
|
||||||
|
for s in data["segments"]
|
||||||
|
],
|
||||||
|
language=data["language"],
|
||||||
|
language_probability=data["language_probability"],
|
||||||
|
duration=data["duration"],
|
||||||
|
)
|
||||||
|
|
||||||
detector = SegmentDetector(config)
|
detector = SegmentDetector(config)
|
||||||
result = detector.detect(audio_path, show_prep=show_prep)
|
result = detector.detect(audio_path, transcript=transcript, show_prep=show_prep)
|
||||||
result.save(output_dir)
|
result.save(output_dir)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
17
projects/radio-show/audio-processor/src/gpu.py
Normal file
17
projects/radio-show/audio-processor/src/gpu.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
"""GPU and CUDA library setup for the audio processor."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_cuda_libs(cuda12_path: str = "/usr/local/lib/ollama/cuda_v12") -> None:
    """Ensure CUDA 12 libraries are on LD_LIBRARY_PATH.

    The system has CUDA 13.2 but faster-whisper's ctranslate2 needs CUDA 12.
    Ollama ships CUDA 12 libs at /usr/local/lib/ollama/cuda_v12/.

    Args:
        cuda12_path: Directory holding the CUDA 12 shared libraries. Defaults
            to the location Ollama installs them to.

    The directory is prepended to ``LD_LIBRARY_PATH`` only when it exists and
    is not already one of the variable's entries; otherwise the environment
    is left untouched.
    """
    # NOTE(review): the dynamic loader normally reads LD_LIBRARY_PATH once at
    # process start, so this assignment may only take effect for child
    # processes (or after a re-exec) -- confirm ctranslate2 actually picks it
    # up when set from inside the running interpreter.
    if not Path(cuda12_path).exists():
        return

    current = os.environ.get("LD_LIBRARY_PATH", "")

    # Compare against whole entries, not the raw string: a substring test
    # would treat e.g. "<cuda12_path>_other" as if the directory were present.
    wanted = cuda12_path.rstrip("/")
    entries = [entry.rstrip("/") for entry in current.split(":") if entry]
    if wanted in entries:
        return

    os.environ["LD_LIBRARY_PATH"] = f"{cuda12_path}:{current}" if current else cuda12_path
|
||||||
@@ -121,32 +121,40 @@ class SegmentDetector:
|
|||||||
boundaries = self._detect_silence_boundaries(audio_data, sample_rate)
|
boundaries = self._detect_silence_boundaries(audio_data, sample_rate)
|
||||||
console.print(f"[dim]Found {len(boundaries)} silence boundaries[/dim]")
|
console.print(f"[dim]Found {len(boundaries)} silence boundaries[/dim]")
|
||||||
|
|
||||||
# Step 2: Create candidate segments between boundaries
|
# Step 2: Find hard break points from transcript (most reliable signal)
|
||||||
candidates = self._create_candidate_segments(boundaries, duration)
|
transcript_breaks = []
|
||||||
|
if transcript:
|
||||||
|
transcript_breaks = self._find_transcript_breaks(transcript)
|
||||||
|
console.print(f"[dim]Found {len(transcript_breaks)} break cues in transcript[/dim]")
|
||||||
|
|
||||||
# Step 3: Score each candidate with all available signals
|
# Step 3: Create segments using transcript breaks as primary boundaries,
|
||||||
|
# with silence boundaries refining the exact cut points
|
||||||
|
if transcript_breaks:
|
||||||
|
candidates = self._create_segments_from_breaks(
|
||||||
|
transcript_breaks, boundaries, audio_data, sample_rate, duration
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
candidates = self._create_candidate_segments(boundaries, duration)
|
||||||
|
|
||||||
|
# Step 4: Score each candidate with all available signals
|
||||||
for candidate in candidates:
|
for candidate in candidates:
|
||||||
scores = {}
|
scores = {}
|
||||||
|
|
||||||
# Signal 1: Fingerprint matching (if library available)
|
|
||||||
scores["fingerprint"] = self._score_fingerprint(
|
scores["fingerprint"] = self._score_fingerprint(
|
||||||
audio_data, sample_rate, candidate
|
audio_data, sample_rate, candidate
|
||||||
)
|
)
|
||||||
|
|
||||||
# Signal 2: Speaker identity
|
|
||||||
if diarization:
|
if diarization:
|
||||||
scores["speaker"] = self._score_speaker_identity(
|
scores["speaker"] = self._score_speaker_identity(
|
||||||
diarization, candidate
|
diarization, candidate
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
scores["speaker"] = 0.5 # neutral
|
scores["speaker"] = 0.5
|
||||||
|
|
||||||
# Signal 3: Audio characteristics
|
|
||||||
scores["audio_chars"] = self._score_audio_characteristics(
|
scores["audio_chars"] = self._score_audio_characteristics(
|
||||||
audio_data, sample_rate, candidate
|
audio_data, sample_rate, candidate
|
||||||
)
|
)
|
||||||
|
|
||||||
# Signal 4: Structural heuristics
|
|
||||||
if transcript:
|
if transcript:
|
||||||
scores["structural"] = self._score_structural(
|
scores["structural"] = self._score_structural(
|
||||||
transcript, candidate
|
transcript, candidate
|
||||||
@@ -154,7 +162,6 @@ class SegmentDetector:
|
|||||||
else:
|
else:
|
||||||
scores["structural"] = 0.5
|
scores["structural"] = 0.5
|
||||||
|
|
||||||
# Combined weighted score (higher = more likely commercial)
|
|
||||||
commercial_score = (
|
commercial_score = (
|
||||||
self.weights.fingerprint_match * scores.get("fingerprint", 0.5) +
|
self.weights.fingerprint_match * scores.get("fingerprint", 0.5) +
|
||||||
self.weights.speaker_identity * scores.get("speaker", 0.5) +
|
self.weights.speaker_identity * scores.get("speaker", 0.5) +
|
||||||
@@ -163,20 +170,24 @@ class SegmentDetector:
|
|||||||
)
|
)
|
||||||
|
|
||||||
candidate.signals = scores
|
candidate.signals = scores
|
||||||
candidate.confidence = commercial_score
|
|
||||||
|
|
||||||
if commercial_score >= self.config.segment_detection.confidence_threshold:
|
# If segment was already typed by transcript breaks, keep it
|
||||||
candidate.segment_type = SegmentType.COMMERCIAL
|
if candidate.segment_type == SegmentType.UNKNOWN:
|
||||||
|
candidate.confidence = commercial_score
|
||||||
|
if commercial_score >= self.config.segment_detection.confidence_threshold:
|
||||||
|
candidate.segment_type = SegmentType.COMMERCIAL
|
||||||
|
else:
|
||||||
|
candidate.segment_type = SegmentType.SHOW_CONTENT
|
||||||
else:
|
else:
|
||||||
candidate.segment_type = SegmentType.SHOW_CONTENT
|
candidate.confidence = max(commercial_score, 0.80)
|
||||||
|
|
||||||
# Step 4: Merge adjacent segments of same type
|
# Step 5: Merge adjacent segments of same type
|
||||||
merged = self._merge_adjacent(candidates)
|
merged = self._merge_adjacent(candidates)
|
||||||
|
|
||||||
# Step 5: Apply duration constraints
|
# Step 6: Apply duration constraints
|
||||||
final = self._apply_constraints(merged)
|
final = self._apply_constraints(merged)
|
||||||
|
|
||||||
# Step 6: Label show segments using show prep if available
|
# Step 7: Label show segments using show prep if available
|
||||||
if show_prep:
|
if show_prep:
|
||||||
self._label_from_prep(final, transcript, show_prep)
|
self._label_from_prep(final, transcript, show_prep)
|
||||||
|
|
||||||
@@ -246,6 +257,142 @@ class SegmentDetector:
|
|||||||
|
|
||||||
return boundaries
|
return boundaries
|
||||||
|
|
||||||
|
def _find_transcript_breaks(self, transcript) -> list[dict]:
|
||||||
|
"""Find commercial break points from transcript content."""
|
||||||
|
break_cues = []
|
||||||
|
going_to_break = [
|
||||||
|
"take a quick break", "take a break", "go to commercial",
|
||||||
|
"going to break", "let's go to break", "we'll be right back",
|
||||||
|
"right back after", "news break coming up", "after the news",
|
||||||
|
"be right back", "stay tuned", "don't go anywhere",
|
||||||
|
]
|
||||||
|
coming_back = [
|
||||||
|
"welcome back", "we're back", "we are back", "back from the break",
|
||||||
|
"back from break", "back on the", "back with you",
|
||||||
|
]
|
||||||
|
|
||||||
|
for seg in transcript.segments:
|
||||||
|
text = seg.text.lower().strip()
|
||||||
|
for cue in going_to_break:
|
||||||
|
if cue in text:
|
||||||
|
break_cues.append({
|
||||||
|
"type": "break_start",
|
||||||
|
"time": seg.end,
|
||||||
|
"text": seg.text.strip(),
|
||||||
|
"cue": cue,
|
||||||
|
})
|
||||||
|
break
|
||||||
|
for cue in coming_back:
|
||||||
|
if cue in text:
|
||||||
|
break_cues.append({
|
||||||
|
"type": "break_end",
|
||||||
|
"time": seg.start,
|
||||||
|
"text": seg.text.strip(),
|
||||||
|
"cue": cue,
|
||||||
|
})
|
||||||
|
break
|
||||||
|
|
||||||
|
return break_cues
|
||||||
|
|
||||||
|
def _create_segments_from_breaks(self, transcript_breaks: list[dict],
                                 silence_boundaries: list[float],
                                 audio: np.ndarray, sr: int,
                                 total_duration: float) -> list[DetectedSegment]:
    """Create segments using transcript break cues as primary boundaries.

    For each break_start, find the nearest silence boundary after it (exact cut point).
    For each break_end, find the nearest silence boundary before it.
    The gap between break_start and break_end = commercial break.

    Args:
        transcript_breaks: Cue dicts with at least ``type`` ("break_start" or
            "break_end") and ``time`` keys, in the order they should be paired.
        silence_boundaries: Candidate cut points used to snap cue times.
        audio: Currently unused by this method.
        sr: Currently unused by this method.
        total_duration: End time of the audio; caps unmatched breaks and
            closes the final show segment.

    Returns:
        Alternating SHOW_CONTENT / COMMERCIAL segments, all with confidence
        0.85 and an empty label. If no usable break region is found, falls
        back to ``_create_candidate_segments``.
    """
    # Pair up each break_start with the next break_end in list order.
    break_regions: list[tuple[float, float]] = []
    i = 0
    while i < len(transcript_breaks):
        cue = transcript_breaks[i]
        if cue["type"] != "break_start":
            i += 1
            continue

        # Find the matching break_end; any cues in between are skipped.
        end_time = None
        for j in range(i + 1, len(transcript_breaks)):
            if transcript_breaks[j]["type"] == "break_end":
                end_time = transcript_breaks[j]["time"]
                i = j + 1
                break
        if end_time is None:
            # No matching end — assume break lasts until a reasonable point
            # (5 minutes max, or until end of audio)
            end_time = min(cue["time"] + 300, total_duration)
            i += 1

        # Snap to nearest silence boundaries for clean cuts.
        start = self._nearest_silence(cue["time"], silence_boundaries, after=True)
        if start is None:
            # No silence boundary found after the cue: drop this break.
            continue
        end = self._nearest_silence(end_time, silence_boundaries, after=False)

        # Compare against None explicitly so a boundary at 0.0 is not
        # mistaken for "not found"; fall back to the raw cue time when the
        # snapped end is missing or would not lie after the start.
        if end is None or end <= start:
            end = end_time

        # Never emit a zero- or negative-length commercial region.
        if end > start:
            break_regions.append((start, end))

    if not break_regions:
        return self._create_candidate_segments(silence_boundaries, total_duration)

    # Build segments: show → commercial → show → commercial → ...
    segments = []
    prev_end = 0.0
    for break_start, break_end in break_regions:
        # Show content before this break (gaps of 1 s or less are ignored).
        if break_start - prev_end > 1.0:
            segments.append(DetectedSegment(
                start=prev_end,
                end=break_start,
                segment_type=SegmentType.SHOW_CONTENT,
                confidence=0.85,
                label="",
            ))

        # Commercial break
        segments.append(DetectedSegment(
            start=break_start,
            end=break_end,
            segment_type=SegmentType.COMMERCIAL,
            confidence=0.85,
            label="",
        ))
        prev_end = break_end

    # Final show segment after last break
    if total_duration - prev_end > 1.0:
        segments.append(DetectedSegment(
            start=prev_end,
            end=total_duration,
            segment_type=SegmentType.SHOW_CONTENT,
            confidence=0.85,
            label="",
        ))

    return segments
|
||||||
|
|
||||||
|
def _nearest_silence(self, time: float, boundaries: list[float],
|
||||||
|
after: bool = True, max_distance: float = 10.0) -> float | None:
|
||||||
|
"""Find the nearest silence boundary to a given time."""
|
||||||
|
best = None
|
||||||
|
best_dist = max_distance
|
||||||
|
|
||||||
|
for b in boundaries:
|
||||||
|
dist = abs(b - time)
|
||||||
|
if dist > max_distance:
|
||||||
|
continue
|
||||||
|
if after and b >= time and dist < best_dist:
|
||||||
|
best = b
|
||||||
|
best_dist = dist
|
||||||
|
elif not after and b <= time and dist < best_dist:
|
||||||
|
best = b
|
||||||
|
best_dist = dist
|
||||||
|
|
||||||
|
return best
|
||||||
|
|
||||||
def _create_candidate_segments(self, boundaries: list[float],
|
def _create_candidate_segments(self, boundaries: list[float],
|
||||||
total_duration: float) -> list[DetectedSegment]:
|
total_duration: float) -> list[DetectedSegment]:
|
||||||
"""Create candidate segments from silence boundaries."""
|
"""Create candidate segments from silence boundaries."""
|
||||||
@@ -374,17 +521,20 @@ class SegmentDetector:
|
|||||||
return 0.5 # Neutral
|
return 0.5 # Neutral
|
||||||
|
|
||||||
def _merge_adjacent(self, segments: list[DetectedSegment]) -> list[DetectedSegment]:
|
def _merge_adjacent(self, segments: list[DetectedSegment]) -> list[DetectedSegment]:
|
||||||
"""Merge adjacent segments of the same type."""
|
"""Merge adjacent and overlapping segments of the same type."""
|
||||||
if not segments:
|
if not segments:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Sort by start time first
|
||||||
|
segments.sort(key=lambda s: s.start)
|
||||||
|
|
||||||
merged = [segments[0]]
|
merged = [segments[0]]
|
||||||
for seg in segments[1:]:
|
for seg in segments[1:]:
|
||||||
prev = merged[-1]
|
prev = merged[-1]
|
||||||
|
# Merge if same type AND (overlapping or within 2 seconds)
|
||||||
if (prev.segment_type == seg.segment_type and
|
if (prev.segment_type == seg.segment_type and
|
||||||
abs(seg.start - prev.end) < 2.0): # Within 2 seconds
|
seg.start <= prev.end + 2.0):
|
||||||
# Extend previous segment
|
prev.end = max(prev.end, seg.end)
|
||||||
prev.end = seg.end
|
|
||||||
prev.confidence = (prev.confidence + seg.confidence) / 2
|
prev.confidence = (prev.confidence + seg.confidence) / 2
|
||||||
else:
|
else:
|
||||||
merged.append(seg)
|
merged.append(seg)
|
||||||
|
|||||||
Reference in New Issue
Block a user