Files
claudetools/projects/radio-show/audio-processor/src/cli.py
Mike Swanson 826141a319 Audio processor: working voice profiler with WavLM speaker embeddings
- Voice profiler using microsoft/wavlm-base-sv (512-dim x-vector embeddings)
- Bootstrap from archive: 180 embeddings from 9 episodes across 2010-2018
- Host identification accuracy: 0.87-0.98 similarity for live speech,
  0.60-0.64 for non-host audio (produced intros, co-host)
- Dropped speechbrain dependency (requires torchaudio, CUDA version conflicts)
- Patched torchaudio CUDA 12.8/13.1 version check (warning instead of error)
- Profile stored in voice-profiles/mike-swanson/ with per-chunk embeddings

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 12:19:13 -07:00

400 lines
14 KiB
Python

"""CLI entry point for the radio show audio processor."""
# Must set CUDA paths before any torch/ctranslate2 imports
from .gpu import ensure_cuda_libs
ensure_cuda_libs()
import argparse
import sys
from pathlib import Path
from rich.console import Console
from rich.panel import Panel
from .config import load_config
console = Console()
def _build_parser():
    """Create the top-level parser and register one subparser per command."""
    parser = argparse.ArgumentParser(
        description="Radio Show Audio Processor — The Computer Guru Show",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s process episode.mp3
%(prog)s process episode.mp3 --show-prep show-prep.md
%(prog)s process hr1.mp3 hr2.mp3 --archive-mode --date 2016-03-15
%(prog)s transcribe episode.mp3
%(prog)s bootstrap-voice archive/
%(prog)s review-elements
%(prog)s review-speakers
""",
    )
    parser.add_argument("--config", type=str, default=None,
                        help="Path to config.yaml")
    # The chosen subcommand name is stored on ``args.command``.
    subparsers = parser.add_subparsers(dest="command", required=True)

    # === process ===
    p_process = subparsers.add_parser("process", help="Full pipeline")
    p_process.add_argument("audio", nargs="+", type=str,
                           help="Audio file(s) to process")
    p_process.add_argument("--show-prep", type=str, default=None,
                           help="Path to show prep markdown file")
    p_process.add_argument("--output", type=str, default=None,
                           help="Output directory")
    p_process.add_argument("--archive-mode", action="store_true",
                           help="Archive mode: learn elements and voices")
    p_process.add_argument("--date", type=str, default=None,
                           help="Episode date (for archive mode)")
    p_process.add_argument("--skip-transcribe", action="store_true",
                           help="Skip transcription (use existing transcript)")
    p_process.add_argument("--skip-diarize", action="store_true",
                           help="Skip diarization")
    p_process.add_argument("--skip-analysis", action="store_true",
                           help="Skip LLM analysis")

    # === transcribe ===
    p_transcribe = subparsers.add_parser("transcribe", help="Transcribe only")
    p_transcribe.add_argument("audio", type=str, help="Audio file")
    p_transcribe.add_argument("--output", type=str, default=None)
    p_transcribe.add_argument("--model", type=str, default=None,
                              help="Whisper model size")

    # === diarize ===
    p_diarize = subparsers.add_parser("diarize", help="Diarize only")
    p_diarize.add_argument("audio", type=str, help="Audio file")
    p_diarize.add_argument("--output", type=str, default=None)

    # === detect ===
    p_detect = subparsers.add_parser("detect", help="Detect segments only")
    p_detect.add_argument("audio", type=str, help="Audio file")
    p_detect.add_argument("--output", type=str, default=None)
    p_detect.add_argument("--show-prep", type=str, default=None)

    # === split ===
    p_split = subparsers.add_parser("split", help="Split into segments")
    p_split.add_argument("audio", type=str, help="Audio file")
    p_split.add_argument("--detection-report", type=str, required=True,
                         help="Path to detection-report.json")
    p_split.add_argument("--output", type=str, default=None)

    # === bootstrap-voice ===
    p_voice = subparsers.add_parser(
        "bootstrap-voice",
        help="Bootstrap host voice profile from archive",
    )
    p_voice.add_argument("archive_dir", type=str,
                         help="Directory containing archive MP3s")
    p_voice.add_argument("--speaker-name", type=str, default="Mike Swanson")
    p_voice.add_argument("--sample-count", type=int, default=10,
                         help="Number of episodes to sample")

    # === review-elements ===
    subparsers.add_parser("review-elements",
                          help="Review discovered audio elements")

    # === review-speakers ===
    subparsers.add_parser("review-speakers",
                          help="Review unknown speaker clusters")
    return parser


def main(argv=None):
    """Parse the command line, load the config and run the chosen subcommand.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, so existing ``main()`` callers are
            unaffected; passing a list allows programmatic invocation.

    Raises:
        SystemExit: Raised by argparse for ``--help`` (code 0) and for usage
            errors such as a missing subcommand (code 2).
    """
    args = _build_parser().parse_args(argv)
    config = load_config(args.config)

    console.print(Panel.fit(
        "[bold]Radio Show Audio Processor[/bold]\n"
        "[dim]The Computer Guru Show[/dim]",
        border_style="blue",
    ))

    # Every handler takes (args, config); keys match the subparser names.
    handlers = {
        "process": _cmd_process,
        "transcribe": _cmd_transcribe,
        "diarize": _cmd_diarize,
        "detect": _cmd_detect,
        "split": _cmd_split,
        "bootstrap-voice": _cmd_bootstrap_voice,
        "review-elements": _cmd_review_elements,
        "review-speakers": _cmd_review_speakers,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args, config)
def _load_transcript(transcript_file):
    """Rebuild a ``Transcript`` from a previously saved ``transcript.json``.

    The JSON document must provide ``segments`` (each with ``id``, ``text``,
    ``start``, ``end`` and an optional ``words`` list), ``language``,
    ``language_probability`` and ``duration``.

    Args:
        transcript_file: Path to the JSON file to read.

    Returns:
        The reconstructed ``Transcript`` instance.
    """
    import json

    from .transcriber import Transcript, TranscriptSegment, TranscriptWord

    # NOTE(review): encoding is left at the platform default so it matches
    # whatever ``Transcript.save`` writes — confirm and pin it explicitly.
    with open(transcript_file) as f:
        data = json.load(f)
    return Transcript(
        segments=[
            TranscriptSegment(
                id=s["id"], text=s["text"],
                start=s["start"], end=s["end"],
                words=[TranscriptWord(**w) for w in s.get("words", [])],
            )
            for s in data["segments"]
        ],
        language=data["language"],
        language_probability=data["language_probability"],
        duration=data["duration"],
    )


def _cmd_process(args, config):
    """Run the full pipeline on one episode.

    Stages, in order: transcribe, diarize, detect segments, remove
    commercials, split segments (plus chapter generation) and LLM analysis.
    Several input files are concatenated into one before processing.

    Results are written to ``--output`` or, by default, to a ``processed``
    directory beside the audio file; the analysis is saved to a sibling
    ``generated`` directory.

    Args:
        args: Parsed arguments of the ``process`` subcommand.
        config: Configuration object returned by ``load_config``.
    """
    from .transcriber import transcribe
    from .diarizer import diarize, VoiceProfileStore
    from .segment_detector import SegmentDetector
    from .audio_editor import remove_commercials, split_segments, generate_chapters
    from .analyzer import analyze_episode

    audio_files = [Path(f) for f in args.audio]
    # Multiple files (HR1 + HR2) are concatenated first; otherwise the single
    # input is used directly.
    if len(audio_files) > 1:
        audio_path = _concatenate_audio(audio_files, config)
    else:
        audio_path = audio_files[0]

    output_dir = Path(args.output) if args.output else audio_path.parent / "processed"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load show prep if provided. Read as UTF-8 explicitly so the text does
    # not depend on the platform's default encoding.
    show_prep = None
    if args.show_prep:
        show_prep = Path(args.show_prep).read_text(encoding="utf-8")

    # Stage 1: Transcribe
    transcript = None
    if not args.skip_transcribe:
        transcript = transcribe(
            audio_path,
            model_size=config.audio.whisper_model,
            language=config.audio.whisper_language,
        )
        transcript.save(output_dir)
    else:
        console.print("[dim]Skipping transcription[/dim]")
        # Reuse a transcript from an earlier run when one is present.
        transcript_file = output_dir / "transcript.json"
        if transcript_file.exists():
            transcript = _load_transcript(transcript_file)
        else:
            # Make the missing input visible instead of continuing silently.
            console.print(
                f"[yellow]No existing transcript at {transcript_file}; "
                "continuing without one[/yellow]"
            )

    # Stage 2: Diarize
    diarization = None
    if not args.skip_diarize:
        voice_profiles = VoiceProfileStore(
            config.resolve_path(config.diarization.voice_profiles_dir)
        )
        diarization = diarize(
            audio_path,
            voice_profiles=voice_profiles,
            min_speakers=config.diarization.min_speakers,
            max_speakers=config.diarization.max_speakers,
        )
        diarization.save(output_dir)
    else:
        console.print("[dim]Skipping diarization[/dim]")

    # Stage 3: Detect segments (transcript and diarization may be None)
    detector = SegmentDetector(config)
    detection = detector.detect(
        audio_path,
        transcript=transcript,
        diarization=diarization,
        show_prep=show_prep,
    )
    detection.save(output_dir)

    # Stage 4: Remove commercials
    clean_path = output_dir / f"podcast-episode.{config.audio.output_format}"
    remove_commercials(
        audio_path, detection.segments, clean_path,
        crossfade_ms=config.audio.crossfade_ms,
        bitrate=config.audio.output_bitrate,
        normalize=config.audio.normalize,
    )

    # Stage 5: Split segments
    segments_dir = output_dir / "segments"
    split_segments(
        audio_path, detection.segments, segments_dir,
        bitrate=config.audio.output_bitrate,
    )

    # Generate chapters
    generate_chapters(detection.segments, output_dir / "chapters.json")

    # Stage 6: Analyze (requires a transcript)
    if args.skip_analysis:
        console.print("[dim]Skipping analysis[/dim]")
    elif not transcript:
        console.print("[yellow]Skipping analysis: no transcript available[/yellow]")
    else:
        analysis = analyze_episode(
            transcript_text=transcript.full_text,
            diarization_data=diarization.to_dict() if diarization else None,
            show_prep=show_prep,
            segments=detection.segments,
            model=config.llm.model,
            ollama_host=config.llm.ollama_host,
        )
        generated_dir = output_dir.parent / "generated"
        analysis.save(generated_dir)

    console.print("\n[bold green]Processing complete![/bold green]")
    console.print(f"Output: {output_dir}")
def _cmd_transcribe(args, config):
    """Transcribe a single audio file and save the transcript.

    The model size comes from ``--model`` when given, otherwise from
    ``config.audio.whisper_model``. The configured language is passed through
    so this command transcribes the same way the full pipeline does.

    Args:
        args: Parsed arguments of the ``transcribe`` subcommand.
        config: Configuration object returned by ``load_config``.
    """
    from .transcriber import transcribe

    audio_path = Path(args.audio)
    output_dir = Path(args.output) if args.output else audio_path.parent / "processed"
    # Create the destination up front, as the full pipeline does.
    output_dir.mkdir(parents=True, exist_ok=True)

    transcript = transcribe(
        audio_path,
        model_size=args.model or config.audio.whisper_model,
        language=config.audio.whisper_language,
    )
    transcript.save(output_dir)
def _cmd_diarize(args, config):
    """Diarize a single audio file and save the result.

    Voice profiles are loaded from ``config.diarization.voice_profiles_dir``
    and the configured speaker bounds are passed through, so this command
    diarizes the same way the full pipeline does.

    Args:
        args: Parsed arguments of the ``diarize`` subcommand.
        config: Configuration object returned by ``load_config``.
    """
    from .diarizer import diarize, VoiceProfileStore

    audio_path = Path(args.audio)
    output_dir = Path(args.output) if args.output else audio_path.parent / "processed"
    # Create the destination up front, as the full pipeline does.
    output_dir.mkdir(parents=True, exist_ok=True)

    voice_profiles = VoiceProfileStore(
        config.resolve_path(config.diarization.voice_profiles_dir)
    )
    result = diarize(
        audio_path,
        voice_profiles=voice_profiles,
        min_speakers=config.diarization.min_speakers,
        max_speakers=config.diarization.max_speakers,
    )
    result.save(output_dir)
def _cmd_detect(args, config):
    """Run segment detection only and save the detection result.

    A ``transcript.json`` already present in the output directory is loaded
    and handed to the detector; when there is none, detection runs without a
    transcript and a notice is printed.

    Args:
        args: Parsed arguments of the ``detect`` subcommand.
        config: Configuration object returned by ``load_config``.
    """
    from .segment_detector import SegmentDetector

    audio_path = Path(args.audio)
    output_dir = Path(args.output) if args.output else audio_path.parent / "processed"
    # Create the destination up front, as the full pipeline does.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read show prep as UTF-8 explicitly so the text does not depend on the
    # platform's default encoding.
    show_prep = None
    if args.show_prep:
        show_prep = Path(args.show_prep).read_text(encoding="utf-8")

    # Load existing transcript if available
    transcript = None
    transcript_file = output_dir / "transcript.json"
    if transcript_file.exists():
        from .transcriber import Transcript, TranscriptSegment, TranscriptWord
        import json

        console.print(f"[dim]Loading transcript from {transcript_file}[/dim]")
        # NOTE(review): encoding is left at the platform default so it matches
        # whatever ``Transcript.save`` writes — confirm and pin it explicitly.
        with open(transcript_file) as f:
            data = json.load(f)
        transcript = Transcript(
            segments=[
                TranscriptSegment(
                    id=s["id"], text=s["text"],
                    start=s["start"], end=s["end"],
                    words=[TranscriptWord(**w) for w in s.get("words", [])],
                )
                for s in data["segments"]
            ],
            language=data["language"],
            language_probability=data["language_probability"],
            duration=data["duration"],
        )
    else:
        console.print(
            f"[dim]No transcript found at {transcript_file}; "
            "detecting without one[/dim]"
        )

    detector = SegmentDetector(config)
    result = detector.detect(audio_path, transcript=transcript, show_prep=show_prep)
    result.save(output_dir)
def _cmd_split(args, config):
    """Split an audio file using an existing detection report.

    The report is JSON with a ``segments`` list; each entry supplies
    ``start``, ``end``, ``type``, ``confidence`` and an optional ``label``.
    Segment files go to ``--output`` (default: a ``segments`` directory beside
    the audio file) and ``chapters.json`` is written to that directory's
    parent.

    Args:
        args: Parsed arguments of the ``split`` subcommand.
        config: Configuration object returned by ``load_config``.
    """
    from .audio_editor import split_segments, generate_chapters
    from .segment_detector import DetectedSegment, SegmentType
    import json

    audio_path = Path(args.audio)
    output_dir = Path(args.output) if args.output else audio_path.parent / "segments"

    with open(args.detection_report) as f:
        report = json.load(f)

    segments = [
        DetectedSegment(
            start=entry["start"], end=entry["end"],
            segment_type=SegmentType(entry["type"]),
            confidence=entry["confidence"],
            label=entry.get("label", ""),
        )
        for entry in report["segments"]
    ]

    # Pass the bitrate by keyword, exactly as the full pipeline does, so the
    # call does not depend on split_segments' positional parameter order.
    split_segments(
        audio_path, segments, output_dir,
        bitrate=config.audio.output_bitrate,
    )
    generate_chapters(segments, output_dir.parent / "chapters.json")
def _cmd_bootstrap_voice(args, config):
    """Bootstrap the host voice profile from archive episodes.

    MP3 files are collected recursively from ``args.archive_dir``. When more
    are found than ``--sample-count``, that many are picked at evenly spaced
    positions across the sorted list, so the sample covers the whole archive
    rather than only its beginning.

    Args:
        args: Parsed arguments of the ``bootstrap-voice`` subcommand.
        config: Configuration object returned by ``load_config``.
    """
    from .voice_profiler import VoiceProfiler

    # Reject a non-positive sample count up front; zero would otherwise
    # divide by zero during sampling.
    if args.sample_count < 1:
        console.print("[red]--sample-count must be at least 1[/red]")
        return

    archive_dir = Path(args.archive_dir)
    # NOTE(review): other commands read config.diarization.voice_profiles_dir
    # while this one reads config.paths.voice_profiles — confirm both point
    # at the same store.
    profiler = VoiceProfiler(
        config.resolve_path(config.paths.voice_profiles),
        device="cuda",
    )

    # Find MP3 files in archive directory
    mp3_files = sorted(archive_dir.glob("**/*.mp3"))
    if not mp3_files:
        console.print(f"[red]No MP3 files found in {archive_dir}[/red]")
        return

    # Sample if we have more than requested. Index i*total//count spreads the
    # picks over the full list; a floor-divided step would bunch them at the
    # front whenever total is not a multiple of count.
    total = len(mp3_files)
    if total > args.sample_count:
        mp3_files = [
            mp3_files[(i * total) // args.sample_count]
            for i in range(args.sample_count)
        ]

    console.print(f"[dim]Found {len(mp3_files)} episodes to process[/dim]")
    profiler.bootstrap_host_from_episodes(mp3_files, host_name=args.speaker_name)
    profiler.print_profiles()
def _cmd_review_elements(args, config):
    """Placeholder for the audio-element review command.

    Prints a heading followed by a "not yet implemented" notice; ``args`` and
    ``config`` are accepted for handler-signature parity and are unused.
    """
    console.print("[bold]Reviewing discovered elements[/bold]")
    # TODO: Implement element review UI
    console.print("[yellow]Not yet implemented[/yellow]")
def _cmd_review_speakers(args, config):
    """Placeholder for the unknown-speaker review command.

    Prints a heading followed by a "not yet implemented" notice; ``args`` and
    ``config`` are accepted for handler-signature parity and are unused.
    """
    console.print("[bold]Reviewing unknown speakers[/bold]")
    # TODO: Implement speaker review UI
    console.print("[yellow]Not yet implemented[/yellow]")
def _concatenate_audio(files: list[Path], config) -> Path:
    """Concatenate multiple audio files (e.g., HR1 + HR2) with ffmpeg.

    A temporary concat list is written beside the first input, ffmpeg joins
    the inputs with stream copy, and the list is removed afterwards even when
    ffmpeg fails.

    Args:
        files: Input audio files, in playback order.
        config: Configuration object (currently unused).

    Returns:
        Path of the combined file, ``combined_<first stem>.mp3`` in the first
        input's directory.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    import subprocess

    work_dir = files[0].parent
    output = work_dir / f"combined_{files[0].stem}.mp3"
    concat_file = work_dir / ".concat_list.txt"

    try:
        with open(concat_file, "w", encoding="utf-8") as f:
            for audio_file in files:
                # ffmpeg resolves relative entries against the list file's
                # directory, not the CWD, so write absolute paths. A single
                # quote inside a quoted entry must be written as '\''.
                entry = str(audio_file.resolve()).replace("'", "'\\''")
                f.write(f"file '{entry}'\n")

        subprocess.run(
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
             "-i", str(concat_file), "-c", "copy", str(output)],
            capture_output=True, check=True,
        )
    finally:
        # Never leave the temporary list behind, even on failure.
        concat_file.unlink(missing_ok=True)

    console.print(f"[dim]Concatenated {len(files)} files -> {output.name}[/dim]")
    return output
if __name__ == "__main__":
main()