- Voice profiler using microsoft/wavlm-base-sv (512-dim x-vector embeddings)
- Bootstrap from archive: 180 embeddings from 9 episodes across 2010-2018
- Host identification accuracy: 0.87-0.98 similarity for live speech, 0.60-0.64 for non-host audio (produced intros, co-host)
- Dropped speechbrain dependency (requires torchaudio, CUDA version conflicts)
- Patched torchaudio CUDA 12.8/13.1 version check (warning instead of error)
- Profile stored in voice-profiles/mike-swanson/ with per-chunk embeddings

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
400 lines
14 KiB
Python
400 lines
14 KiB
Python
"""CLI entry point for the radio show audio processor."""
|
|
|
|
# Must set CUDA paths before any torch/ctranslate2 imports
|
|
from .gpu import ensure_cuda_libs
|
|
ensure_cuda_libs()
|
|
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from rich.console import Console
|
|
from rich.panel import Panel
|
|
|
|
from .config import load_config
|
|
|
|
console = Console()
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Builds the argument parser (a global ``--config`` option plus one
    sub-parser per command), loads the configuration, prints the banner
    panel, and forwards ``(args, config)`` to the handler registered for
    the chosen command.
    """
    cli = argparse.ArgumentParser(
        description="Radio Show Audio Processor — The Computer Guru Show",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s process episode.mp3
  %(prog)s process episode.mp3 --show-prep show-prep.md
  %(prog)s process hr1.mp3 hr2.mp3 --archive-mode --date 2016-03-15
  %(prog)s transcribe episode.mp3
  %(prog)s bootstrap-voice archive/
  %(prog)s review-elements
  %(prog)s review-speakers
""",
    )
    cli.add_argument("--config", type=str, default=None,
                     help="Path to config.yaml")

    commands = cli.add_subparsers(dest="command", required=True)

    # === process ===
    process = commands.add_parser("process", help="Full pipeline")
    process.add_argument("audio", nargs="+", type=str,
                         help="Audio file(s) to process")
    process.add_argument("--show-prep", type=str, default=None,
                         help="Path to show prep markdown file")
    process.add_argument("--output", type=str, default=None,
                         help="Output directory")
    process.add_argument("--archive-mode", action="store_true",
                         help="Archive mode: learn elements and voices")
    process.add_argument("--date", type=str, default=None,
                         help="Episode date (for archive mode)")
    # The three stage-skipping switches share the same shape.
    skip_flags = (
        ("--skip-transcribe", "Skip transcription (use existing transcript)"),
        ("--skip-diarize", "Skip diarization"),
        ("--skip-analysis", "Skip LLM analysis"),
    )
    for flag, description in skip_flags:
        process.add_argument(flag, action="store_true", help=description)

    # === transcribe ===
    transcribe_cmd = commands.add_parser("transcribe", help="Transcribe only")
    transcribe_cmd.add_argument("audio", type=str, help="Audio file")
    transcribe_cmd.add_argument("--output", type=str, default=None)
    transcribe_cmd.add_argument("--model", type=str, default=None,
                                help="Whisper model size")

    # === diarize ===
    diarize_cmd = commands.add_parser("diarize", help="Diarize only")
    diarize_cmd.add_argument("audio", type=str, help="Audio file")
    diarize_cmd.add_argument("--output", type=str, default=None)

    # === detect ===
    detect_cmd = commands.add_parser("detect", help="Detect segments only")
    detect_cmd.add_argument("audio", type=str, help="Audio file")
    detect_cmd.add_argument("--output", type=str, default=None)
    detect_cmd.add_argument("--show-prep", type=str, default=None)

    # === split ===
    split_cmd = commands.add_parser("split", help="Split into segments")
    split_cmd.add_argument("audio", type=str, help="Audio file")
    split_cmd.add_argument("--detection-report", type=str, required=True,
                           help="Path to detection-report.json")
    split_cmd.add_argument("--output", type=str, default=None)

    # === bootstrap-voice ===
    voice_cmd = commands.add_parser(
        "bootstrap-voice",
        help="Bootstrap host voice profile from archive",
    )
    voice_cmd.add_argument("archive_dir", type=str,
                           help="Directory containing archive MP3s")
    voice_cmd.add_argument("--speaker-name", type=str, default="Mike Swanson")
    voice_cmd.add_argument("--sample-count", type=int, default=10,
                           help="Number of episodes to sample")

    # === review-elements ===
    commands.add_parser("review-elements",
                        help="Review discovered audio elements")

    # === review-speakers ===
    commands.add_parser("review-speakers",
                        help="Review unknown speaker clusters")

    args = cli.parse_args()
    config = load_config(args.config)

    banner = (
        "[bold]Radio Show Audio Processor[/bold]\n"
        "[dim]The Computer Guru Show[/dim]"
    )
    console.print(Panel.fit(banner, border_style="blue"))

    # Map each sub-command name to its handler; all handlers take (args, config).
    handlers = {
        "process": _cmd_process,
        "transcribe": _cmd_transcribe,
        "diarize": _cmd_diarize,
        "detect": _cmd_detect,
        "split": _cmd_split,
        "bootstrap-voice": _cmd_bootstrap_voice,
        "review-elements": _cmd_review_elements,
        "review-speakers": _cmd_review_speakers,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args, config)
|
|
|
|
|
|
def _cmd_process(args, config):
    """Run the full pipeline: transcribe, diarize, detect, edit, analyze.

    Args:
        args: Parsed ``process`` namespace. Reads ``audio``, ``output``,
            ``show_prep``, ``skip_transcribe``, ``skip_diarize`` and
            ``skip_analysis``.
        config: Loaded configuration. Reads the ``audio``, ``diarization``
            and ``llm`` sections and calls ``resolve_path``.

    Outputs are written to ``args.output`` or, by default, to a
    ``processed`` directory next to the (possibly concatenated) audio file.
    The analysis result is saved to a sibling ``generated`` directory.
    """
    from .transcriber import transcribe
    from .diarizer import diarize, VoiceProfileStore
    from .segment_detector import SegmentDetector
    from .audio_editor import remove_commercials, split_segments, generate_chapters
    from .analyzer import analyze_episode

    audio_files = [Path(f) for f in args.audio]

    # Several inputs (e.g. HR1 + HR2) are merged into one file first;
    # a single input is used as-is.
    if len(audio_files) > 1:
        audio_path = _concatenate_audio(audio_files, config)
    else:
        audio_path = audio_files[0]

    output_dir = Path(args.output) if args.output else audio_path.parent / "processed"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load show prep if provided
    show_prep = None
    if args.show_prep:
        show_prep = Path(args.show_prep).read_text()

    # Stage 1: Transcribe
    if args.skip_transcribe:
        console.print("[dim]Skipping transcription[/dim]")
        # Reuse a transcript saved by an earlier run, if there is one.
        transcript = _load_saved_transcript(output_dir / "transcript.json")
        if transcript is None:
            # Previously this case was silent, which made the later
            # (also silent) analysis skip hard to diagnose.
            console.print(
                f"[yellow]No existing transcript.json in {output_dir}; "
                "continuing without a transcript[/yellow]"
            )
    else:
        transcript = transcribe(
            audio_path,
            model_size=config.audio.whisper_model,
            language=config.audio.whisper_language,
        )
        transcript.save(output_dir)

    # Stage 2: Diarize
    diarization = None
    if not args.skip_diarize:
        voice_profiles = VoiceProfileStore(
            config.resolve_path(config.diarization.voice_profiles_dir)
        )
        diarization = diarize(
            audio_path,
            voice_profiles=voice_profiles,
            min_speakers=config.diarization.min_speakers,
            max_speakers=config.diarization.max_speakers,
        )
        diarization.save(output_dir)
    else:
        console.print("[dim]Skipping diarization[/dim]")

    # Stage 3: Detect segments
    detector = SegmentDetector(config)
    detection = detector.detect(
        audio_path,
        transcript=transcript,
        diarization=diarization,
        show_prep=show_prep,
    )
    detection.save(output_dir)

    # Stage 4: Remove commercials
    clean_path = output_dir / f"podcast-episode.{config.audio.output_format}"
    remove_commercials(
        audio_path, detection.segments, clean_path,
        crossfade_ms=config.audio.crossfade_ms,
        bitrate=config.audio.output_bitrate,
        normalize=config.audio.normalize,
    )

    # Stage 5: Split segments
    segments_dir = output_dir / "segments"
    split_segments(
        audio_path, detection.segments, segments_dir,
        bitrate=config.audio.output_bitrate,
    )

    # Generate chapters
    generate_chapters(detection.segments, output_dir / "chapters.json")

    # Stage 6: Analyze (needs a transcript to work from)
    if not args.skip_analysis and transcript:
        analysis = analyze_episode(
            transcript_text=transcript.full_text,
            diarization_data=diarization.to_dict() if diarization else None,
            show_prep=show_prep,
            segments=detection.segments,
            model=config.llm.model,
            ollama_host=config.llm.ollama_host,
        )
        generated_dir = output_dir.parent / "generated"
        analysis.save(generated_dir)
    elif not args.skip_analysis:
        console.print("[dim]Skipping analysis (no transcript available)[/dim]")

    console.print("\n[bold green]Processing complete![/bold green]")
    console.print(f"Output: {output_dir}")


def _load_saved_transcript(transcript_file):
    """Rebuild a ``Transcript`` from a saved ``transcript.json``.

    Args:
        transcript_file: Path of the JSON file to read.

    Returns:
        The reconstructed ``Transcript``, or ``None`` when the file does
        not exist.
    """
    if not transcript_file.exists():
        return None

    from .transcriber import Transcript, TranscriptSegment, TranscriptWord
    import json

    with open(transcript_file) as f:
        data = json.load(f)

    return Transcript(
        segments=[
            TranscriptSegment(
                id=s["id"], text=s["text"],
                start=s["start"], end=s["end"],
                # "words" is optional in the saved segment data.
                words=[TranscriptWord(**w) for w in s.get("words", [])],
            )
            for s in data["segments"]
        ],
        language=data["language"],
        language_probability=data["language_probability"],
        duration=data["duration"],
    )
|
|
|
|
|
|
def _cmd_transcribe(args, config):
    """Transcribe a single audio file and save the transcript.

    Args:
        args: Parsed ``transcribe`` namespace. Reads ``audio``, ``output``
            and ``model``.
        config: Loaded configuration. ``config.audio.whisper_model`` is the
            fallback model size and ``config.audio.whisper_language`` is the
            transcription language.
    """
    from .transcriber import transcribe

    audio_path = Path(args.audio)
    output_dir = Path(args.output) if args.output else audio_path.parent / "processed"
    # Same as the full pipeline: make sure the target directory exists.
    output_dir.mkdir(parents=True, exist_ok=True)

    model = args.model or config.audio.whisper_model

    # Pass the configured language so transcribe-only runs behave like the
    # transcription stage of the full ``process`` command.
    transcript = transcribe(
        audio_path,
        model_size=model,
        language=config.audio.whisper_language,
    )
    transcript.save(output_dir)
|
|
|
|
|
|
def _cmd_diarize(args, config):
    """Diarize a single audio file and save the result.

    Args:
        args: Parsed ``diarize`` namespace. Reads ``audio`` and ``output``.
        config: Loaded configuration. Reads ``config.diarization`` for the
            voice-profile directory and the speaker-count bounds.
    """
    from .diarizer import diarize, VoiceProfileStore

    audio_path = Path(args.audio)
    output_dir = Path(args.output) if args.output else audio_path.parent / "processed"
    # Same as the full pipeline: make sure the target directory exists.
    output_dir.mkdir(parents=True, exist_ok=True)

    voice_profiles = VoiceProfileStore(
        config.resolve_path(config.diarization.voice_profiles_dir)
    )
    # Honour the configured speaker bounds so diarize-only runs behave like
    # the diarization stage of the full ``process`` command.
    result = diarize(
        audio_path,
        voice_profiles=voice_profiles,
        min_speakers=config.diarization.min_speakers,
        max_speakers=config.diarization.max_speakers,
    )
    result.save(output_dir)
|
|
|
|
|
|
def _cmd_detect(args, config):
    """Run segment detection on one audio file and save the result.

    Args:
        args: Parsed ``detect`` namespace. Reads ``audio``, ``output`` and
            ``show_prep``.
        config: Loaded configuration, handed to ``SegmentDetector``.

    If ``transcript.json`` already exists in the output directory it is
    loaded and passed to the detector; otherwise detection runs without a
    transcript.
    """
    from .segment_detector import SegmentDetector

    audio_path = Path(args.audio)
    if args.output:
        output_dir = Path(args.output)
    else:
        output_dir = audio_path.parent / "processed"

    show_prep = Path(args.show_prep).read_text() if args.show_prep else None

    # Load existing transcript if available
    transcript = None
    saved_transcript = output_dir / "transcript.json"
    if saved_transcript.exists():
        from .transcriber import Transcript, TranscriptSegment, TranscriptWord
        import json

        console.print(f"[dim]Loading transcript from {saved_transcript}[/dim]")
        with open(saved_transcript) as handle:
            payload = json.load(handle)

        rebuilt_segments = []
        for seg in payload["segments"]:
            rebuilt_segments.append(
                TranscriptSegment(
                    id=seg["id"],
                    text=seg["text"],
                    start=seg["start"],
                    end=seg["end"],
                    # "words" is optional in the saved segment data.
                    words=[TranscriptWord(**word) for word in seg.get("words", [])],
                )
            )

        transcript = Transcript(
            segments=rebuilt_segments,
            language=payload["language"],
            language_probability=payload["language_probability"],
            duration=payload["duration"],
        )

    detection = SegmentDetector(config).detect(
        audio_path, transcript=transcript, show_prep=show_prep
    )
    detection.save(output_dir)
|
|
|
|
|
|
def _cmd_split(args, config):
    """Split an audio file using an existing detection report.

    Args:
        args: Parsed ``split`` namespace. Reads ``audio``, ``output`` and
            ``detection_report``.
        config: Loaded configuration. ``config.audio.output_bitrate`` is
            passed to ``split_segments``.

    Segments are rebuilt from the report's ``"segments"`` list. The chapter
    file is written as ``chapters.json`` in the parent of the output
    directory.
    """
    from .audio_editor import split_segments, generate_chapters
    from .segment_detector import DetectedSegment, SegmentType
    import json

    audio_path = Path(args.audio)
    if args.output:
        output_dir = Path(args.output)
    else:
        output_dir = audio_path.parent / "segments"

    with open(args.detection_report) as report_file:
        report = json.load(report_file)

    segments = []
    for entry in report["segments"]:
        segments.append(
            DetectedSegment(
                start=entry["start"],
                end=entry["end"],
                segment_type=SegmentType(entry["type"]),
                confidence=entry["confidence"],
                # "label" is optional in the report.
                label=entry.get("label", ""),
            )
        )

    split_segments(audio_path, segments, output_dir, config.audio.output_bitrate)
    chapters_path = output_dir.parent / "chapters.json"
    generate_chapters(segments, chapters_path)
|
|
|
|
|
|
def _cmd_bootstrap_voice(args, config):
    """Bootstrap host voice profile from archive episodes.

    Args:
        args: Parsed ``bootstrap-voice`` namespace. Reads ``archive_dir``,
            ``speaker_name`` and ``sample_count``.
        config: Loaded configuration. ``config.paths.voice_profiles`` is
            resolved and passed to ``VoiceProfiler``.

    Prints an error and returns without doing anything when
    ``sample_count`` is not positive or when no MP3 files are found.
    """
    from .voice_profiler import VoiceProfiler

    # A zero count used to crash with ZeroDivisionError in the step
    # computation below, and a negative one silently sampled nothing.
    if args.sample_count < 1:
        console.print(
            f"[red]--sample-count must be at least 1 (got {args.sample_count})[/red]"
        )
        return

    archive_dir = Path(args.archive_dir)
    profiler = VoiceProfiler(
        config.resolve_path(config.paths.voice_profiles),
        device="cuda",
    )

    # Find MP3 files in archive directory (recursive, sorted by path)
    mp3_files = sorted(archive_dir.glob("**/*.mp3"))
    if not mp3_files:
        console.print(f"[red]No MP3 files found in {archive_dir}[/red]")
        return

    # Sample if we have more than requested: take every ``step``-th file so
    # the selection is spread across the sorted archive.
    if len(mp3_files) > args.sample_count:
        step = len(mp3_files) // args.sample_count
        mp3_files = [mp3_files[i * step] for i in range(args.sample_count)]

    console.print(f"[dim]Found {len(mp3_files)} episodes to process[/dim]")

    profiler.bootstrap_host_from_episodes(mp3_files, host_name=args.speaker_name)
    profiler.print_profiles()
|
|
|
|
|
|
def _cmd_review_elements(args, config):
    """Placeholder for the element review command.

    Prints a heading followed by a "not yet implemented" notice; ``args``
    and ``config`` are accepted but not used.
    """
    # TODO: Implement element review UI
    messages = (
        "[bold]Reviewing discovered elements[/bold]",
        "[yellow]Not yet implemented[/yellow]",
    )
    for message in messages:
        console.print(message)
|
|
|
|
|
|
def _cmd_review_speakers(args, config):
    """Placeholder for the speaker review command.

    Prints a heading followed by a "not yet implemented" notice; ``args``
    and ``config`` are accepted but not used.
    """
    # TODO: Implement speaker review UI
    messages = (
        "[bold]Reviewing unknown speakers[/bold]",
        "[yellow]Not yet implemented[/yellow]",
    )
    for message in messages:
        console.print(message)
|
|
|
|
|
|
def _concatenate_audio(files: list[Path], config) -> Path:
    """Concatenate multiple audio files (e.g., HR1 + HR2) with ffmpeg.

    Args:
        files: Input files, in playback order. Must be non-empty.
        config: Loaded configuration (currently unused).

    Returns:
        Path of the combined file, ``combined_<first stem>.mp3`` in the
        first input's directory.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
            status.
    """
    import subprocess

    first = files[0]
    output = first.parent / f"combined_{first.stem}.mp3"
    concat_file = first.parent / ".concat_list.txt"

    # The concat demuxer resolves relative entries against the directory of
    # the list file, not the current working directory, so write absolute
    # paths. Inside a single-quoted entry a literal quote is spelled '\''.
    entries = []
    for audio_file in files:
        escaped = str(audio_file.resolve()).replace("'", "'\\''")
        entries.append(f"file '{escaped}'\n")

    try:
        with open(concat_file, "w") as f:
            f.writelines(entries)

        subprocess.run(
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
             "-i", str(concat_file), "-c", "copy", str(output)],
            capture_output=True, check=True,
        )
    finally:
        # Remove the temporary list even when ffmpeg fails.
        concat_file.unlink(missing_ok=True)

    console.print(f"[dim]Concatenated {len(files)} files -> {output.name}[/dim]")
    return output
|
|
|
|
|
|
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|