Add batch transcription scripts and 8 episode transcripts

Created Mac M4 batch transcription using mlx-whisper with Apple Silicon GPU acceleration.
Transcribed 8 remaining episodes (17,555 total segments).

Scripts:
- batch_transcribe_mac.py: Full batch processor with mlx-whisper
- test_mac_transcribe.py: Quick test script for faster-whisper

Transcripts (JSON, SRT, TXT formats):
- 2011-06-04-hr1: 1,503 segments
- 2011-09-10-hr1: 1,378 segments
- 2014-s6e05: 1,340 segments
- 2015-s7e30: 1,053 segments
- 2016-s8e42: 2,205 segments
- 2017-s9e26: 2,366 segments
- 2018-s10e17: 4,683 segments
- 2018-s10e21: 2,493 segments

All 9 episodes now transcribed (8 on Mac + 1 from Linux). Ready for Stages 3-6 on Linux PC.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-21 23:12:06 -07:00
parent 89a862c993
commit a3a47f2d5e
26 changed files with 801976 additions and 0 deletions
--- a/projects/radio-show/audio-processor/test_mac_transcribe.py
+++ b/projects/radio-show/audio-processor/test_mac_transcribe.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+"""
+Quick test script to verify faster-whisper works on Mac M4.
+Transcribes first 60 seconds of an episode.
+"""
+
+import time
+from pathlib import Path
+
+from faster_whisper import WhisperModel
+from pydub import AudioSegment
+
+# Config
+# Input episode. The path is relative, so it resolves against the current
+# working directory at run time.
+EPISODE = Path("training-data/episodes/2011-06-04-hr1.mp3")
+# Length, in milliseconds, of the leading slice of the episode to transcribe.
+TEST_DURATION_MS = 60_000  # 60 seconds
+# Model size name handed to WhisperModel.
+MODEL_SIZE = "base"  # Start small for testing, switch to large-v3 for production
+
+def main():
+    """Smoke-test faster-whisper by transcribing the start of ``EPISODE``.
+
+    Loads the ``MODEL_SIZE`` model on CPU, cuts the first ``TEST_DURATION_MS``
+    milliseconds of the episode into a scratch WAV file, transcribes it, and
+    prints load time, transcription speed, and the resulting segments.
+
+    Raises:
+        SystemExit: If ``EPISODE`` does not exist (the path is relative to the
+            current working directory).
+    """
+    import tempfile  # local import: only needed for the scratch WAV below
+
+    # Fail fast, before paying for the model load, if the input is missing.
+    if not EPISODE.is_file():
+        raise SystemExit(f"[ERROR] Episode not found: {EPISODE}")
+
+    print(f"[INFO] Loading {MODEL_SIZE} model on CPU...")
+    start = time.perf_counter()  # monotonic clock, safe for measuring intervals
+
+    # Use CPU - faster-whisper/ctranslate2 doesn't support MPS
+    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
+    print(f"[OK] Model loaded in {time.perf_counter() - start:.1f}s")
+
+    # Extract the leading slice of the episode
+    print(f"[INFO] Extracting first {TEST_DURATION_MS // 1000}s from {EPISODE.name}...")
+    test_clip = AudioSegment.from_mp3(str(EPISODE))[:TEST_DURATION_MS]
+    # len() of a pydub segment is its duration in milliseconds. The episode may
+    # be shorter than TEST_DURATION_MS, so measure speed against the real length.
+    clip_seconds = len(test_clip) / 1000
+
+    # A private temp directory gives a collision-free path and is removed even
+    # when export or transcription raises.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        temp_file = Path(tmp_dir) / "test_clip.wav"
+        test_clip.export(str(temp_file), format="wav")
+        print(f"[OK] Test clip exported ({temp_file.stat().st_size // 1024}KB)")
+
+        # Transcribe
+        print("[INFO] Transcribing...")
+        start = time.perf_counter()
+
+        segments, _info = model.transcribe(
+            str(temp_file),
+            language="en",
+            beam_size=5,
+            vad_filter=True,
+        )
+
+        # `segments` is a lazy generator: decoding runs while it is iterated,
+        # so the timer may only stop after it has been fully consumed.
+        results = [
+            {"start": seg.start, "end": seg.end, "text": seg.text.strip()}
+            for seg in segments
+        ]
+
+        elapsed = time.perf_counter() - start
+
+    print(f"[OK] Transcription complete in {elapsed:.1f}s")
+    print(f"[INFO] Speed: {clip_seconds / elapsed:.2f}x realtime")
+    print(f"[INFO] Segments: {len(results)}")
+    print()
+    print("=" * 60)
+    print("TRANSCRIPT:")
+    print("=" * 60)
+
+    for seg in results:
+        print(f"[{seg['start']:.1f}s - {seg['end']:.1f}s] {seg['text']}")
+
+    print()
+    print("[SUCCESS] Test complete!")
+
+# Run the test only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()