Add batch transcription scripts and 8 episode transcripts

Added a batch transcription script for the Mac M4 that uses mlx-whisper with
Apple Silicon GPU acceleration, and transcribed the 8 remaining episodes
(17,555 total segments).

Scripts:
- batch_transcribe_mac.py: Full batch processor with mlx-whisper
- test_mac_transcribe.py: Quick test script for faster-whisper

Transcripts (JSON, SRT, TXT formats):
- 2011-06-04-hr1: 1,503 segments
- 2011-09-10-hr1: 1,378 segments
- 2014-s6e05: 1,340 segments
- 2015-s7e30: 1,053 segments
- 2016-s8e42: 2,205 segments
- 2017-s9e26: 2,366 segments
- 2018-s10e17: 4,683 segments
- 2018-s10e21: 2,493 segments

All 9 episodes now transcribed (8 on Mac + 1 from Linux).
Ready for Stages 3-6 on Linux PC.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-03-21 23:12:06 -07:00
parent 89a862c993
commit a3a47f2d5e
26 changed files with 801976 additions and 0 deletions

View File

@@ -0,0 +1,171 @@
#!/usr/bin/env python3
"""
Batch transcription script for Mac M4 using mlx-whisper.
Transcribes all pending episodes using Apple Silicon GPU acceleration.
"""
import json
import time
from pathlib import Path
from datetime import timedelta
import mlx_whisper
from pydub import AudioSegment
# Configuration
# Both directories are relative paths, so they resolve against the current
# working directory when the script runs.
# Directory globbed for "*.mp3" episode files by main().
EPISODES_DIR = Path("training-data/episodes")
# Root output directory; main() writes each episode into a subdirectory named
# after the episode file's stem.
TRANSCRIPTS_DIR = Path("training-data/transcripts")
# Passed to mlx_whisper.transcribe() as path_or_hf_repo.
MODEL = "mlx-community/whisper-large-v3-mlx"
# Episodes to transcribe (skip already completed ones)
# Entries are compared against each episode file's stem (name without ".mp3").
COMPLETED = {"2010-10-02-hr1"} # Already transcribed on Linux
def format_timestamp(seconds: float) -> str:
    """Format seconds as an SRT timestamp (HH:MM:SS,mmm).

    Args:
        seconds: Offset from the start of the audio, in seconds. Negative
            values are clamped to zero because SRT has no notation for them.

    Returns:
        The timestamp string. Hours are zero-padded to at least two digits
        and keep growing past 99 instead of wrapping, so offsets of 24 hours
        or more are rendered correctly. Sub-millisecond precision is
        truncated, not rounded.
    """
    td = timedelta(seconds=max(seconds, 0))
    # Work from the total number of whole milliseconds. `td.seconds` alone
    # silently drops whole days, which made 25h render as 01:00:00.
    total_ms = td // timedelta(milliseconds=1)
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, ms = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
def transcribe_episode(episode_path: Path) -> tuple[dict, float]:
    """Transcribe a single episode and return the result with its duration.

    Args:
        episode_path: Path to the episode's MP3 file.

    Returns:
        A ``(result, duration_seconds)`` tuple: the dictionary returned by
        ``mlx_whisper.transcribe`` and the audio length in seconds.
    """
    print(f"[INFO] Transcribing {episode_path.name}...")
    start_time = time.time()

    # Get audio duration. len() of the decoded audio is divided by 1000 to
    # get seconds. Note that the decode time is part of the measured elapsed
    # time below.
    audio = AudioSegment.from_mp3(str(episode_path))
    duration_seconds = len(audio) / 1000.0

    # Transcribe with word timestamps
    result = mlx_whisper.transcribe(
        str(episode_path),
        path_or_hf_repo=MODEL,
        language="en",
        word_timestamps=True,
    )

    elapsed = time.time() - start_time
    # Guard the ratio so a zero elapsed reading (coarse clock) cannot raise
    # ZeroDivisionError after the expensive work has already succeeded.
    speed = duration_seconds / elapsed if elapsed > 0 else float("inf")
    print(f"[OK] Done in {elapsed:.1f}s ({speed:.1f}x realtime)")
    print(f"[INFO] Segments: {len(result.get('segments', []))}")
    return result, duration_seconds
def save_transcript(result: dict, duration: float, output_dir: Path) -> None:
    """Save a transcript in JSON, SRT, and TXT formats.

    Writes ``transcript.json``, ``transcript.srt`` and ``transcript.txt``
    into ``output_dir``, creating the directory (and parents) if needed.
    All files are written as UTF-8 so the output does not depend on the
    platform's default locale encoding.

    Args:
        result: Transcription result; its ``"segments"`` list and optional
            ``"language"`` key are read. Missing keys fall back to defaults.
        duration: Audio duration, stored verbatim under ``"duration"``.
        output_dir: Directory that receives the three transcript files.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    segments = result.get("segments", [])

    # Build output structure matching existing format.
    # "language_probability" is hard-coded to 1.0; nothing in `result` is
    # consulted for it.
    output = {
        "language": result.get("language", "en"),
        "language_probability": 1.0,
        "duration": duration,
        "segments": [
            {
                "id": index,
                "text": seg.get("text", "").strip(),
                "start": seg.get("start", 0),
                "end": seg.get("end", 0),
                # Word-level data is copied when present, otherwise empty.
                "words": [
                    {
                        "word": word_info.get("word", ""),
                        "start": word_info.get("start", 0),
                        "end": word_info.get("end", 0),
                        "probability": word_info.get("probability", 0),
                    }
                    for word_info in seg.get("words", [])
                ],
            }
            for index, seg in enumerate(segments)
        ],
    }

    # Save JSON
    json_path = output_dir / "transcript.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
    print(f"[OK] Saved {json_path}")

    # Save SRT (cue numbers are 1-based, unlike the 0-based JSON ids)
    srt_path = output_dir / "transcript.srt"
    with open(srt_path, "w", encoding="utf-8") as f:
        for cue_number, seg in enumerate(segments, 1):
            start_ts = format_timestamp(seg.get("start", 0))
            end_ts = format_timestamp(seg.get("end", 0))
            text = seg.get("text", "").strip()
            f.write(f"{cue_number}\n{start_ts} --> {end_ts}\n{text}\n\n")
    print(f"[OK] Saved {srt_path}")

    # Save TXT: one stripped segment per line
    txt_path = output_dir / "transcript.txt"
    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(seg.get("text", "").strip() + "\n" for seg in segments)
    print(f"[OK] Saved {txt_path}")
def main():
    """Transcribe every pending episode and print a summary.

    Globs ``EPISODES_DIR`` for ``*.mp3`` files, skips those whose stem is in
    ``COMPLETED``, and for each remaining episode runs
    ``transcribe_episode`` followed by ``save_transcript``. A failure in one
    episode is reported and recorded but does not stop the batch.
    """
    print("=" * 60)
    print("Radio Show Batch Transcription - Mac M4 + mlx-whisper")
    print("=" * 60)
    print()

    # Find episodes to process
    episodes = sorted(EPISODES_DIR.glob("*.mp3"))
    pending = [ep for ep in episodes if ep.stem not in COMPLETED]
    print(f"[INFO] Found {len(episodes)} episodes, {len(pending)} pending")
    print(f"[INFO] Model: {MODEL}")
    print()

    if not episodes:
        # An empty or missing directory is not the same as "all done";
        # most likely the script was started from the wrong directory.
        print(f"[WARNING] No .mp3 files found in {EPISODES_DIR}")
        return
    if not pending:
        print("[OK] All episodes already transcribed!")
        return

    total_start = time.time()
    completed = 0
    failed = []
    for i, episode in enumerate(pending, 1):
        print(f"\n[{i}/{len(pending)}] {episode.name}")
        print("-" * 40)
        # Per-episode boundary: deliberately broad so one bad file cannot
        # abort a long-running batch; the failure is reported in the summary.
        try:
            result, duration = transcribe_episode(episode)
            output_dir = TRANSCRIPTS_DIR / episode.stem
            save_transcript(result, duration, output_dir)
            completed += 1
        except Exception as e:
            print(f"[ERROR] Failed: {e}")
            failed.append(episode.name)

    # Summary
    total_elapsed = time.time() - total_start
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"[OK] Completed: {completed}/{len(pending)}")
    print(f"[INFO] Total time: {total_elapsed/60:.1f} minutes")
    print_failed = bool(failed)
    if print_failed:
        print(f"[WARNING] Failed: {', '.join(failed)}")
    print()
    # Only claim success when every pending episode was actually written.
    if print_failed:
        print(f"[WARNING] Batch transcription finished with {len(failed)} failure(s)")
    else:
        print("[SUCCESS] Batch transcription complete!")
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""
Quick test script to verify faster-whisper works on Mac M4.
Transcribes first 60 seconds of an episode.
"""
import time
from pathlib import Path
from faster_whisper import WhisperModel
from pydub import AudioSegment
# Config
# Episode used for the smoke test; a relative path, so it resolves against
# the current working directory.
EPISODE = Path("training-data/episodes/2011-06-04-hr1.mp3")
# Length of the leading clip that main() slices off and transcribes.
TEST_DURATION_MS = 60_000 # 60 seconds
# Passed to WhisperModel() as the model name.
MODEL_SIZE = "base" # Start small for testing, switch to large-v3 for production
def main():
    """Transcribe the first part of ``EPISODE`` with faster-whisper on CPU.

    Loads the ``MODEL_SIZE`` model, exports the first ``TEST_DURATION_MS``
    milliseconds of the episode to a temporary WAV file, transcribes it, and
    prints timing information followed by the transcript. The temporary file
    is removed even if export or transcription fails.
    """
    # Local import: only this function needs it.
    import tempfile

    print(f"[INFO] Loading {MODEL_SIZE} model on CPU...")
    start = time.time()
    # Use CPU - faster-whisper/ctranslate2 doesn't support MPS
    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
    print(f"[OK] Model loaded in {time.time() - start:.1f}s")

    # Extract first 60 seconds
    print(f"[INFO] Extracting first {TEST_DURATION_MS // 1000}s from {EPISODE.name}...")
    audio = AudioSegment.from_mp3(str(EPISODE))
    test_clip = audio[:TEST_DURATION_MS]
    # The clip can be shorter than requested when the episode itself is
    # short, so derive the speed figure from the clip's real length.
    clip_seconds = len(test_clip) / 1000

    # A private temporary directory avoids clobbering a fixed /tmp path and
    # guarantees cleanup when anything below raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_file = Path(tmp_dir) / "test_clip.wav"
        test_clip.export(str(temp_file), format="wav")
        print(f"[OK] Test clip exported ({temp_file.stat().st_size // 1024}KB)")

        # Transcribe
        print("[INFO] Transcribing...")
        start = time.time()
        segments, info = model.transcribe(
            str(temp_file),
            language="en",
            beam_size=5,
            vad_filter=True,
        )
        # Collect segments before stopping the timer and before the
        # temporary file goes away.
        # NOTE(review): faster-whisper appears to yield segments lazily, so
        # consuming them here is what the timing relies on - confirm.
        results = [
            {"start": seg.start, "end": seg.end, "text": seg.text.strip()}
            for seg in segments
        ]
        elapsed = time.time() - start

    speed = clip_seconds / elapsed if elapsed > 0 else float("inf")
    print(f"[OK] Transcription complete in {elapsed:.1f}s")
    print(f"[INFO] Speed: {speed:.2f}x realtime")
    print(f"[INFO] Segments: {len(results)}")
    print()
    print("=" * 60)
    print("TRANSCRIPT:")
    print("=" * 60)
    for seg in results:
        print(f"[{seg['start']:.1f}s - {seg['end']:.1f}s] {seg['text']}")
    print()
    print("[SUCCESS] Test complete!")
# Run the smoke test only when this file is executed as a script.
if __name__ == "__main__":
    main()