Add batch transcription scripts and 8 episode transcripts
Created Mac M4 batch transcription using mlx-whisper with Apple Silicon GPU acceleration. Transcribed 8 remaining episodes (17,555 total segments). Scripts: - batch_transcribe_mac.py: Full batch processor with mlx-whisper - test_mac_transcribe.py: Quick test script for faster-whisper Transcripts (JSON, SRT, TXT formats): - 2011-06-04-hr1: 1,503 segments - 2011-09-10-hr1: 1,378 segments - 2014-s6e05: 1,340 segments - 2015-s7e30: 1,053 segments - 2016-s8e42: 2,205 segments - 2017-s9e26: 2,366 segments - 2018-s10e17: 4,683 segments - 2018-s10e21: 2,493 segments All 9 episodes now transcribed (8 on Mac + 1 from Linux). Ready for Stages 3-6 on Linux PC. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
171
projects/radio-show/audio-processor/batch_transcribe_mac.py
Normal file
171
projects/radio-show/audio-processor/batch_transcribe_mac.py
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Batch transcription script for Mac M4 using mlx-whisper.
|
||||||
|
Transcribes all pending episodes using Apple Silicon GPU acceleration.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
|
import mlx_whisper
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
# Directory that main() scans for *.mp3 episode files (relative to the working directory).
EPISODES_DIR = Path("training-data/episodes")
|
||||||
|
# Root output directory; main() writes each episode's files into a subdirectory named after the episode stem.
TRANSCRIPTS_DIR = Path("training-data/transcripts")
|
||||||
|
# Model identifier handed to mlx_whisper.transcribe() as path_or_hf_repo.
MODEL = "mlx-community/whisper-large-v3-mlx"
|
||||||
|
|
||||||
|
# Episode stems to skip: these are already transcribed and are excluded from the batch
|
||||||
|
# Compared against each episode file's stem (file name without ".mp3") in main().
COMPLETED = {"2010-10-02-hr1"} # Already transcribed on Linux
|
||||||
|
|
||||||
|
|
||||||
|
def format_timestamp(seconds: float) -> str:
    """Format an offset in seconds as an SRT timestamp (HH:MM:SS,mmm).

    Args:
        seconds: Offset from the start of the audio, in seconds. Negative
            values are clamped to zero.

    Returns:
        The timestamp as ``HH:MM:SS,mmm``. The hours field is not wrapped at
        24, so offsets of a day or more keep counting (e.g. ``25:00:00,000``).
        Milliseconds are truncated, not rounded.
    """
    td = timedelta(seconds=max(seconds, 0))
    # timedelta.seconds only holds the sub-day remainder; add the whole days
    # back so long offsets do not wrap around to 00:xx:xx.
    whole_seconds = td.days * 86400 + td.seconds
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    ms = td.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
|
||||||
|
|
||||||
|
|
||||||
|
def transcribe_episode(episode_path: Path) -> tuple[dict, float]:
    """Transcribe a single episode with mlx-whisper.

    Args:
        episode_path: Path to the episode's MP3 file.

    Returns:
        A ``(result, duration_seconds)`` tuple: the value returned by
        ``mlx_whisper.transcribe`` and the audio length in seconds as
        measured with pydub.
    """
    print(f"[INFO] Transcribing {episode_path.name}...")
    start_time = time.time()

    # Decode the file with pydub only to learn its length; the value from
    # len() is divided by 1000 to get seconds.
    audio = AudioSegment.from_mp3(str(episode_path))
    duration_seconds = len(audio) / 1000.0
    # The decoded audio is not used again; drop the reference before the
    # transcription call so it is not kept alive for the whole run.
    del audio

    # Transcribe with word timestamps
    result = mlx_whisper.transcribe(
        str(episode_path),
        path_or_hf_repo=MODEL,
        language="en",
        word_timestamps=True,
    )

    # Elapsed time covers both the pydub decode and the transcription.
    elapsed = time.time() - start_time
    speed = duration_seconds / elapsed

    print(f"[OK] Done in {elapsed:.1f}s ({speed:.1f}x realtime)")
    print(f"[INFO] Segments: {len(result.get('segments', []))}")

    return result, duration_seconds
|
||||||
|
|
||||||
|
|
||||||
|
def save_transcript(result: dict, duration: float, output_dir: Path) -> None:
    """Save a transcript as JSON, SRT and TXT files inside *output_dir*.

    Writes ``transcript.json``, ``transcript.srt`` and ``transcript.txt``,
    creating *output_dir* (and any missing parents) first. All files are
    written as UTF-8 so the output does not depend on the platform locale.

    Args:
        result: Transcription result; its ``"segments"`` list and
            ``"language"`` value are read, and both may be absent.
        duration: Audio length in seconds, stored in the JSON output.
        output_dir: Directory that receives the three transcript files.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    segments = result.get("segments", [])

    # Build output structure matching existing format
    output = {
        "language": result.get("language", "en"),
        # Constant placeholder: this value is not read from ``result``.
        "language_probability": 1.0,
        "duration": duration,
        "segments": [
            {
                "id": index,
                "text": seg.get("text", "").strip(),
                "start": seg.get("start", 0),
                "end": seg.get("end", 0),
                # Word-level data is optional; a segment without it gets [].
                "words": [
                    {
                        "word": word_info.get("word", ""),
                        "start": word_info.get("start", 0),
                        "end": word_info.get("end", 0),
                        "probability": word_info.get("probability", 0),
                    }
                    for word_info in seg.get("words", [])
                ],
            }
            for index, seg in enumerate(segments)
        ],
    }

    # Save JSON
    json_path = output_dir / "transcript.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
    print(f"[OK] Saved {json_path}")

    # Save SRT (cue numbers start at 1)
    srt_path = output_dir / "transcript.srt"
    with open(srt_path, "w", encoding="utf-8") as f:
        for cue_number, seg in enumerate(segments, 1):
            start_ts = format_timestamp(seg.get("start", 0))
            end_ts = format_timestamp(seg.get("end", 0))
            text = seg.get("text", "").strip()
            f.write(f"{cue_number}\n{start_ts} --> {end_ts}\n{text}\n\n")
    print(f"[OK] Saved {srt_path}")

    # Save TXT (one segment per line)
    txt_path = output_dir / "transcript.txt"
    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(seg.get("text", "").strip() + "\n" for seg in segments)
    print(f"[OK] Saved {txt_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Transcribe every pending episode and print a summary.

    Scans EPISODES_DIR for ``*.mp3`` files, skips those whose stem is in
    COMPLETED, and writes each remaining episode's transcript files to
    ``TRANSCRIPTS_DIR / <episode stem>``. A failure on one episode is
    reported and recorded, and the batch carries on with the next one.
    """
    print("=" * 60)
    print("Radio Show Batch Transcription - Mac M4 + mlx-whisper")
    print("=" * 60)
    print()

    # Find episodes to process
    episodes = sorted(EPISODES_DIR.glob("*.mp3"))
    pending = [ep for ep in episodes if ep.stem not in COMPLETED]

    print(f"[INFO] Found {len(episodes)} episodes, {len(pending)} pending")
    print(f"[INFO] Model: {MODEL}")
    print()

    if not episodes:
        # Nothing matched at all (wrong working directory or empty folder);
        # do not report that as "already transcribed".
        print(f"[WARNING] No .mp3 episodes found in {EPISODES_DIR}")
        return

    if not pending:
        print("[OK] All episodes already transcribed!")
        return

    total_start = time.time()
    completed = 0
    failed = []

    for i, episode in enumerate(pending, 1):
        print(f"\n[{i}/{len(pending)}] {episode.name}")
        print("-" * 40)

        # Deliberately broad: this is the batch boundary, and one bad file
        # must not abort the remaining episodes.
        try:
            result, duration = transcribe_episode(episode)
            output_dir = TRANSCRIPTS_DIR / episode.stem
            save_transcript(result, duration, output_dir)
            completed += 1
        except Exception as e:
            print(f"[ERROR] Failed: {e}")
            failed.append(episode.name)

    # Summary
    total_elapsed = time.time() - total_start
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"[OK] Completed: {completed}/{len(pending)}")
    print(f"[INFO] Total time: {total_elapsed/60:.1f} minutes")

    if failed:
        print(f"[WARNING] Failed: {', '.join(failed)}")

    print()
    # Only claim success when every pending episode was transcribed.
    if failed:
        print("[DONE] Batch transcription finished with errors.")
    else:
        print("[SUCCESS] Batch transcription complete!")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
75
projects/radio-show/audio-processor/test_mac_transcribe.py
Normal file
75
projects/radio-show/audio-processor/test_mac_transcribe.py
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Quick test script to verify faster-whisper works on Mac M4.
|
||||||
|
Transcribes first 60 seconds of an episode.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
# Config
|
||||||
|
# MP3 file the test clip is cut from (relative to the working directory).
EPISODE = Path("training-data/episodes/2011-06-04-hr1.mp3")
|
||||||
|
# Length of the leading slice taken from EPISODE, in milliseconds.
TEST_DURATION_MS = 60_000 # 60 seconds
|
||||||
|
# Model name passed to WhisperModel() in main().
MODEL_SIZE = "base" # Start small for testing, switch to large-v3 for production
|
||||||
|
|
||||||
|
def main():
    """Smoke-test faster-whisper on the first TEST_DURATION_MS of EPISODE.

    Loads the MODEL_SIZE model on CPU, cuts the leading clip out of the
    episode with pydub, transcribes it from a temporary WAV file, and prints
    timing information followed by the transcript. The temporary file is
    removed even when export or transcription fails.
    """
    print(f"[INFO] Loading {MODEL_SIZE} model on CPU...")
    start = time.time()

    # Use CPU - faster-whisper/ctranslate2 doesn't support MPS
    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
    print(f"[OK] Model loaded in {time.time() - start:.1f}s")

    # Extract first 60 seconds
    print(f"[INFO] Extracting first {TEST_DURATION_MS // 1000}s from {EPISODE.name}...")
    audio = AudioSegment.from_mp3(str(EPISODE))
    test_clip = audio[:TEST_DURATION_MS]
    # The episode may be shorter than the requested slice, so the speed
    # figure below uses the clip's real length.
    # NOTE(review): assumes pydub's len() is in milliseconds - confirm.
    clip_seconds = len(test_clip) / 1000

    # Export to temp file
    temp_file = Path("/tmp/test_clip.wav")
    try:
        test_clip.export(str(temp_file), format="wav")
        print(f"[OK] Test clip exported ({temp_file.stat().st_size // 1024}KB)")

        # Transcribe
        print("[INFO] Transcribing...")
        start = time.time()

        segments, info = model.transcribe(
            str(temp_file),
            language="en",
            beam_size=5,
            vad_filter=True,
        )

        # Collect segments before stopping the clock.
        # NOTE(review): faster-whisper appears to yield segments lazily, so
        # the decoding work would happen inside this iteration - confirm.
        results = [
            {
                "start": seg.start,
                "end": seg.end,
                "text": seg.text.strip(),
            }
            for seg in segments
        ]

        elapsed = time.time() - start
    finally:
        # Cleanup: always remove the clip, including on failure.
        temp_file.unlink(missing_ok=True)

    print(f"[OK] Transcription complete in {elapsed:.1f}s")
    print(f"[INFO] Speed: {clip_seconds / elapsed:.2f}x realtime")
    print(f"[INFO] Segments: {len(results)}")
    print()
    print("=" * 60)
    print("TRANSCRIPT:")
    print("=" * 60)

    for seg in results:
        print(f"[{seg['start']:.1f}s - {seg['end']:.1f}s] {seg['text']}")

    print()
    print("[SUCCESS] Test complete!")
|
||||||
|
|
||||||
|
# Run the test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user