radio show: co-host voice profile, Q&A extraction fixes, archive index
- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike)
- diarizer.py: add CO-HOST speaker label for cohost-role profiles
- voice_profiler.py: emit "Cohost: <name>" label for cohost role
- qa_extractor.py: overlap resolution at load time (midpoint boundary split), 4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns), _preceded_by_caller_intro() helper, _PHONE_GREETING pattern, 751-1041 + "we'll get your problem solved" promo signatures
- benchmark.py: use src.transcriber.transcribe with batch_size=16
- add index_test_episodes.py and build_cohost_profile.py scripts
- add .gitignore (exclude episodes, transcripts, *.db, .venv)
- session log: 2026-04-27-qa-extraction-cohost-indexing.md

Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs.
archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -113,61 +113,60 @@ def _format_srt_time(seconds: float) -> str:
|
||||
|
||||
|
||||
def transcribe(audio_path: str | Path, model_size: str = "large-v3",
|
||||
language: str = "en", device: str = "cuda") -> Transcript:
|
||||
"""Transcribe an audio file using faster-whisper."""
|
||||
from faster_whisper import WhisperModel
|
||||
language: str = "en", device: str = "cuda",
|
||||
batch_size: int = 16) -> Transcript:
|
||||
"""Transcribe an audio file using faster-whisper.
|
||||
|
||||
Uses BatchedInferencePipeline + int8_float16 + VAD for archive/batch work.
|
||||
Word timestamps are skipped in batch mode (not needed for segment-level search).
|
||||
Pass batch_size=0 to fall back to sequential WhisperModel with word timestamps.
|
||||
"""
|
||||
from faster_whisper import WhisperModel, BatchedInferencePipeline
|
||||
|
||||
audio_path = Path(audio_path)
|
||||
use_batched = batch_size > 0
|
||||
|
||||
console.print(f"[bold]Transcribing:[/bold] {audio_path.name}")
|
||||
console.print(f"[dim]Model: {model_size}, Device: {device}[/dim]")
|
||||
|
||||
model = WhisperModel(model_size, device=device, compute_type="float16")
|
||||
|
||||
segments_raw, info = model.transcribe(
|
||||
str(audio_path),
|
||||
language=language,
|
||||
word_timestamps=True,
|
||||
vad_filter=True,
|
||||
vad_parameters=dict(
|
||||
min_silence_duration_ms=500,
|
||||
speech_pad_ms=200,
|
||||
),
|
||||
console.print(
|
||||
f"[dim]Model: {model_size} | "
|
||||
f"{'batched x' + str(batch_size) + ' int8_float16' if use_batched else 'sequential float16'} | "
|
||||
f"Device: {device}[/dim]"
|
||||
)
|
||||
|
||||
console.print(f"[dim]Detected language: {info.language} "
|
||||
f"(probability: {info.language_probability:.2f})[/dim]")
|
||||
console.print(f"[dim]Duration: {info.duration:.1f}s "
|
||||
f"({info.duration / 60:.1f} min)[/dim]")
|
||||
if use_batched:
|
||||
base_model = WhisperModel(model_size, device=device, compute_type="int8_float16")
|
||||
model = BatchedInferencePipeline(model=base_model)
|
||||
segments_raw, info = model.transcribe(
|
||||
str(audio_path),
|
||||
language=language,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
else:
|
||||
model = WhisperModel(model_size, device=device, compute_type="float16")
|
||||
segments_raw, info = model.transcribe(
|
||||
str(audio_path),
|
||||
language=language,
|
||||
word_timestamps=True,
|
||||
vad_filter=True,
|
||||
vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
|
||||
)
|
||||
|
||||
console.print(f"[dim]Duration: {info.duration:.1f}s ({info.duration / 60:.1f} min)[/dim]")
|
||||
|
||||
segments = []
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TextColumn("{task.completed} segments"),
|
||||
TimeElapsedColumn(),
|
||||
console=console,
|
||||
) as progress:
|
||||
task = progress.add_task("Transcribing...", total=None)
|
||||
|
||||
for i, seg in enumerate(segments_raw):
|
||||
for i, seg in enumerate(segments_raw):
|
||||
words = []
|
||||
if not use_batched:
|
||||
words = [
|
||||
TranscriptWord(
|
||||
word=w.word,
|
||||
start=w.start,
|
||||
end=w.end,
|
||||
probability=w.probability,
|
||||
)
|
||||
TranscriptWord(word=w.word, start=w.start,
|
||||
end=w.end, probability=w.probability)
|
||||
for w in (seg.words or [])
|
||||
]
|
||||
segments.append(TranscriptSegment(
|
||||
id=i,
|
||||
text=seg.text,
|
||||
start=seg.start,
|
||||
end=seg.end,
|
||||
words=words,
|
||||
))
|
||||
progress.update(task, completed=i + 1)
|
||||
segments.append(TranscriptSegment(
|
||||
id=i, text=seg.text, start=seg.start, end=seg.end, words=words,
|
||||
))
|
||||
if i % 50 == 0:
|
||||
console.print(f"[dim] {i} segments... ({seg.end:.0f}s)[/dim]")
|
||||
|
||||
console.print(f"[green]Transcription complete: {len(segments)} segments[/green]")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user