feat: automatically change audio to support diarization

This commit is contained in:
Krzysztof kuhy Rudnicki 2025-10-12 14:51:41 +02:00
parent 60517b4584
commit 06c8121c79

View File

@ -7,8 +7,6 @@ import sys
import time
from datetime import timedelta
from typing import List, Optional
def format_timestamp(seconds: float) -> str:
td = timedelta(seconds=seconds)
# Ensure SRT format HH:MM:SS,mmm
@ -162,6 +160,41 @@ def _kmeans_cosine(embs, k: int, iters: int = 50, seed: int = 0):
return labels
def _ffmpeg_transcode_to_wav16_mono(src_path: str) -> Optional[str]:
"""If ffmpeg is available, transcode input to a temporary 16k mono WAV and return its path."""
if not shutil.which("ffmpeg"):
return None
import tempfile
tmp = tempfile.NamedTemporaryFile(prefix="fw_diar_", suffix=".wav", delete=False)
tmp_path = tmp.name
tmp.close()
# Run ffmpeg quietly
cmd = [
"ffmpeg",
"-y",
"-v",
"error",
"-i",
src_path,
"-ac",
"1",
"-ar",
"16000",
"-f",
"wav",
tmp_path,
]
try:
subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
return tmp_path
except Exception:
try:
os.unlink(tmp_path)
except Exception:
pass
return None
def diarize_segments(audio_path: str, segments, num_speakers: int = 2) -> Optional[list]:
"""Simple diarization: compute speaker embeddings per segment and cluster with KMeans.
Returns a list of speaker labels aligned with segments, or None on failure.
@ -169,18 +202,33 @@ def diarize_segments(audio_path: str, segments, num_speakers: int = 2) -> Option
try:
import numpy as np
import soundfile as sf
from speechbrain.pretrained import EncoderClassifier
# Use non-deprecated import path
from speechbrain.inference import EncoderClassifier
import torch
except Exception as e:
print(f"[WARN] Diarization dependencies missing ({e}); skipping speaker labels.", file=sys.stderr)
return None
# Load audio
temp_to_cleanup: Optional[str] = None
try:
wav, sr = sf.read(audio_path, dtype="float32", always_2d=False)
except Exception as e:
print(f"[WARN] Could not read audio for diarization: {e}", file=sys.stderr)
return None
# Try ffmpeg transcoding fallback
alt = _ffmpeg_transcode_to_wav16_mono(audio_path)
if alt is None:
print(f"[WARN] Could not read audio for diarization and no ffmpeg fallback available: {e}", file=sys.stderr)
return None
try:
wav, sr = sf.read(alt, dtype="float32", always_2d=False)
temp_to_cleanup = alt
except Exception as e2:
print(f"[WARN] Could not read transcoded audio for diarization: {e2}", file=sys.stderr)
try:
os.unlink(alt)
except Exception:
pass
return None
if wav.ndim == 2: # mixdown
wav = wav.mean(axis=1)
# Resample to 16k for ECAPA
@ -191,10 +239,15 @@ def diarize_segments(audio_path: str, segments, num_speakers: int = 2) -> Option
classifier = EncoderClassifier.from_hparams(
source="speechbrain/spkrec-ecapa-voxceleb",
run_opts={"device": "cpu"},
savedir=os.path.join(os.path.expanduser("~"), ".cache", "speechbrain_ecapa")
savedir=os.path.join(os.path.expanduser("~"), ".cache", "speechbrain_ecapa"),
)
except Exception as e:
print(f"[WARN] Could not load speaker embedding model: {e}", file=sys.stderr)
if temp_to_cleanup:
try:
os.unlink(temp_to_cleanup)
except Exception:
pass
return None
embs = []
@ -222,6 +275,11 @@ def diarize_segments(audio_path: str, segments, num_speakers: int = 2) -> Option
return None
# Cluster
labels = _kmeans_cosine(embs, k=max(1, int(num_speakers)))
if temp_to_cleanup:
try:
os.unlink(temp_to_cleanup)
except Exception:
pass
return labels.tolist()