diff --git a/Bash/tools/transcribe_fw.py b/Bash/tools/transcribe_fw.py index eb4ea0b..48eff05 100644 --- a/Bash/tools/transcribe_fw.py +++ b/Bash/tools/transcribe_fw.py @@ -7,8 +7,6 @@ import sys import time from datetime import timedelta from typing import List, Optional - - def format_timestamp(seconds: float) -> str: td = timedelta(seconds=seconds) # Ensure SRT format HH:MM:SS,mmm @@ -162,6 +160,41 @@ def _kmeans_cosine(embs, k: int, iters: int = 50, seed: int = 0): return labels +def _ffmpeg_transcode_to_wav16_mono(src_path: str) -> Optional[str]: + """If ffmpeg is available, transcode input to a temporary 16k mono WAV and return its path.""" + if not shutil.which("ffmpeg"): + return None + import tempfile + tmp = tempfile.NamedTemporaryFile(prefix="fw_diar_", suffix=".wav", delete=False) + tmp_path = tmp.name + tmp.close() + # Run ffmpeg quietly + cmd = [ + "ffmpeg", + "-y", + "-v", + "error", + "-i", + src_path, + "-ac", + "1", + "-ar", + "16000", + "-f", + "wav", + tmp_path, + ] + try: + subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + return tmp_path + except Exception: + try: + os.unlink(tmp_path) + except Exception: + pass + return None + + def diarize_segments(audio_path: str, segments, num_speakers: int = 2) -> Optional[list]: """Simple diarization: compute speaker embeddings per segment and cluster with KMeans. Returns a list of speaker labels aligned with segments, or None on failure. @@ -169,18 +202,33 @@ def diarize_segments(audio_path: str, segments, num_speakers: int = 2) -> Option try: import numpy as np import soundfile as sf - from speechbrain.pretrained import EncoderClassifier + # Use non-deprecated import path + from speechbrain.inference import EncoderClassifier import torch except Exception as e: print(f"[WARN] Diarization dependencies missing ({e}); skipping speaker labels.", file=sys.stderr) return None # Load audio + temp_to_cleanup: Optional[str] = None try: wav, sr = sf.read(audio_path, dtype="float32", always_2d=False) except Exception as e: - print(f"[WARN] Could not read audio for diarization: {e}", file=sys.stderr) - return None + # Try ffmpeg transcoding fallback + alt = _ffmpeg_transcode_to_wav16_mono(audio_path) + if alt is None: + print(f"[WARN] Could not read audio for diarization and no ffmpeg fallback available: {e}", file=sys.stderr) + return None + try: + wav, sr = sf.read(alt, dtype="float32", always_2d=False) + temp_to_cleanup = alt + except Exception as e2: + print(f"[WARN] Could not read transcoded audio for diarization: {e2}", file=sys.stderr) + try: + os.unlink(alt) + except Exception: + pass + return None if wav.ndim == 2: # mixdown wav = wav.mean(axis=1) # Resample to 16k for ECAPA @@ -191,10 +239,15 @@ def diarize_segments(audio_path: str, segments, num_speakers: int = 2) -> Option classifier = EncoderClassifier.from_hparams( source="speechbrain/spkrec-ecapa-voxceleb", run_opts={"device": "cpu"}, - savedir=os.path.join(os.path.expanduser("~"), ".cache", "speechbrain_ecapa") + savedir=os.path.join(os.path.expanduser("~"), ".cache", "speechbrain_ecapa"), ) except Exception as e: print(f"[WARN] Could not load speaker embedding model: {e}", file=sys.stderr) + if temp_to_cleanup: + try: + os.unlink(temp_to_cleanup) + except Exception: + pass return None embs = [] @@ -222,6 +275,11 @@ def diarize_segments(audio_path: str, segments, num_speakers: int = 2) -> Option return None # Cluster labels = _kmeans_cosine(embs, k=max(1, int(num_speakers))) + if temp_to_cleanup: + try: + os.unlink(temp_to_cleanup) + except Exception: + pass return labels.tolist()