diff --git a/Bash/.gitignore b/Bash/.gitignore index 4750368..50fbb0c 100644 --- a/Bash/.gitignore +++ b/Bash/.gitignore @@ -5,4 +5,5 @@ *.ogg* *.wav* *.m4a* -main_folder \ No newline at end of file +main_folder +models diff --git a/Bash/.vscode/tasks.json b/Bash/.vscode/tasks.json new file mode 100644 index 0000000..3c4dc0e --- /dev/null +++ b/Bash/.vscode/tasks.json @@ -0,0 +1,21 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "Transcribe tiny online smoke test", + "type": "shell", + "command": "bash", + "args": [ + "/home/kuhy/testsAndMisc/Bash/transcribe.sh", + "--online", + "-m", + "tiny" + ], + "isBackground": false, + "problemMatcher": [ + "$gcc" + ], + "group": "build" + } + ] +} \ No newline at end of file diff --git a/Bash/test_fw.srt b/Bash/test_fw.srt new file mode 100644 index 0000000..b9d1a04 --- /dev/null +++ b/Bash/test_fw.srt @@ -0,0 +1,4 @@ +1 +00:00:00,000 --> 00:00:02,760 +This is a quick test on faster with but run creep shun. + diff --git a/Bash/tools/transcribe_fw.py b/Bash/tools/transcribe_fw.py new file mode 100644 index 0000000..eb4ea0b --- /dev/null +++ b/Bash/tools/transcribe_fw.py @@ -0,0 +1,338 @@ +#!/usr/bin/env python3 +import argparse +import os +import shutil +import subprocess +import sys +import time +from datetime import timedelta +from typing import List, Optional + + +def format_timestamp(seconds: float) -> str: + td = timedelta(seconds=seconds) + # Ensure SRT format HH:MM:SS,mmm + total_seconds = int(td.total_seconds()) + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + secs = total_seconds % 60 + millis = int((seconds - int(seconds)) * 1000) + return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}" + + +def write_srt(segments, srt_path: str): + with open(srt_path, "w", encoding="utf-8") as f: + for i, seg in enumerate(segments, start=1): + start = format_timestamp(seg.start) + end = format_timestamp(seg.end) + text = (seg.text or "").strip() + if not text: + continue + f.write(f"{i}\n{start} --> {end}\n{text}\n\n") + + +def write_txt(segments, txt_path: str): + with open(txt_path, "w", encoding="utf-8") as f: + for seg in segments: + text = (seg.text or "").strip() + if text: + f.write(text + "\n") + + +def write_srt_with_speakers(segments, labels: List[int], path: str): + with open(path, "w", encoding="utf-8") as f: + for i, (seg, lab) in enumerate(zip(segments, labels), start=1): + text = (seg.text or "").strip() + if not text: + continue + spk = f"SPK{lab+1}" + f.write(f"{i}\n{format_timestamp(seg.start)} --> {format_timestamp(seg.end)}\n[{spk}] {text}\n\n") + + +def write_txt_with_speakers(segments, labels: List[int], path: str): + with open(path, "w", encoding="utf-8") as f: + for seg, lab in zip(segments, labels): + text = (seg.text or "").strip() + if text: + spk = f"SPK{lab+1}" + f.write(f"[{spk}] {text}\n") + + +def write_rttm(segments, labels: List[int], path: str, file_id: str = "audio"): + # RTTM format: SPEAKER 1 + with open(path, "w", encoding="utf-8") as f: + for seg, lab in zip(segments, labels): + start = float(getattr(seg, "start", 0.0) or 0.0) + end = float(getattr(seg, "end", start) or start) + dur = max(0.0, end - start) + name = f"SPK{lab+1}" + f.write(f"SPEAKER {file_id} 1 {start:.3f} {dur:.3f} {name} \n") + + +def hhmmss(seconds: float) -> str: + seconds = max(0.0, float(seconds)) + total_seconds = int(seconds) + h = total_seconds // 3600 + m = (total_seconds % 3600) // 60 + s = total_seconds % 60 + return f"{h:02d}:{m:02d}:{s:02d}" + + +def get_media_duration(path: str) -> float | None: + """Try to get media duration in seconds using ffmpeg-python or ffprobe. + Returns None if unavailable. + """ + # Try ffmpeg-python first (if installed) which uses ffprobe under the hood + try: + import ffmpeg # type: ignore + + probe = ffmpeg.probe(path) + fmt = probe.get("format", {}) + if "duration" in fmt: + return float(fmt["duration"]) # type: ignore + except Exception: + pass + + # Fallback: call ffprobe directly if available + if shutil.which("ffprobe"): + try: + out = subprocess.check_output( + [ + "ffprobe", + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "default=noprint_wrappers=1:nokey=1", + path, + ], + stderr=subprocess.DEVNULL, + ) + return float(out.decode().strip()) + except Exception: + return None + return None + + +def _resample_linear(x, src_sr: int, tgt_sr: int): + import numpy as np + if src_sr == tgt_sr: + return x + ratio = float(tgt_sr) / float(src_sr) + n_out = max(1, int(round(x.shape[-1] * ratio))) + xp = np.linspace(0.0, 1.0, num=x.shape[-1], endpoint=False) + xq = np.linspace(0.0, 1.0, num=n_out, endpoint=False) + y = np.interp(xq, xp, x.astype(np.float32)) + return y.astype(np.float32) + + +def _kmeans_cosine(embs, k: int, iters: int = 50, seed: int = 0): + import numpy as np + rng = np.random.default_rng(seed) + X = np.asarray(embs, dtype=np.float32) + if X.ndim != 2 or X.shape[0] == 0: + return np.zeros((0,), dtype=np.int64) + # Normalize + X = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-8) + # Init centroids as random samples + idxs = rng.choice(X.shape[0], size=min(k, X.shape[0]), replace=False) + C = X[idxs] + # If fewer samples than k, pad with random + if C.shape[0] < k: + pad = rng.standard_normal(size=(k - C.shape[0], X.shape[1])).astype(np.float32) + pad /= (np.linalg.norm(pad, axis=1, keepdims=True) + 1e-8) + C = np.concatenate([C, pad], axis=0) + for _ in range(iters): + # Assign by cosine similarity (maximize dot product) + sims = X @ C.T # (n, k) + labels = sims.argmax(axis=1) + newC = np.zeros_like(C) + for j in range(k): + sel = X[labels == j] + if sel.shape[0] == 0: + newC[j] = C[j] + else: + v = sel.mean(axis=0) + v /= (np.linalg.norm(v) + 1e-8) + newC[j] = v + if np.allclose(newC, C, atol=1e-4): + break + C = newC + return labels + + +def diarize_segments(audio_path: str, segments, num_speakers: int = 2) -> Optional[list]: + """Simple diarization: compute speaker embeddings per segment and cluster with KMeans. + Returns a list of speaker labels aligned with segments, or None on failure. + """ + try: + import numpy as np + import soundfile as sf + from speechbrain.pretrained import EncoderClassifier + import torch + except Exception as e: + print(f"[WARN] Diarization dependencies missing ({e}); skipping speaker labels.", file=sys.stderr) + return None + + # Load audio + try: + wav, sr = sf.read(audio_path, dtype="float32", always_2d=False) + except Exception as e: + print(f"[WARN] Could not read audio for diarization: {e}", file=sys.stderr) + return None + if wav.ndim == 2: # mixdown + wav = wav.mean(axis=1) + # Resample to 16k for ECAPA + wav16 = _resample_linear(wav, sr, 16000) + + # Load speaker embedding model (CPU is fine) + try: + classifier = EncoderClassifier.from_hparams( + source="speechbrain/spkrec-ecapa-voxceleb", + run_opts={"device": "cpu"}, + savedir=os.path.join(os.path.expanduser("~"), ".cache", "speechbrain_ecapa") + ) + except Exception as e: + print(f"[WARN] Could not load speaker embedding model: {e}", file=sys.stderr) + return None + + embs = [] + # Extract embedding per segment window + for seg in segments: + s = float(getattr(seg, "start", 0.0) or 0.0) + e = float(getattr(seg, "end", s) or s) + if e <= s: + e = s + 0.2 # minimal window + # Convert to samples in 16k + i0 = int(s * 16000) + i1 = int(e * 16000) + # Add small margins to help very short segments + pad = int(0.05 * 16000) + i0 = max(0, i0 - pad) + i1 = min(len(wav16), i1 + pad) + if i1 - i0 < 1600: # <0.1s, too short; expand if possible + i1 = min(len(wav16), i0 + 1600) + segment_wav = torch.tensor(wav16[i0:i1]).unsqueeze(0) + with torch.no_grad(): + emb = classifier.encode_batch(segment_wav).squeeze(0).squeeze(0).cpu().numpy() + embs.append(emb.astype("float32")) + + if len(embs) == 0: + return None + # Cluster + labels = _kmeans_cosine(embs, k=max(1, int(num_speakers))) + return labels.tolist() + + +def main(): + parser = argparse.ArgumentParser(description="Transcribe audio with faster-whisper and write .txt and .srt") + parser.add_argument("input", help="Path to audio/video file") + parser.add_argument("--model", default=os.environ.get("FW_MODEL", "large-v3"), help="Model size or path (default: large-v3)") + parser.add_argument("--language", default=None, help="Language code (e.g., en). Leave None for auto-detect") + parser.add_argument("--device", default=os.environ.get("FW_DEVICE", "auto"), choices=["auto", "cpu", "cuda"], help="Device to run on") + parser.add_argument("--compute-type", dest="compute_type", default=os.environ.get("FW_COMPUTE", "auto"), help="Compute type (auto,int8,float16,float32,int8_float16,etc.)") + parser.add_argument("--outdir", default=None, help="Output directory (default: next to input)") + parser.add_argument("--no-progress", action="store_true", help="Disable live progress output") + parser.add_argument("--diarize", action="store_true", help="Enable speaker diarization (labels)") + parser.add_argument("--num-speakers", type=int, default=int(os.environ.get("FW_NUM_SPEAKERS", "2")), help="Assumed number of speakers (default: 2)") + args = parser.parse_args() + + try: + from faster_whisper import WhisperModel + except Exception as e: + print("[ERROR] faster-whisper is not installed in this environment.", file=sys.stderr) + print(str(e), file=sys.stderr) + return 2 + + inp = os.path.abspath(args.input) + if not os.path.exists(inp): + print(f"[ERROR] Input file not found: {inp}", file=sys.stderr) + return 2 + + outdir = os.path.abspath(args.outdir or os.path.dirname(inp) or ".") + os.makedirs(outdir, exist_ok=True) + base = os.path.splitext(os.path.basename(inp))[0] + srt_path = os.path.join(outdir, base + ".srt") + txt_path = os.path.join(outdir, base + ".txt") + + # Device and compute_type heuristics + device = args.device + compute_type = args.compute_type + if device == "auto": + device = "cpu" + if compute_type == "auto": + # Prefer accuracy over speed by default + compute_type = "float16" if device == "cuda" else "float32" + + print(f"[INFO] Loading model='{args.model}', device='{device}', compute_type='{compute_type}'") + model = WhisperModel(args.model, device=device, compute_type=compute_type) + + # Transcription with live progress + total_duration = get_media_duration(inp) + if total_duration: + print(f"[INFO] Media duration: {hhmmss(total_duration)}") + start_ts = time.time() + + iter_segments, info = model.transcribe(inp, language=args.language) + collected = [] + processed = 0.0 + last_print = 0.0 + tty = sys.stderr.isatty() + for seg in iter_segments: + collected.append(seg) + # Update processed time from segment end if available + if getattr(seg, "end", None) is not None: + processed = max(processed, float(seg.end)) + now = time.time() + # Print each segment or throttle to ~5 per second + if not args.no_progress and (tty or (now - last_print) >= 0.2): + last_print = now + if total_duration and total_duration > 0: + pct = max(0.0, min(100.0, (processed / total_duration) * 100.0)) + elapsed = now - start_ts + eta = None + if processed > 0: + rate = processed / max(1e-6, elapsed) + remaining = max(0.0, total_duration - processed) + eta = remaining / max(1e-6, rate) + line = f"[PROGRESS] {hhmmss(processed)} / {hhmmss(total_duration)} ({pct:5.1f}%)" + if eta is not None and eta < 60 * 60 * 24: # cap unrealistic values + line += f" ETA ~{hhmmss(eta)}" + else: + line = f"[PROGRESS] processed {hhmmss(processed)}" + if tty: + print("\r" + line, end="", file=sys.stderr, flush=True) + else: + print(line, file=sys.stderr, flush=True) + + # Finish progress line + if not args.no_progress and sys.stderr.isatty(): + print("", file=sys.stderr) # newline + + print(f"[INFO] Detected language: {getattr(info, 'language', None)} (prob={getattr(info, 'language_probability', None)})") + print(f"[INFO] Segments: {len(collected)}") + + # Optionally diarize + if args.diarize: + labels = diarize_segments(inp, collected, num_speakers=args.num_speakers) + if labels is not None and len(labels) == len(collected): + diar_srt = os.path.join(outdir, base + ".diar.srt") + diar_txt = os.path.join(outdir, base + ".diar.txt") + rttm_path = os.path.join(outdir, base + ".rttm") + write_srt_with_speakers(collected, labels, diar_srt) + write_txt_with_speakers(collected, labels, diar_txt) + write_rttm(collected, labels, rttm_path, file_id=base) + print(f"[OK] Wrote: {diar_txt}\n[OK] Wrote: {diar_srt}\n[OK] Wrote: {rttm_path}") + else: + print("[WARN] Diarization failed or returned mismatched labels; writing plain outputs.", file=sys.stderr) + + # Write base outputs + write_txt(collected, txt_path) + write_srt(collected, srt_path) + print(f"[OK] Wrote: {txt_path}\n[OK] Wrote: {srt_path}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/Bash/transcribe.sh b/Bash/transcribe.sh new file mode 100644 index 0000000..e1cce85 --- /dev/null +++ b/Bash/transcribe.sh @@ -0,0 +1,430 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Transcribe an audio file using faster-whisper with automatic setup. +# - Creates Python venv in .venv +# - Installs ffmpeg and espeak-ng (best-effort) for test audio generation +# - Installs faster-whisper (and CUDA stack if NVIDIA is present) +# - Runs tools/transcribe_fw.py to produce .txt and .srt next to the input + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$SCRIPT_DIR" +TOOLS_DIR="$PROJECT_DIR/tools" +PY_RUNNER="$TOOLS_DIR/transcribe_fw.py" +VENV_DIR="$PROJECT_DIR/.venv" + +usage() { + cat </dev/null 2>&1; then echo apt; return; fi + if command -v dnf >/dev/null 2>&1; then echo dnf; return; fi + if command -v yum >/dev/null 2>&1; then echo yum; return; fi + if command -v pacman >/dev/null 2>&1; then echo pacman; return; fi + if command -v zypper >/dev/null 2>&1; then echo zypper; return; fi + echo none +} + +has_libcublas12() { + # Common system locations + for d in \ + /usr/lib \ + /usr/lib64 \ + /usr/local/cuda/lib64 \ + /usr/local/cuda-12*/lib64 \ + /opt/cuda/lib64 \ + /opt/cuda/targets/x86_64-linux/lib; do + [[ -e "$d/libcublas.so.12" ]] && return 0 || true + done + # venv-provided NVIDIA CUDA libs + if [[ -x "$VENV_DIR/bin/python" ]]; then + local pyver + pyver="$($VENV_DIR/bin/python -c 'import sys;print(f"{sys.version_info.major}.{sys.version_info.minor}")' 2>/dev/null || true)" + if [[ -n "$pyver" ]]; then + for d in "$VENV_DIR/lib/python$pyver/site-packages/nvidia/cublas/lib" \ + "$VENV_DIR/lib/python$pyver/site-packages/nvidia/cudnn/lib" \ + "$VENV_DIR/lib/python$pyver/site-packages/nvidia/cuda_runtime/lib"; do + [[ -e "$d/libcublas.so.12" ]] && return 0 || true + done + fi + fi + return 1 +} + +ensure_cuda_runtime() { + local mgr; mgr="$(detect_pkg_mgr)" + if [[ $OFFLINE -eq 1 ]]; then + if has_libcublas12; then return 0; fi + echo "CUDA runtime (libcublas.so.12) not found and offline mode is enabled. Install CUDA 12 runtime or rerun with --online." >&2 + exit 6 + fi + if has_libcublas12; then + return 0 + fi + if ! command -v sudo >/dev/null 2>&1; then + log "sudo not found; skipping CUDA runtime install attempt." + else + log "CUDA cuBLAS 12 not found; attempting to install CUDA runtime (manager: $mgr)" + set +e + case "$mgr" in + pacman) + sudo pacman -Sy --noconfirm cuda cudnn || true ;; + apt) + sudo apt-get update -y || true + sudo apt-get install -y nvidia-cuda-toolkit || true ;; + dnf|yum) + sudo "$mgr" install -y cuda cudnn || true ;; + zypper) + sudo zypper install -y cuda cudnn || true ;; + *) log "Unknown package manager; cannot install CUDA automatically." ;; + esac + set -e + fi + # Re-check + if ! has_libcublas12; then + echo "CUDA runtime (libcublas.so.12) not found after attempted install. Please install CUDA 12 toolkit/runtime and re-run." >&2 + exit 6 + fi +} + +install_system_deps() { + have_cmd() { command -v "$1" >/dev/null 2>&1; } + local need_ffmpeg=0 need_espeak=0 + have_cmd ffmpeg || need_ffmpeg=1 + have_cmd espeak-ng || need_espeak=1 + + # If diarization requested and online, we may also try to ensure libsndfile + local need_libsndfile=0 + if [[ "${FW_DIARIZE:-}" == "1" ]]; then + # Heuristic: check common library file + if [[ ! -e /usr/lib/x86_64-linux-gnu/libsndfile.so && ! -e /usr/lib/libsndfile.so && ! -e /usr/lib64/libsndfile.so ]]; then + need_libsndfile=1 + fi + fi + + if [[ $need_ffmpeg -eq 0 && $need_espeak -eq 0 && $need_libsndfile -eq 0 ]]; then + log "System deps present: ffmpeg, espeak-ng${FW_DIARIZE:+, libsndfile}" + return 0 + fi + + if [[ $OFFLINE -eq 1 ]]; then + echo "Missing system dependencies (ffmpeg/espeak-ng) but running in offline mode. Install them or rerun with --online." >&2 + exit 5 + fi + + local mgr; mgr="$(detect_pkg_mgr)" + log "Detected package manager: $mgr (installing missing: $([[ $need_ffmpeg -eq 1 ]] && echo ffmpeg )$([[ $need_espeak -eq 1 ]] && echo espeak-ng )$([[ $need_libsndfile -eq 1 ]] && echo libsndfile))" + + if ! command -v sudo >/dev/null 2>&1; then + log "sudo not found; skipping system package installation attempt." + return 0 + fi + + # Avoid exiting on install errors; continue best-effort + set +e + case "$mgr" in + apt) + sudo apt-get update -y || log "apt-get update failed; continuing" + pkgs=(python3-venv python3-pip) + [[ $need_ffmpeg -eq 1 ]] && pkgs+=(ffmpeg) + [[ $need_espeak -eq 1 ]] && pkgs+=(espeak-ng) + if [[ $need_libsndfile -eq 1 ]]; then + # Try both names across releases + pkgs+=(libsndfile1) + sudo apt-get install -y libsndfile1 || true + # If that failed, try libsndfile2 (newer distros) + sudo apt-get install -y libsndfile2 || true + fi + sudo apt-get install -y "${pkgs[@]}" || log "apt-get install failed; continuing" ;; + dnf) + pkgs=(python3-venv python3-pip) + [[ $need_ffmpeg -eq 1 ]] && pkgs+=(ffmpeg) + [[ $need_espeak -eq 1 ]] && pkgs+=(espeak-ng) + [[ $need_libsndfile -eq 1 ]] && pkgs+=(libsndfile) + sudo dnf install -y "${pkgs[@]}" || log "dnf install failed; continuing" ;; + yum) + pkgs=(python3-venv python3-pip) + [[ $need_ffmpeg -eq 1 ]] && pkgs+=(ffmpeg) + [[ $need_espeak -eq 1 ]] && pkgs+=(espeak-ng) + [[ $need_libsndfile -eq 1 ]] && pkgs+=(libsndfile) + sudo yum install -y "${pkgs[@]}" || log "yum install failed; continuing" ;; + pacman) + pkgs=(python-virtualenv python-pip) + [[ $need_ffmpeg -eq 1 ]] && pkgs+=(ffmpeg) + [[ $need_espeak -eq 1 ]] && pkgs+=(espeak-ng) + [[ $need_libsndfile -eq 1 ]] && pkgs+=(libsndfile) + sudo pacman -Sy --noconfirm "${pkgs[@]}" || log "pacman install failed; continuing" ;; + zypper) + pkgs=(python311-virtualenv python311-pip) + [[ $need_ffmpeg -eq 1 ]] && pkgs+=(ffmpeg) + [[ $need_espeak -eq 1 ]] && pkgs+=(espeak-ng) + [[ $need_libsndfile -eq 1 ]] && pkgs+=(libsndfile1) + sudo zypper install -y "${pkgs[@]}" || log "zypper install failed; continuing" ;; + *) + log "Unknown package manager; please ensure ffmpeg and espeak-ng are installed." ;; + esac + set -e +} + +setup_venv() { + if [[ ! -d "$VENV_DIR" ]]; then + log "Creating venv at $VENV_DIR" + python3 -m venv "$VENV_DIR" + fi + # shellcheck disable=SC1091 + source "$VENV_DIR/bin/activate" + if [[ $OFFLINE -eq 0 ]]; then + python -m pip install --upgrade pip wheel setuptools + fi +} + +install_python_deps() { + # Install deps; if NVIDIA GPU is present, prefer CUDA-capable stack (cu12) + local has_nvidia_flag="${1:-0}" + log "Installing faster-whisper and dependencies" + export PIP_DISABLE_PIP_VERSION_CHECK=1 + export PIP_DEFAULT_TIMEOUT=${PIP_DEFAULT_TIMEOUT:-20} + if [[ $OFFLINE -eq 1 ]]; then + # Offline: do not install, just verify modules + if ! python -c 'import faster_whisper' >/dev/null 2>&1; then + echo "Python dependency 'faster_whisper' not found in offline mode. Run with --online to install." >&2 + exit 7 + fi + # If diarization requested offline, check for its deps too (warn-only) + if [[ "${FW_DIARIZE:-}" == "1" ]]; then + python - <<'PY' || true +try: + import soundfile, speechbrain, torch # noqa: F401 +except Exception as e: + print(f"[WARN] Diarization deps missing offline ({e}); speaker labels will be skipped.") +PY + fi + return 0 + fi + if [[ "$has_nvidia_flag" -eq 1 ]]; then + # If ctranslate2 is not installed, attempt CUDA-enabled wheel (quiet, with fallback) + if ! "$VENV_DIR/bin/python" -c 'import ctranslate2' >/dev/null 2>&1; then + log "Installing CUDA-enabled CTranslate2 (cu12 wheel)" + python -m pip install -q --retries 1 --upgrade "ctranslate2<5,>=4.0" --extra-index-url https://download.opennmt.net/ctranslate2/cu12 || \ + log "Warning: could not reach cu12 wheel index; will proceed with available ctranslate2" + fi + # Ensure NVIDIA CUDA 12 runtime libs are available inside the venv + python -m pip install -q --retries 1 --upgrade nvidia-cublas-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 || \ + log "Warning: failed to install NVIDIA cu12 runtime libs via pip" + fi + python -m pip install -q --retries 1 --upgrade faster-whisper ffmpeg-python + + # If diarization requested and online, install its Python deps best-effort + if [[ "${FW_DIARIZE:-}" == "1" ]]; then + python -m pip install -q --retries 1 --upgrade soundfile speechbrain || \ + log "Warning: failed to install soundfile/speechbrain" + # Torch and torchaudio CPU wheels (force to avoid mismatched CUDA builds) + python -m pip install -q --retries 1 --upgrade --force-reinstall --index-url https://download.pytorch.org/whl/cpu torch torchaudio || \ + log "Warning: failed to install torch/torchaudio CPU wheels" + fi + python - <<'PY' +import sys +print(f"[PY] Python {sys.version.split()[0]} dependencies installed.") +PY +} + +ensure_runner() { + if [[ ! -f "$PY_RUNNER" ]]; then + echo "Runner not found: $PY_RUNNER" >&2 + exit 3 + fi +} + +generate_test_audio() { + local tmpwav + tmpwav="${PROJECT_DIR}/test_fw.wav" + if command -v espeak-ng >/dev/null 2>&1; then + log "Generating test audio via espeak-ng -> $tmpwav" >&2 + espeak-ng -w "$tmpwav" "This is a quick test of faster whisper transcription." >/dev/null 2>&1 || true + fi + # If espeak-ng failed or not present, try espeak + if [[ ! -s "$tmpwav" ]] && command -v espeak >/dev/null 2>&1; then + log "espeak-ng unavailable or failed; trying espeak -> $tmpwav" >&2 + espeak -w "$tmpwav" "This is a quick test of faster whisper transcription." >/dev/null 2>&1 || true + fi + # Fallback: generate tone via Python stdlib (no external deps) + if [[ ! -s "$tmpwav" ]]; then + log "Generating 3s 1kHz WAV via Python stdlib -> $tmpwav" >&2 + python3 -c 'import sys,wave,math,array;outfile=sys.argv[1];fr=16000;dur=3;freq=1000.0;ampl=0.3;n=fr*dur;data=array.array("h",[int(max(-1.0,min(1.0,ampl*math.sin(2*math.pi*freq*(i/fr))))*32767) for i in range(n)]);wf=wave.open(outfile,"w");wf.setnchannels(1);wf.setsampwidth(2);wf.setframerate(fr);wf.writeframes(data.tobytes());wf.close()' "$tmpwav" || true + fi + # Final fallback: tone via ffmpeg + if [[ ! -s "$tmpwav" ]]; then + log "Creating a 3s sine tone WAV via ffmpeg -> $tmpwav" >&2 + ffmpeg -f lavfi -i sine=frequency=1000:duration=3 -ar 16000 -ac 1 -f wav -y "$tmpwav" >/dev/null 2>&1 || true + fi + echo "$tmpwav" +} + +prepare_model() { + # Download a model for offline use into MODEL_DIR + local name="$1" + mkdir -p "$MODEL_DIR" + # shellcheck disable=SC1091 + source "$VENV_DIR/bin/activate" + log "Preparing model '$name' into $MODEL_DIR" + python - <&2 + exit 2 + fi + install_python_deps 0 + export FW_PREPARE_NAME="$PREPARE_MODEL" + export FW_MODEL_DIR="$MODEL_DIR" + prepare_model "$PREPARE_MODEL" + log "Model '$PREPARE_MODEL' downloaded to $MODEL_DIR" + exit 0 + fi + + # Detect NVIDIA GPU and enforce CUDA if present + has_nvidia=0 + if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L >/dev/null 2>&1; then + has_nvidia=1 + fi + install_python_deps "$has_nvidia" + ensure_runner + + local input="$INPUT_FILE" + if [[ -z "$input" ]]; then + input="$(generate_test_audio)" + if [[ ! -s "$input" ]]; then + echo "Failed to generate test audio. Please provide an audio file." >&2 + exit 4 + fi + fi + + if [[ ! -f "$input" ]]; then + echo "Input file not found: $input" >&2 + exit 2 + fi + + local args=("$input" "--model" "$MODEL") + [[ -n "$LANGUAGE" ]] && args+=("--language" "$LANGUAGE") + [[ -n "$OUTDIR" ]] && args+=("--outdir" "$OUTDIR") + + # Pass diarization via env if requested + if [[ "${FW_DIARIZE:-}" == "1" ]]; then + args+=("--diarize") + if [[ -n "${FW_NUM_SPEAKERS:-}" ]]; then + args+=("--num-speakers" "${FW_NUM_SPEAKERS}") + fi + fi + + if [[ $has_nvidia -eq 1 ]]; then + ensure_cuda_runtime + # Export common CUDA paths in case the env lacks them + export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}" + # Include system and possible venv-provided CUDA libs + local pyver venv_cuda_paths="" + if [[ -x "$VENV_DIR/bin/python" ]]; then + pyver="$($VENV_DIR/bin/python -c 'import sys;print(f"{sys.version_info.major}.{sys.version_info.minor}")' 2>/dev/null || true)" + if [[ -n "$pyver" ]]; then + venv_cuda_paths="$VENV_DIR/lib/python$pyver/site-packages/nvidia/cublas/lib:$VENV_DIR/lib/python$pyver/site-packages/nvidia/cudnn/lib:$VENV_DIR/lib/python$pyver/site-packages/nvidia/cuda_runtime/lib" + fi + fi + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:${CUDA_HOME}/lib64:/usr/lib/x86_64-linux-gnu:/opt/cuda/lib64:/opt/cuda/targets/x86_64-linux/lib:${venv_cuda_paths}" + export PATH="${PATH}:${CUDA_HOME}/bin" + # shellcheck disable=SC1091 + source "$VENV_DIR/bin/activate" + python -c 'from faster_whisper import WhisperModel; WhisperModel("tiny", device="cuda", compute_type="float16"); print("[PY] CUDA test init succeeded.")' || { echo "CUDA environment check failed. Aborting as requested." >&2; exit 6; } + args+=("--device" "cuda") + fi + + log "Transcribing: $input" + # shellcheck disable=SC1091 + source "$VENV_DIR/bin/activate" + if [[ $has_nvidia -eq 1 ]]; then + if ! python "$PY_RUNNER" "${args[@]}"; then + echo "CUDA execution requested due to detected NVIDIA GPU, but it failed. Aborting as requested (no CPU fallback)." >&2 + exit 6 + fi + else + # Offline: prefer local directory if present; otherwise use cache without network + if [[ $OFFLINE -eq 1 ]]; then + local local_model_path="" + if [[ -d "$MODEL" ]]; then + local_model_path="$MODEL" + elif [[ -d "$MODEL_DIR/$MODEL" ]]; then + local_model_path="$MODEL_DIR/$MODEL" + fi + if [[ -n "$local_model_path" ]]; then + args=("$input" "--model" "$local_model_path") + [[ -n "$LANGUAGE" ]] && args+=("--language" "$LANGUAGE") + [[ -n "$OUTDIR" ]] && args+=("--outdir" "$OUTDIR") + fi + fi + python "$PY_RUNNER" "${args[@]}" + fi +} + +main "$@" +