mirror of
https://github.com/kuhyx/scripts.git
synced 2026-07-04 15:23:11 +02:00
feat: offline local transcription
This commit is contained in:
parent
68fbd82d78
commit
60517b4584
3
.gitignore
vendored
3
.gitignore
vendored
@ -5,4 +5,5 @@
|
|||||||
*.ogg*
|
*.ogg*
|
||||||
*.wav*
|
*.wav*
|
||||||
*.m4a*
|
*.m4a*
|
||||||
main_folder
|
main_folder
|
||||||
|
models
|
||||||
|
|||||||
21
.vscode/tasks.json
vendored
Normal file
21
.vscode/tasks.json
vendored
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
{
|
||||||
|
"version": "2.0.0",
|
||||||
|
"tasks": [
|
||||||
|
{
|
||||||
|
"label": "Transcribe tiny online smoke test",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "bash",
|
||||||
|
"args": [
|
||||||
|
"/home/kuhy/testsAndMisc/Bash/transcribe.sh",
|
||||||
|
"--online",
|
||||||
|
"-m",
|
||||||
|
"tiny"
|
||||||
|
],
|
||||||
|
"isBackground": false,
|
||||||
|
"problemMatcher": [
|
||||||
|
"$gcc"
|
||||||
|
],
|
||||||
|
"group": "build"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
4
test_fw.srt
Normal file
4
test_fw.srt
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
1
|
||||||
|
00:00:00,000 --> 00:00:02,760
|
||||||
|
This is a quick test on faster with but run creep shun.
|
||||||
|
|
||||||
338
tools/transcribe_fw.py
Normal file
338
tools/transcribe_fw.py
Normal file
@ -0,0 +1,338 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import timedelta
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
def format_timestamp(seconds: float) -> str:
    """Format a time offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero because SRT has no negative times.

    Returns:
        The zero-padded timestamp. The hour field grows beyond two digits
        for offsets of 100 hours or more.
    """
    # Convert to whole milliseconds once, with rounding. Truncating
    # ``(seconds - int(seconds)) * 1000`` loses a millisecond to float error
    # (2.76 s used to render as ",759").
    total_millis = max(0, round(float(seconds) * 1000))
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
|
||||||
|
|
||||||
|
|
||||||
|
def write_srt(segments, srt_path: str):
    """Write transcription segments to *srt_path* as a SubRip (.srt) file.

    Args:
        segments: Iterable of objects exposing ``start`` and ``end`` (seconds)
            and ``text``. Segments whose text is empty or whitespace-only
            are skipped.
        srt_path: Destination file; it is created or overwritten as UTF-8.
    """
    with open(srt_path, "w", encoding="utf-8") as f:
        # Number only the cues that are actually written, so skipped empty
        # segments do not leave gaps in the sequence.
        cue_number = 0
        for seg in segments:
            text = (seg.text or "").strip()
            if not text:
                continue
            cue_number += 1
            start = format_timestamp(seg.start)
            end = format_timestamp(seg.end)
            f.write(f"{cue_number}\n{start} --> {end}\n{text}\n\n")
|
||||||
|
|
||||||
|
|
||||||
|
def write_txt(segments, txt_path: str):
    """Write the text of each non-empty segment to *txt_path*, one per line.

    Args:
        segments: Iterable of objects with a ``text`` attribute (may be None).
        txt_path: Destination file; it is created or overwritten as UTF-8.
    """
    with open(txt_path, "w", encoding="utf-8") as out:
        cleaned = ((seg.text or "").strip() for seg in segments)
        out.writelines(f"{line}\n" for line in cleaned if line)
|
||||||
|
|
||||||
|
|
||||||
|
def write_srt_with_speakers(segments, labels: List[int], path: str):
    """Write an SRT file whose cue text is prefixed with a speaker tag.

    Args:
        segments: Iterable of objects exposing ``start``, ``end`` and ``text``.
        labels: Zero-based speaker index for each segment, in the same order.
            Pairing stops at the shorter of the two sequences.
        path: Destination file; it is created or overwritten as UTF-8.
    """
    with open(path, "w", encoding="utf-8") as f:
        # Count written cues separately from the input position, so segments
        # skipped for empty text do not leave gaps in the numbering.
        cue_number = 0
        for seg, lab in zip(segments, labels):
            text = (seg.text or "").strip()
            if not text:
                continue
            cue_number += 1
            spk = f"SPK{lab+1}"  # labels are 0-based; tags are shown 1-based
            f.write(
                f"{cue_number}\n"
                f"{format_timestamp(seg.start)} --> {format_timestamp(seg.end)}\n"
                f"[{spk}] {text}\n\n"
            )
|
||||||
|
|
||||||
|
|
||||||
|
def write_txt_with_speakers(segments, labels: List[int], path: str):
    """Write one ``[SPKn] text`` line per non-empty segment to *path*.

    Args:
        segments: Iterable of objects with a ``text`` attribute (may be None).
        labels: Zero-based speaker index for each segment, in the same order.
        path: Destination file; it is created or overwritten as UTF-8.
    """
    with open(path, "w", encoding="utf-8") as out:
        for segment, speaker_idx in zip(segments, labels):
            utterance = (segment.text or "").strip()
            if not utterance:
                continue
            # Speaker tags are shown 1-based.
            out.write(f"[SPK{speaker_idx+1}] {utterance}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def write_rttm(segments, labels: List[int], path: str, file_id: str = "audio"):
    """Write speaker turns to *path* in RTTM format, one SPEAKER record per segment.

    Each record has the ten space-separated RTTM fields::

        SPEAKER <file-id> <chnl> <tbeg> <tdur> <ortho> <stype> <name> <conf> <slat>

    Args:
        segments: Iterable of objects with ``start`` and ``end`` in seconds.
            A missing or falsy ``end`` yields a zero-length turn.
        labels: Zero-based speaker index for each segment, in the same order.
        path: Destination file; it is created or overwritten as UTF-8.
        file_id: Recording identifier written in the second field. Runs of
            whitespace are replaced with ``_`` so the record stays a fixed
            number of space-delimited tokens.
    """
    file_token = "_".join(str(file_id).split()) or "audio"
    with open(path, "w", encoding="utf-8") as f:
        for seg, lab in zip(segments, labels):
            start = float(getattr(seg, "start", 0.0) or 0.0)
            end = float(getattr(seg, "end", start) or start)
            dur = max(0.0, end - start)  # never emit a negative duration
            name = f"SPK{lab+1}"
            # The trailing <NA> is the signal-lookahead field; without it the
            # record has only nine columns.
            f.write(
                f"SPEAKER {file_token} 1 {start:.3f} {dur:.3f} "
                f"<NA> <NA> {name} <NA> <NA>\n"
            )
|
||||||
|
|
||||||
|
|
||||||
|
def hhmmss(seconds: float) -> str:
    """Render a duration in seconds as ``HH:MM:SS``.

    Fractional seconds are dropped and negative durations are clamped to
    zero. The hour field grows beyond two digits for very long durations.
    """
    whole = int(max(0.0, float(seconds)))
    total_minutes, s = divmod(whole, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02d}:{m:02d}:{s:02d}"
|
||||||
|
|
||||||
|
|
||||||
|
def get_media_duration(path: str) -> Optional[float]:
    """Return the duration of a media file in seconds, or None if unknown.

    Two probes are tried in order: the optional ``ffmpeg`` Python package
    (ffmpeg-python), then the ``ffprobe`` executable if it is on PATH. Any
    failure in either probe is treated as "duration unavailable"; this
    function never raises for a missing tool or an unreadable file.

    Args:
        path: Path to the audio or video file.

    Returns:
        The container duration in seconds, or None when neither probe
        produced a parseable value.
    """
    # ``Optional[float]`` instead of ``float | None``: the latter is evaluated
    # when the function is defined and fails on Python < 3.10.

    # Probe 1: ffmpeg-python, if installed.
    try:
        import ffmpeg  # type: ignore

        probe = ffmpeg.probe(path)
        fmt = probe.get("format", {})
        if "duration" in fmt:
            return float(fmt["duration"])  # type: ignore
    except Exception:
        # Deliberately broad: the package may be absent, ffprobe may be
        # missing, or the probe may fail. All of these fall through to the
        # direct ffprobe attempt below.
        pass

    # Probe 2: call ffprobe directly and parse its bare numeric output.
    if shutil.which("ffprobe"):
        try:
            out = subprocess.check_output(
                [
                    "ffprobe",
                    "-v",
                    "error",
                    "-show_entries",
                    "format=duration",
                    "-of",
                    "default=noprint_wrappers=1:nokey=1",
                    path,
                ],
                stderr=subprocess.DEVNULL,
            )
            return float(out.decode().strip())
        except (subprocess.SubprocessError, OSError, ValueError):
            # Non-zero exit, failure to spawn, or non-numeric output.
            return None
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _resample_linear(x, src_sr: int, tgt_sr: int):
|
||||||
|
import numpy as np
|
||||||
|
if src_sr == tgt_sr:
|
||||||
|
return x
|
||||||
|
ratio = float(tgt_sr) / float(src_sr)
|
||||||
|
n_out = max(1, int(round(x.shape[-1] * ratio)))
|
||||||
|
xp = np.linspace(0.0, 1.0, num=x.shape[-1], endpoint=False)
|
||||||
|
xq = np.linspace(0.0, 1.0, num=n_out, endpoint=False)
|
||||||
|
y = np.interp(xq, xp, x.astype(np.float32))
|
||||||
|
return y.astype(np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
def _kmeans_cosine(embs, k: int, iters: int = 50, seed: int = 0):
|
||||||
|
import numpy as np
|
||||||
|
rng = np.random.default_rng(seed)
|
||||||
|
X = np.asarray(embs, dtype=np.float32)
|
||||||
|
if X.ndim != 2 or X.shape[0] == 0:
|
||||||
|
return np.zeros((0,), dtype=np.int64)
|
||||||
|
# Normalize
|
||||||
|
X = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-8)
|
||||||
|
# Init centroids as random samples
|
||||||
|
idxs = rng.choice(X.shape[0], size=min(k, X.shape[0]), replace=False)
|
||||||
|
C = X[idxs]
|
||||||
|
# If fewer samples than k, pad with random
|
||||||
|
if C.shape[0] < k:
|
||||||
|
pad = rng.standard_normal(size=(k - C.shape[0], X.shape[1])).astype(np.float32)
|
||||||
|
pad /= (np.linalg.norm(pad, axis=1, keepdims=True) + 1e-8)
|
||||||
|
C = np.concatenate([C, pad], axis=0)
|
||||||
|
for _ in range(iters):
|
||||||
|
# Assign by cosine similarity (maximize dot product)
|
||||||
|
sims = X @ C.T # (n, k)
|
||||||
|
labels = sims.argmax(axis=1)
|
||||||
|
newC = np.zeros_like(C)
|
||||||
|
for j in range(k):
|
||||||
|
sel = X[labels == j]
|
||||||
|
if sel.shape[0] == 0:
|
||||||
|
newC[j] = C[j]
|
||||||
|
else:
|
||||||
|
v = sel.mean(axis=0)
|
||||||
|
v /= (np.linalg.norm(v) + 1e-8)
|
||||||
|
newC[j] = v
|
||||||
|
if np.allclose(newC, C, atol=1e-4):
|
||||||
|
break
|
||||||
|
C = newC
|
||||||
|
return labels
|
||||||
|
|
||||||
|
|
||||||
|
def _segment_window(start: float, end: float, n_samples: int, sr: int = 16000):
    """Return the ``(first, last)`` sample indices to embed for one segment.

    The window is the segment padded by 50 ms on each side and clipped to
    the audio. Windows shorter than 0.1 s are extended forwards and, at the
    tail of the audio, backwards.
    """
    min_len = sr // 10  # 0.1 s
    margin = int(0.05 * sr)
    if end <= start:
        end = start + 0.2  # minimal window for zero-length segments
    first = max(0, int(start * sr) - margin)
    last = min(n_samples, int(end * sr) + margin)
    if last - first < min_len:
        last = min(n_samples, first + min_len)
        # A segment at or past the end of the audio cannot grow forwards;
        # pull the start back instead so the slice is never empty.
        first = max(0, min(first, last - min_len))
    return first, last


def diarize_segments(audio_path: str, segments, num_speakers: int = 2) -> Optional[list]:
    """Assign a speaker label to each segment by clustering speaker embeddings.

    The audio is read with ``soundfile``, mixed down to mono and resampled to
    16 kHz. One embedding per segment is computed with SpeechBrain's
    ``speechbrain/spkrec-ecapa-voxceleb`` model on the CPU, and the
    embeddings are clustered with cosine k-means.

    Args:
        audio_path: Path to an audio file that ``soundfile`` can read.
        segments: Sequence of objects with ``start`` and ``end`` in seconds.
        num_speakers: Number of clusters to form; values below 1 become 1.

    Returns:
        A list of zero-based speaker indices aligned with *segments*, or None
        on any failure (missing dependencies, unreadable or empty audio, no
        segments, or a model, embedding or clustering error). Failures are
        reported as warnings on stderr and never raised, so the caller can
        still produce its other outputs.
    """
    try:
        import numpy as np  # noqa: F401  (needed by the helpers used below)
        import soundfile as sf
        import torch

        try:
            # Location of the class in SpeechBrain 1.0 and later.
            from speechbrain.inference.speaker import EncoderClassifier
        except ImportError:
            # Older releases only provide the legacy module.
            from speechbrain.pretrained import EncoderClassifier
    except Exception as e:
        print(f"[WARN] Diarization dependencies missing ({e}); skipping speaker labels.", file=sys.stderr)
        return None

    # Load audio
    try:
        wav, sr = sf.read(audio_path, dtype="float32", always_2d=False)
    except Exception as e:
        print(f"[WARN] Could not read audio for diarization: {e}", file=sys.stderr)
        return None
    if wav.ndim == 2:  # mix multi-channel audio down to mono
        wav = wav.mean(axis=1)
    if len(wav) == 0:
        print("[WARN] Audio is empty; skipping speaker labels.", file=sys.stderr)
        return None
    # Resample to 16 kHz for the embedding model.
    wav16 = _resample_linear(wav, sr, 16000)

    # Load speaker embedding model on the CPU.
    try:
        classifier = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            run_opts={"device": "cpu"},
            savedir=os.path.join(os.path.expanduser("~"), ".cache", "speechbrain_ecapa"),
        )
    except Exception as e:
        print(f"[WARN] Could not load speaker embedding model: {e}", file=sys.stderr)
        return None

    try:
        embs = []
        for seg in segments:
            seg_start = float(getattr(seg, "start", 0.0) or 0.0)
            seg_end = float(getattr(seg, "end", seg_start) or seg_start)
            i0, i1 = _segment_window(seg_start, seg_end, len(wav16))
            segment_wav = torch.tensor(wav16[i0:i1]).unsqueeze(0)
            with torch.no_grad():
                emb = classifier.encode_batch(segment_wav).squeeze(0).squeeze(0).cpu().numpy()
            embs.append(emb.astype("float32"))

        if len(embs) == 0:
            return None
        labels = _kmeans_cosine(embs, k=max(1, int(num_speakers)))
    except Exception as e:
        # Honour the "None on failure" contract instead of propagating.
        print(f"[WARN] Speaker embedding/clustering failed: {e}", file=sys.stderr)
        return None
    return labels.tolist()
|
||||||
|
|
||||||
|
|
||||||
|
def _env_int(name: str, default: int) -> int:
    """Read an integer environment variable, using *default* if unset or malformed."""
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        print(f"[WARN] Ignoring non-integer {name}={raw!r}; using {default}.", file=sys.stderr)
        return default


def _progress_line(processed: float, total_duration: Optional[float], elapsed: float) -> str:
    """Build one ``[PROGRESS]`` status line, with percent and ETA when the total is known."""
    if not total_duration or total_duration <= 0:
        return f"[PROGRESS] processed {hhmmss(processed)}"
    pct = max(0.0, min(100.0, (processed / total_duration) * 100.0))
    line = f"[PROGRESS] {hhmmss(processed)} / {hhmmss(total_duration)} ({pct:5.1f}%)"
    if processed > 0:
        rate = processed / max(1e-6, elapsed)  # media seconds per wall second
        eta = max(0.0, total_duration - processed) / max(1e-6, rate)
        if eta < 60 * 60 * 24:  # hide unrealistic estimates
            line += f" ETA ~{hhmmss(eta)}"
    return line


def main():
    """Command-line entry point: transcribe one file and write .txt and .srt.

    Parses the arguments, loads the faster-whisper model, streams segments
    while printing progress to stderr, writes ``<base>.txt`` and
    ``<base>.srt``, and then, with ``--diarize``, tries to add
    ``<base>.diar.txt``, ``<base>.diar.srt`` and ``<base>.rttm``.

    Defaults come from the environment variables ``FW_MODEL``, ``FW_DEVICE``,
    ``FW_COMPUTE`` and ``FW_NUM_SPEAKERS``.

    Returns:
        0 on success; 2 if faster-whisper cannot be imported or the input
        file does not exist.
    """
    parser = argparse.ArgumentParser(description="Transcribe audio with faster-whisper and write .txt and .srt")
    parser.add_argument("input", help="Path to audio/video file")
    parser.add_argument("--model", default=os.environ.get("FW_MODEL", "large-v3"), help="Model size or path (default: large-v3)")
    parser.add_argument("--language", default=None, help="Language code (e.g., en). Leave None for auto-detect")
    parser.add_argument("--device", default=os.environ.get("FW_DEVICE", "auto"), choices=["auto", "cpu", "cuda"], help="Device to run on")
    parser.add_argument("--compute-type", dest="compute_type", default=os.environ.get("FW_COMPUTE", "auto"), help="Compute type (auto,int8,float16,float32,int8_float16,etc.)")
    parser.add_argument("--outdir", default=None, help="Output directory (default: next to input)")
    parser.add_argument("--no-progress", action="store_true", help="Disable live progress output")
    parser.add_argument("--diarize", action="store_true", help="Enable speaker diarization (labels)")
    # A malformed FW_NUM_SPEAKERS must not crash before arguments are parsed.
    parser.add_argument("--num-speakers", type=int, default=_env_int("FW_NUM_SPEAKERS", 2), help="Assumed number of speakers (default: 2)")
    args = parser.parse_args()

    try:
        from faster_whisper import WhisperModel
    except Exception as e:
        print("[ERROR] faster-whisper is not installed in this environment.", file=sys.stderr)
        print(str(e), file=sys.stderr)
        return 2

    inp = os.path.abspath(args.input)
    if not os.path.exists(inp):
        print(f"[ERROR] Input file not found: {inp}", file=sys.stderr)
        return 2

    outdir = os.path.abspath(args.outdir or os.path.dirname(inp) or ".")
    os.makedirs(outdir, exist_ok=True)
    base = os.path.splitext(os.path.basename(inp))[0]
    srt_path = os.path.join(outdir, base + ".srt")
    txt_path = os.path.join(outdir, base + ".txt")

    # Device and compute_type heuristics. "auto" resolves to the CPU here;
    # a GPU run has to pass --device cuda explicitly.
    device = args.device
    compute_type = args.compute_type
    if device == "auto":
        device = "cpu"
    if compute_type == "auto":
        # Prefer accuracy over speed by default
        compute_type = "float16" if device == "cuda" else "float32"

    print(f"[INFO] Loading model='{args.model}', device='{device}', compute_type='{compute_type}'")
    model = WhisperModel(args.model, device=device, compute_type=compute_type)

    # Transcription with live progress
    total_duration = get_media_duration(inp)
    if total_duration:
        print(f"[INFO] Media duration: {hhmmss(total_duration)}")
    start_ts = time.time()

    iter_segments, info = model.transcribe(inp, language=args.language)
    collected = []
    processed = 0.0
    last_print = 0.0
    tty = sys.stderr.isatty()
    show_progress = not args.no_progress
    for seg in iter_segments:
        collected.append(seg)
        # Track the furthest segment end seen so far as "processed" time.
        if getattr(seg, "end", None) is not None:
            processed = max(processed, float(seg.end))
        now = time.time()
        # On a terminal, redraw for every segment; otherwise throttle to
        # about five lines per second.
        if show_progress and (tty or (now - last_print) >= 0.2):
            last_print = now
            line = _progress_line(processed, total_duration, now - start_ts)
            if tty:
                print("\r" + line, end="", file=sys.stderr, flush=True)
            else:
                print(line, file=sys.stderr, flush=True)

    # Finish the in-place progress line with a newline.
    if show_progress and tty:
        print("", file=sys.stderr)

    print(f"[INFO] Detected language: {getattr(info, 'language', None)} (prob={getattr(info, 'language_probability', None)})")
    print(f"[INFO] Segments: {len(collected)}")

    # Write the plain outputs first so that a failure in the optional
    # diarization step below cannot lose the finished transcript.
    write_txt(collected, txt_path)
    write_srt(collected, srt_path)
    print(f"[OK] Wrote: {txt_path}\n[OK] Wrote: {srt_path}")

    # Optionally diarize
    if args.diarize:
        try:
            labels = diarize_segments(inp, collected, num_speakers=args.num_speakers)
        except Exception as e:
            print(f"[WARN] Diarization raised an error: {e}", file=sys.stderr)
            labels = None
        if labels is not None and len(labels) == len(collected):
            diar_srt = os.path.join(outdir, base + ".diar.srt")
            diar_txt = os.path.join(outdir, base + ".diar.txt")
            rttm_path = os.path.join(outdir, base + ".rttm")
            write_srt_with_speakers(collected, labels, diar_srt)
            write_txt_with_speakers(collected, labels, diar_txt)
            write_rttm(collected, labels, rttm_path, file_id=base)
            print(f"[OK] Wrote: {diar_txt}\n[OK] Wrote: {diar_srt}\n[OK] Wrote: {rttm_path}")
        else:
            print("[WARN] Diarization failed or returned mismatched labels; writing plain outputs.", file=sys.stderr)

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
430
transcribe.sh
Normal file
430
transcribe.sh
Normal file
@ -0,0 +1,430 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Transcribe an audio file using faster-whisper with automatic setup.
|
||||||
|
# - Creates Python venv in .venv
|
||||||
|
# - Installs ffmpeg and espeak-ng (best-effort) for test audio generation
|
||||||
|
# - Installs faster-whisper (and CUDA stack if NVIDIA is present)
|
||||||
|
# - Runs tools/transcribe_fw.py to produce .txt and .srt next to the input
|
||||||
|
|
||||||
|
# All paths are derived from the directory containing this script, so the
# tool behaves the same regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="${SCRIPT_DIR}"
# Python runner that performs the actual transcription.
TOOLS_DIR="${PROJECT_DIR}/tools"
PY_RUNNER="${TOOLS_DIR}/transcribe_fw.py"
# Virtual environment holding faster-whisper and its dependencies.
VENV_DIR="${PROJECT_DIR}/.venv"
|
||||||
|
|
||||||
|
usage() {
  # Print the command synopsis and option summary to stdout.
  local prog
  prog="$(basename "$0")"
  cat <<USAGE
Usage: ${prog} [--online] [--prepare-model NAME --model-dir DIR] [-m model] [-l lang] [-o outdir] [audio_file]

Options:
--online Allow network to install deps and/or download models (default: offline)
--prepare-model NAME Download a model for offline use (implies --online)
--model-dir DIR Directory to store or load local models (default: ./models)
-m model Model size or path (tiny, base, small, medium, large-v3, etc.). Default: large-v3
-l lang Language code (e.g., en). Default: auto-detect
-o outdir Output directory (default: alongside input)
[env] FW_DIARIZE=1 Enable diarization (speaker labels). Optional: FW_NUM_SPEAKERS=N. When --online, installs soundfile, speechbrain, and CPU-only torch/torchaudio.
-h Show help
USAGE
}
|
||||||
|
|
||||||
|
log() {
  # Write the arguments to stdout, prefixed with the current HH:MM:SS time.
  local stamp
  stamp="$(date +'%H:%M:%S')"
  if (( $# > 0 )); then
    printf '[%s] %s\n' "$stamp" "$*"
  else
    printf '[%s]\n' "$stamp"
  fi
}
|
||||||
|
|
||||||
|
detect_pkg_mgr() {
  # Print the short name of the first supported package manager found on
  # PATH (apt, dnf, yum, pacman or zypper), or "none" if there is none.
  local tool
  for tool in apt-get dnf yum pacman zypper; do
    if command -v "$tool" >/dev/null 2>&1; then
      # apt-get is reported as "apt"; the other names are used as they are.
      echo "${tool%-get}"
      return
    fi
  done
  echo none
}
|
||||||
|
|
||||||
|
has_libcublas12() {
  # Return 0 if libcublas.so.12 is found in a known system location or in
  # the NVIDIA wheel directories of the project venv; return 1 otherwise.
  local d pyver

  # Common system locations. /usr/lib/x86_64-linux-gnu is included because
  # main() adds it to LD_LIBRARY_PATH, so a library there is usable and
  # must be detected here too.
  for d in \
    /usr/lib \
    /usr/lib64 \
    /usr/lib/x86_64-linux-gnu \
    /usr/local/cuda/lib64 \
    /usr/local/cuda-12*/lib64 \
    /opt/cuda/lib64 \
    /opt/cuda/targets/x86_64-linux/lib; do
    if [[ -e "$d/libcublas.so.12" ]]; then
      return 0
    fi
  done

  # CUDA libraries installed into the venv by the nvidia-*-cu12 pip wheels.
  if [[ -x "$VENV_DIR/bin/python" ]]; then
    # Quoted so a venv path containing spaces is not word-split.
    pyver="$("$VENV_DIR/bin/python" -c 'import sys;print(f"{sys.version_info.major}.{sys.version_info.minor}")' 2>/dev/null || true)"
    if [[ -n "$pyver" ]]; then
      for d in "$VENV_DIR/lib/python$pyver/site-packages/nvidia/cublas/lib" \
               "$VENV_DIR/lib/python$pyver/site-packages/nvidia/cudnn/lib" \
               "$VENV_DIR/lib/python$pyver/site-packages/nvidia/cuda_runtime/lib"; do
        if [[ -e "$d/libcublas.so.12" ]]; then
          return 0
        fi
      done
    fi
  fi
  return 1
}
|
||||||
|
|
||||||
|
ensure_cuda_runtime() {
  # Make sure libcublas.so.12 is available, or exit with status 6.
  # Offline, the library must already be present. Online, one best-effort
  # install is attempted through the system package manager (needs sudo),
  # followed by a re-check.
  local mgr
  mgr="$(detect_pkg_mgr)"

  if has_libcublas12; then
    return 0
  fi

  if [[ $OFFLINE -eq 1 ]]; then
    echo "CUDA runtime (libcublas.so.12) not found and offline mode is enabled. Install CUDA 12 runtime or rerun with --online." >&2
    exit 6
  fi

  if command -v sudo >/dev/null 2>&1; then
    log "CUDA cuBLAS 12 not found; attempting to install CUDA runtime (manager: $mgr)"
    # Install failures are tolerated; the re-check below decides the outcome.
    set +e
    case "$mgr" in
      apt)
        sudo apt-get update -y || true
        sudo apt-get install -y nvidia-cuda-toolkit || true
        ;;
      pacman)
        sudo pacman -Sy --noconfirm cuda cudnn || true
        ;;
      dnf|yum)
        sudo "$mgr" install -y cuda cudnn || true
        ;;
      zypper)
        sudo zypper install -y cuda cudnn || true
        ;;
      *)
        log "Unknown package manager; cannot install CUDA automatically."
        ;;
    esac
    set -e
  else
    log "sudo not found; skipping CUDA runtime install attempt."
  fi

  # Re-check after the install attempt.
  if ! has_libcublas12; then
    echo "CUDA runtime (libcublas.so.12) not found after attempted install. Please install CUDA 12 toolkit/runtime and re-run." >&2
    exit 6
  fi
}
|
||||||
|
|
||||||
|
install_system_deps() {
  # Check for ffmpeg and espeak-ng (and libsndfile when FW_DIARIZE=1).
  # Offline, any missing dependency exits with status 5. Online, the missing
  # packages are installed best-effort through the detected package manager;
  # install failures are logged and do not abort the script.
  have_cmd() { command -v "$1" >/dev/null 2>&1; }

  local need_ffmpeg=0 need_espeak=0 need_libsndfile=0
  local -a missing=() pkgs=()
  local lib present="ffmpeg, espeak-ng"

  have_cmd ffmpeg || { need_ffmpeg=1; missing+=(ffmpeg); }
  have_cmd espeak-ng || { need_espeak=1; missing+=(espeak-ng); }

  if [[ "${FW_DIARIZE:-}" == "1" ]]; then
    # Heuristic: look for the shared library in common directories. The
    # glob also matches versioned names such as libsndfile.so.1, since the
    # unversioned name may be absent even when the library is installed.
    need_libsndfile=1
    for lib in /usr/lib/x86_64-linux-gnu/libsndfile.so* /usr/lib/libsndfile.so* /usr/lib64/libsndfile.so*; do
      if [[ -e "$lib" ]]; then
        need_libsndfile=0
        break
      fi
    done
    if [[ $need_libsndfile -eq 1 ]]; then
      missing+=(libsndfile)
    else
      present+=", libsndfile"
    fi
  fi

  if (( ${#missing[@]} == 0 )); then
    log "System deps present: $present"
    return 0
  fi

  if [[ $OFFLINE -eq 1 ]]; then
    echo "Missing system dependencies (${missing[*]}) but running in offline mode. Install them or rerun with --online." >&2
    exit 5
  fi

  local mgr; mgr="$(detect_pkg_mgr)"
  log "Detected package manager: $mgr (installing missing: ${missing[*]})"

  if ! command -v sudo >/dev/null 2>&1; then
    log "sudo not found; skipping system package installation attempt."
    return 0
  fi

  # Avoid exiting on install errors; continue best-effort
  set +e
  case "$mgr" in
    apt)
      sudo apt-get update -y || log "apt-get update failed; continuing"
      pkgs=(python3-venv python3-pip)
      [[ $need_ffmpeg -eq 1 ]] && pkgs+=(ffmpeg)
      [[ $need_espeak -eq 1 ]] && pkgs+=(espeak-ng)
      sudo apt-get install -y "${pkgs[@]}" || log "apt-get install failed; continuing"
      # Installed separately so that an unavailable libsndfile package
      # cannot fail the whole transaction above.
      if [[ $need_libsndfile -eq 1 ]]; then
        sudo apt-get install -y libsndfile1 || log "apt-get install libsndfile1 failed; continuing"
      fi
      ;;
    dnf|yum)
      # "python3-venv" is a Debian package name; request plain python3
      # here so an unknown name cannot fail the transaction.
      pkgs=(python3 python3-pip)
      [[ $need_ffmpeg -eq 1 ]] && pkgs+=(ffmpeg)
      [[ $need_espeak -eq 1 ]] && pkgs+=(espeak-ng)
      [[ $need_libsndfile -eq 1 ]] && pkgs+=(libsndfile)
      sudo "$mgr" install -y "${pkgs[@]}" || log "$mgr install failed; continuing"
      ;;
    pacman)
      pkgs=(python-virtualenv python-pip)
      [[ $need_ffmpeg -eq 1 ]] && pkgs+=(ffmpeg)
      [[ $need_espeak -eq 1 ]] && pkgs+=(espeak-ng)
      [[ $need_libsndfile -eq 1 ]] && pkgs+=(libsndfile)
      sudo pacman -Sy --noconfirm "${pkgs[@]}" || log "pacman install failed; continuing"
      ;;
    zypper)
      pkgs=(python311-virtualenv python311-pip)
      [[ $need_ffmpeg -eq 1 ]] && pkgs+=(ffmpeg)
      [[ $need_espeak -eq 1 ]] && pkgs+=(espeak-ng)
      [[ $need_libsndfile -eq 1 ]] && pkgs+=(libsndfile1)
      sudo zypper install -y "${pkgs[@]}" || log "zypper install failed; continuing"
      ;;
    *)
      log "Unknown package manager; please ensure ffmpeg and espeak-ng are installed."
      ;;
  esac
  set -e
}
|
||||||
|
|
||||||
|
setup_venv() {
  # Create the project virtualenv on first use and activate it in the
  # current shell. pip, wheel and setuptools are upgraded only when network
  # access is allowed.
  if ! [[ -d "$VENV_DIR" ]]; then
    log "Creating venv at $VENV_DIR"
    python3 -m venv "$VENV_DIR"
  fi

  # shellcheck disable=SC1091
  source "$VENV_DIR/bin/activate"

  if (( OFFLINE == 0 )); then
    python -m pip install --upgrade pip wheel setuptools
  fi
}
|
||||||
|
|
||||||
|
install_python_deps() {
  # Install, or when offline only verify, the Python packages for the runner.
  #   $1 - 1 if an NVIDIA GPU was detected (install the CUDA 12 stack too),
  #        otherwise 0 (default).
  # Offline: exits 7 if faster_whisper cannot be imported; missing
  # diarization packages only produce a warning.
  local has_nvidia_flag="${1:-0}"
  # Shared prefix of every quiet, single-retry, upgrading pip call below.
  local -a pip_install=(python -m pip install -q --retries 1 --upgrade)

  log "Installing faster-whisper and dependencies"
  export PIP_DISABLE_PIP_VERSION_CHECK=1
  export PIP_DEFAULT_TIMEOUT=${PIP_DEFAULT_TIMEOUT:-20}

  if [[ $OFFLINE -eq 1 ]]; then
    # Offline: install nothing, just verify the module is importable.
    if ! python -c 'import faster_whisper' >/dev/null 2>&1; then
      echo "Python dependency 'faster_whisper' not found in offline mode. Run with --online to install." >&2
      exit 7
    fi
    # Diarization dependencies are optional: warn, never fail.
    if [[ "${FW_DIARIZE:-}" == "1" ]]; then
      python - <<'PY' || true
try:
    import soundfile, speechbrain, torch  # noqa: F401
except Exception as e:
    print(f"[WARN] Diarization deps missing offline ({e}); speaker labels will be skipped.")
PY
    fi
    return 0
  fi

  if [[ "$has_nvidia_flag" -eq 1 ]]; then
    # Only fetch the CUDA-enabled CTranslate2 wheel when none is installed.
    if ! "$VENV_DIR/bin/python" -c 'import ctranslate2' >/dev/null 2>&1; then
      log "Installing CUDA-enabled CTranslate2 (cu12 wheel)"
      "${pip_install[@]}" "ctranslate2<5,>=4.0" --extra-index-url https://download.opennmt.net/ctranslate2/cu12 ||
        log "Warning: could not reach cu12 wheel index; will proceed with available ctranslate2"
    fi
    # CUDA 12 runtime libraries installed inside the venv.
    "${pip_install[@]}" nvidia-cublas-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 ||
      log "Warning: failed to install NVIDIA cu12 runtime libs via pip"
  fi

  "${pip_install[@]}" faster-whisper ffmpeg-python

  # Diarization packages are best-effort when requested.
  if [[ "${FW_DIARIZE:-}" == "1" ]]; then
    "${pip_install[@]}" soundfile speechbrain ||
      log "Warning: failed to install soundfile/speechbrain"
    # Force the CPU wheels to avoid mismatched CUDA builds of torch.
    "${pip_install[@]}" --force-reinstall --index-url https://download.pytorch.org/whl/cpu torch torchaudio ||
      log "Warning: failed to install torch/torchaudio CPU wheels"
  fi

  python -c 'import sys; print(f"[PY] Python {sys.version.split()[0]} dependencies installed.")'
}
|
||||||
|
|
||||||
|
ensure_runner() {
  # Exit with status 3 unless the Python runner script exists.
  [[ -f "$PY_RUNNER" ]] && return 0
  echo "Runner not found: $PY_RUNNER" >&2
  exit 3
}
|
||||||
|
|
||||||
|
generate_test_audio() {
  # Create a short test WAV in the project directory and print its path on
  # stdout. Generators are tried in order until the file is non-empty:
  # espeak-ng, espeak, a Python stdlib sine tone, then an ffmpeg sine tone.
  # All log output goes to stderr because the caller captures stdout.
  local tmpwav="${PROJECT_DIR}/test_fw.wav"
  local phrase="This is a quick test of faster whisper transcription."

  if command -v espeak-ng >/dev/null 2>&1; then
    log "Generating test audio via espeak-ng -> $tmpwav" >&2
    espeak-ng -w "$tmpwav" "$phrase" >/dev/null 2>&1 || true
  fi

  # espeak-ng missing or failed: try classic espeak.
  if [[ ! -s "$tmpwav" ]] && command -v espeak >/dev/null 2>&1; then
    log "espeak-ng unavailable or failed; trying espeak -> $tmpwav" >&2
    espeak -w "$tmpwav" "$phrase" >/dev/null 2>&1 || true
  fi

  # No speech synthesizer worked: write a tone using only the Python stdlib.
  if [[ ! -s "$tmpwav" ]]; then
    log "Generating 3s 1kHz WAV via Python stdlib -> $tmpwav" >&2
    python3 -c 'import sys,wave,math,array;outfile=sys.argv[1];fr=16000;dur=3;freq=1000.0;ampl=0.3;n=fr*dur;data=array.array("h",[int(max(-1.0,min(1.0,ampl*math.sin(2*math.pi*freq*(i/fr))))*32767) for i in range(n)]);wf=wave.open(outfile,"w");wf.setnchannels(1);wf.setsampwidth(2);wf.setframerate(fr);wf.writeframes(data.tobytes());wf.close()' "$tmpwav" || true
  fi

  # Last resort: let ffmpeg synthesize the tone.
  if [[ ! -s "$tmpwav" ]]; then
    log "Creating a 3s sine tone WAV via ffmpeg -> $tmpwav" >&2
    ffmpeg -f lavfi -i sine=frequency=1000:duration=3 -ar 16000 -ac 1 -f wav -y "$tmpwav" >/dev/null 2>&1 || true
  fi

  echo "$tmpwav"
}
|
||||||
|
|
||||||
|
prepare_model() {
  # Download model $1 into "$MODEL_DIR/$1" for later offline use.
  #
  # The files go directly into that directory rather than into a Hugging
  # Face cache layout below MODEL_DIR, because the offline lookup in main()
  # tests for the directory "$MODEL_DIR/$MODEL". The name and target are
  # passed to Python on the command's own environment, so this function
  # does not rely on variables exported by the caller.
  #
  # NOTE: if the download fails part-way, a partial "$MODEL_DIR/$1" may be
  # left behind; remove it before retrying offline.
  local name="$1"
  local target="$MODEL_DIR/$name"
  mkdir -p "$MODEL_DIR"
  # shellcheck disable=SC1091
  source "$VENV_DIR/bin/activate"
  log "Preparing model '$name' into $target"
  FW_PREPARE_NAME="$name" FW_MODEL_DIR="$target" python - <<'PY'
import os

from faster_whisper.utils import download_model

name = os.environ["FW_PREPARE_NAME"]
target = os.environ["FW_MODEL_DIR"]
print(f"[PY] Preparing model '{name}' into {target}")
download_model(name, output_dir=target)
print("[PY] Model prepared.")
PY
}
|
||||||
|
|
||||||
|
main() {
  # Entry point: parse options, prepare the environment, then either
  # download a model (--prepare-model) or transcribe the input file. When no
  # input file is given, a short test clip is generated and transcribed.
  #
  # Exit codes used here: 2 usage error or input not found, 4 test audio
  # could not be generated, 6 CUDA check or CUDA run failed. The helper
  # functions called from here may exit with their own codes.

  # Defaults
  OFFLINE=1
  PREPARE_MODEL=""
  MODEL_DIR="$PROJECT_DIR/models"
  MODEL="large-v3"
  LANGUAGE=""
  OUTDIR=""
  INPUT_FILE=""

  # Parse args
  PARSED=$(getopt -o m:l:o:h -l online,prepare-model:,model-dir: -- "$@") || { usage; exit 2; }
  eval set -- "$PARSED"
  while true; do
    case "$1" in
      -m) MODEL="$2"; shift 2;;
      -l) LANGUAGE="$2"; shift 2;;
      -o) OUTDIR="$2"; shift 2;;
      -h) usage; exit 0;;
      --online) OFFLINE=0; shift;;
      --prepare-model) PREPARE_MODEL="$2"; OFFLINE=0; shift 2;;
      --model-dir) MODEL_DIR="$2"; shift 2;;
      --) shift; break;;
      *) break;;
    esac
  done
  INPUT_FILE="${1:-}"

  if [[ $OFFLINE -eq 1 ]]; then
    # Ask the Hugging Face libraries not to use the network.
    export HF_HUB_OFFLINE=1
    export TRANSFORMERS_OFFLINE=1
  fi

  install_system_deps
  setup_venv

  # If asked to prepare a model, do that and exit
  if [[ -n "$PREPARE_MODEL" ]]; then
    if [[ $OFFLINE -eq 1 ]]; then
      echo "--prepare-model requires network; rerun with --online." >&2
      exit 2
    fi
    install_python_deps 0
    export FW_PREPARE_NAME="$PREPARE_MODEL"
    export FW_MODEL_DIR="$MODEL_DIR"
    prepare_model "$PREPARE_MODEL"
    log "Model '$PREPARE_MODEL' downloaded to $MODEL_DIR"
    exit 0
  fi

  # Detect NVIDIA GPU and enforce CUDA if present
  has_nvidia=0
  if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L >/dev/null 2>&1; then
    has_nvidia=1
  fi
  install_python_deps "$has_nvidia"
  ensure_runner

  local input="$INPUT_FILE"
  if [[ -z "$input" ]]; then
    input="$(generate_test_audio)"
    if [[ ! -s "$input" ]]; then
      echo "Failed to generate test audio. Please provide an audio file." >&2
      exit 4
    fi
  fi

  if [[ ! -f "$input" ]]; then
    echo "Input file not found: $input" >&2
    exit 2
  fi

  # Offline: prefer a local model directory when one exists; otherwise the
  # model name is passed through and must already be in the cache. This is
  # resolved before the argument list is built so that it applies to CPU and
  # GPU runs alike and never discards options added below.
  local model_arg="$MODEL"
  if [[ $OFFLINE -eq 1 ]]; then
    if [[ -d "$MODEL" ]]; then
      model_arg="$MODEL"
    elif [[ -d "$MODEL_DIR/$MODEL" ]]; then
      model_arg="$MODEL_DIR/$MODEL"
    fi
  fi

  local args=("$input" "--model" "$model_arg")
  if [[ -n "$LANGUAGE" ]]; then
    args+=("--language" "$LANGUAGE")
  fi
  if [[ -n "$OUTDIR" ]]; then
    args+=("--outdir" "$OUTDIR")
  fi

  # Pass diarization via env if requested
  if [[ "${FW_DIARIZE:-}" == "1" ]]; then
    args+=("--diarize")
    if [[ -n "${FW_NUM_SPEAKERS:-}" ]]; then
      args+=("--num-speakers" "${FW_NUM_SPEAKERS}")
    fi
  fi

  if [[ $has_nvidia -eq 1 ]]; then
    ensure_cuda_runtime
    # Export common CUDA paths in case the env lacks them
    export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
    # Include system and possible venv-provided CUDA libs
    local pyver venv_cuda_paths=""
    if [[ -x "$VENV_DIR/bin/python" ]]; then
      pyver="$("$VENV_DIR/bin/python" -c 'import sys;print(f"{sys.version_info.major}.{sys.version_info.minor}")' 2>/dev/null || true)"
      if [[ -n "$pyver" ]]; then
        venv_cuda_paths="$VENV_DIR/lib/python$pyver/site-packages/nvidia/cublas/lib:$VENV_DIR/lib/python$pyver/site-packages/nvidia/cudnn/lib:$VENV_DIR/lib/python$pyver/site-packages/nvidia/cuda_runtime/lib"
      fi
    fi
    # Join only non-empty parts: an empty entry in LD_LIBRARY_PATH is
    # treated as the current directory by the dynamic loader.
    local cuda_lib_path="${CUDA_HOME}/lib64:/usr/lib/x86_64-linux-gnu:/opt/cuda/lib64:/opt/cuda/targets/x86_64-linux/lib"
    if [[ -n "$venv_cuda_paths" ]]; then
      cuda_lib_path+=":${venv_cuda_paths}"
    fi
    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${cuda_lib_path}"
    export PATH="${PATH}:${CUDA_HOME}/bin"
    # shellcheck disable=SC1091
    source "$VENV_DIR/bin/activate"
    # Smoke test: abort early if a model cannot be initialized on the GPU.
    # NOTE(review): this loads the "tiny" model, so in offline mode that
    # model presumably has to be in the local cache already - confirm.
    python -c 'from faster_whisper import WhisperModel; WhisperModel("tiny", device="cuda", compute_type="float16"); print("[PY] CUDA test init succeeded.")' || { echo "CUDA environment check failed. Aborting as requested." >&2; exit 6; }
    args+=("--device" "cuda")
  fi

  log "Transcribing: $input"
  # shellcheck disable=SC1091
  source "$VENV_DIR/bin/activate"
  if [[ $has_nvidia -eq 1 ]]; then
    if ! python "$PY_RUNNER" "${args[@]}"; then
      echo "CUDA execution requested due to detected NVIDIA GPU, but it failed. Aborting as requested (no CPU fallback)." >&2
      exit 6
    fi
  else
    python "$PY_RUNNER" "${args[@]}"
  fi
}

main "$@"
|
||||||
|
|
||||||
Loading…
Reference in New Issue
Block a user