2025-10-12 14:46:55 +02:00
|
|
|
#!/usr/bin/env python3
|
2026-03-13 20:42:39 +01:00
|
|
|
"""Transcribe audio with faster-whisper and write .txt and .srt."""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
2025-10-12 14:46:55 +02:00
|
|
|
import argparse
|
2026-03-13 20:42:39 +01:00
|
|
|
import importlib
|
|
|
|
|
import logging
|
2025-10-12 14:46:55 +02:00
|
|
|
import os
|
2026-03-13 20:42:39 +01:00
|
|
|
from pathlib import Path
|
2025-10-12 14:46:55 +02:00
|
|
|
import sys
|
|
|
|
|
import time
|
2026-03-13 20:42:39 +01:00
|
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
|
|
|
|
|
|
if TYPE_CHECKING:
|
|
|
|
|
import types
|
|
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
# Constants
|
|
|
|
|
_PROGRESS_THROTTLE_SEC = 0.2
|
|
|
|
|
_SECONDS_PER_DAY = 60 * 60 * 24
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _try_import(name: str) -> types.ModuleType | None:
|
|
|
|
|
"""Attempt to import a module, returning None on failure."""
|
|
|
|
|
try:
|
|
|
|
|
return importlib.import_module(name)
|
|
|
|
|
except ImportError:
|
|
|
|
|
return None
|
2025-12-20 21:49:52 +01:00
|
|
|
|
|
|
|
|
|
2026-03-13 20:42:39 +01:00
|
|
|
def _parse_args() -> argparse.Namespace:
|
|
|
|
|
"""Parse command-line arguments."""
|
2026-02-20 01:17:53 +01:00
|
|
|
parser = argparse.ArgumentParser(
|
2026-03-21 17:51:36 +01:00
|
|
|
description=("Transcribe audio with faster-whisper and write .txt and .srt"),
|
2026-02-20 01:17:53 +01:00
|
|
|
)
|
2026-03-17 22:47:42 +01:00
|
|
|
parser.add_argument("input", help="Path to audio/video file")
|
2026-02-20 01:17:53 +01:00
|
|
|
parser.add_argument(
|
|
|
|
|
"--model",
|
2026-03-17 22:47:42 +01:00
|
|
|
default=os.environ.get("FW_MODEL", "large-v3"),
|
2026-02-20 01:17:53 +01:00
|
|
|
help="Model size or path (default: large-v3)",
|
|
|
|
|
)
|
|
|
|
|
parser.add_argument(
|
|
|
|
|
"--language",
|
|
|
|
|
default=None,
|
2026-03-13 20:42:39 +01:00
|
|
|
help="Language code (e.g., en). None=auto",
|
2026-02-20 01:17:53 +01:00
|
|
|
)
|
|
|
|
|
parser.add_argument(
|
|
|
|
|
"--device",
|
|
|
|
|
default=os.environ.get("FW_DEVICE", "auto"),
|
|
|
|
|
choices=["auto", "cpu", "cuda"],
|
|
|
|
|
help="Device to run on",
|
|
|
|
|
)
|
|
|
|
|
parser.add_argument(
|
|
|
|
|
"--compute-type",
|
|
|
|
|
dest="compute_type",
|
|
|
|
|
default=os.environ.get("FW_COMPUTE", "auto"),
|
2026-03-13 20:42:39 +01:00
|
|
|
help="Compute type (auto,int8,float16,...)",
|
2026-02-20 01:17:53 +01:00
|
|
|
)
|
|
|
|
|
parser.add_argument(
|
2026-03-13 20:42:39 +01:00
|
|
|
"--outdir",
|
|
|
|
|
default=None,
|
|
|
|
|
help="Output dir (default: next to input)",
|
2026-02-20 01:17:53 +01:00
|
|
|
)
|
|
|
|
|
parser.add_argument(
|
2026-03-13 20:42:39 +01:00
|
|
|
"--no-progress",
|
|
|
|
|
action="store_true",
|
|
|
|
|
help="Disable live progress output",
|
2026-02-20 01:17:53 +01:00
|
|
|
)
|
|
|
|
|
parser.add_argument(
|
2026-03-13 20:42:39 +01:00
|
|
|
"--diarize",
|
|
|
|
|
action="store_true",
|
|
|
|
|
help="Enable speaker diarization (labels)",
|
2026-02-20 01:17:53 +01:00
|
|
|
)
|
|
|
|
|
parser.add_argument(
|
|
|
|
|
"--num-speakers",
|
|
|
|
|
type=int,
|
2026-03-17 22:47:42 +01:00
|
|
|
default=int(os.environ.get("FW_NUM_SPEAKERS", "2")),
|
2026-03-13 20:42:39 +01:00
|
|
|
help="Number of speakers (default: 2)",
|
2026-02-20 01:17:53 +01:00
|
|
|
)
|
2026-03-13 20:42:39 +01:00
|
|
|
return parser.parse_args()
|
2025-10-12 14:46:55 +02:00
|
|
|
|
|
|
|
|
|
2026-03-13 20:42:39 +01:00
|
|
|
def _resolve_device_and_compute(
|
|
|
|
|
args: argparse.Namespace,
|
|
|
|
|
) -> tuple[str, str]:
|
|
|
|
|
"""Resolve device and compute_type from args."""
|
2025-10-12 14:46:55 +02:00
|
|
|
device = args.device
|
|
|
|
|
compute_type = args.compute_type
|
|
|
|
|
if device == "auto":
|
|
|
|
|
device = "cpu"
|
|
|
|
|
if compute_type == "auto":
|
2026-03-17 22:47:42 +01:00
|
|
|
compute_type = "float16" if device == "cuda" else "float32"
|
2026-03-13 20:42:39 +01:00
|
|
|
return device, compute_type
|
2025-12-20 21:49:52 +01:00
|
|
|
|
2025-10-12 14:46:55 +02:00
|
|
|
|
2026-03-13 20:42:39 +01:00
|
|
|
def _run_progress_loop(
|
|
|
|
|
args: argparse.Namespace,
|
|
|
|
|
model: object,
|
|
|
|
|
inp: str,
|
|
|
|
|
total_duration: float | None,
|
|
|
|
|
) -> tuple[list[Any], object]:
|
|
|
|
|
"""Transcribe with live progress output."""
|
2025-10-12 14:46:55 +02:00
|
|
|
start_ts = time.time()
|
2026-03-17 22:47:42 +01:00
|
|
|
iter_segments, info = model.transcribe(inp, language=args.language)
|
2026-03-13 20:42:39 +01:00
|
|
|
collected: list[Any] = []
|
2025-10-12 14:46:55 +02:00
|
|
|
processed = 0.0
|
2026-03-13 20:42:39 +01:00
|
|
|
last_prt = 0.0
|
2025-10-12 14:46:55 +02:00
|
|
|
tty = sys.stderr.isatty()
|
2026-03-13 20:42:39 +01:00
|
|
|
|
2025-10-12 14:46:55 +02:00
|
|
|
for seg in iter_segments:
|
|
|
|
|
collected.append(seg)
|
|
|
|
|
if getattr(seg, "end", None) is not None:
|
2026-03-17 22:47:42 +01:00
|
|
|
processed = max(processed, float(seg.end))
|
2025-10-12 14:46:55 +02:00
|
|
|
now = time.time()
|
2026-03-17 22:47:42 +01:00
|
|
|
if not args.no_progress and (tty or (now - last_prt) >= _PROGRESS_THROTTLE_SEC):
|
2026-03-13 20:42:39 +01:00
|
|
|
last_prt = now
|
|
|
|
|
line = _format_progress_line(
|
|
|
|
|
processed,
|
|
|
|
|
total_duration,
|
|
|
|
|
now,
|
|
|
|
|
start_ts,
|
|
|
|
|
)
|
2025-10-12 14:46:55 +02:00
|
|
|
if tty:
|
2026-03-13 20:42:39 +01:00
|
|
|
logger.info("\r%s", line)
|
2025-10-12 14:46:55 +02:00
|
|
|
else:
|
2026-03-13 20:42:39 +01:00
|
|
|
logger.info("%s", line)
|
|
|
|
|
|
|
|
|
|
if not args.no_progress and tty:
|
|
|
|
|
logger.info("")
|
|
|
|
|
|
|
|
|
|
return collected, info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _format_progress_line(
    processed: float,
    total_duration: float | None,
    now: float,
    start_ts: float,
) -> str:
    """Build the text of one ``[PROGRESS]`` report.

    Time values are rendered with ``hhmmss`` from ``_transcribe_output``.

    Args:
        processed: Seconds of media transcribed so far.
        total_duration: Total media length in seconds. When it is falsy or not
            positive, only the processed position is reported.
        now: Timestamp of this report.
        start_ts: Timestamp at which transcription started, on the same clock
            as ``now``; only the difference between the two is used.

    Returns:
        Either ``"[PROGRESS] processed <time>"`` or a line with position,
        total and percentage, followed by an ETA when some media has been
        processed and the estimate is shorter than one day.
    """
    from _transcribe_output import hhmmss

    if not (total_duration and total_duration > 0):
        return f"[PROGRESS] processed {hhmmss(processed)}"

    # Clamp to 0-100 so an overshooting segment end cannot report >100%.
    percent = max(0.0, min(100.0, (processed / total_duration) * 100.0))
    elapsed = now - start_ts
    head = f"[PROGRESS] {hhmmss(processed)} / {hhmmss(total_duration)} ({percent:5.1f}%)"

    eta_suffix = ""
    if processed > 0:
        # Media seconds handled per elapsed second; the floors avoid a
        # division by zero on the very first report.
        speed = processed / max(1e-6, elapsed)
        left = max(0.0, total_duration - processed)
        eta = left / max(1e-6, speed)
        if eta < _SECONDS_PER_DAY:
            eta_suffix = f" ETA ~{hhmmss(eta)}"
    return head + eta_suffix
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _write_diarized_outputs(
|
|
|
|
|
args: argparse.Namespace,
|
|
|
|
|
inp: str,
|
|
|
|
|
outdir: Path,
|
|
|
|
|
base: str,
|
|
|
|
|
collected: list[Any],
|
|
|
|
|
) -> None:
|
|
|
|
|
"""Optionally diarize and write speaker outputs."""
|
|
|
|
|
if not args.diarize:
|
|
|
|
|
return
|
2026-03-17 22:47:42 +01:00
|
|
|
|
|
|
|
|
from _transcribe_diarize import diarize_segments
|
|
|
|
|
from _transcribe_output import (
|
|
|
|
|
write_rttm,
|
|
|
|
|
write_srt_with_speakers,
|
|
|
|
|
write_txt_with_speakers,
|
|
|
|
|
)
|
|
|
|
|
|
2026-03-13 20:42:39 +01:00
|
|
|
labels = diarize_segments(
|
|
|
|
|
inp,
|
|
|
|
|
collected,
|
|
|
|
|
num_speakers=args.num_speakers,
|
|
|
|
|
)
|
2026-03-17 22:47:42 +01:00
|
|
|
if labels is not None and len(labels) == len(collected):
|
2026-03-13 20:42:39 +01:00
|
|
|
diar_srt = str(outdir / (base + ".diar.srt"))
|
|
|
|
|
diar_txt = str(outdir / (base + ".diar.txt"))
|
|
|
|
|
rttm_path = str(outdir / (base + ".rttm"))
|
2026-03-17 22:47:42 +01:00
|
|
|
write_srt_with_speakers(collected, labels, diar_srt)
|
|
|
|
|
write_txt_with_speakers(collected, labels, diar_txt)
|
2026-03-13 20:42:39 +01:00
|
|
|
write_rttm(
|
|
|
|
|
collected,
|
|
|
|
|
labels,
|
|
|
|
|
rttm_path,
|
|
|
|
|
file_id=base,
|
|
|
|
|
)
|
|
|
|
|
logger.info("Wrote: %s", diar_txt)
|
|
|
|
|
logger.info("Wrote: %s", diar_srt)
|
|
|
|
|
logger.info("Wrote: %s", rttm_path)
|
|
|
|
|
else:
|
|
|
|
|
logger.warning(
|
2026-03-21 17:51:36 +01:00
|
|
|
"Diarization failed or returned mismatched labels; writing plain.",
|
2026-03-13 20:42:39 +01:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> int:
    """Run the transcription pipeline end to end.

    Steps: configure logging, parse arguments, check that ``faster_whisper``
    can be imported and that the input exists, resolve the model (downloading
    it unless ``--model`` names an existing directory), transcribe, then write
    the optional diarized outputs followed by the plain ``.txt`` and ``.srt``.

    Returns:
        ``0`` on success, ``2`` when ``faster_whisper`` is not installed or
        the input path does not exist.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    args = _parse_args()

    fw = _try_import("faster_whisper")
    if fw is None:
        logger.error("faster-whisper is not installed in this environment.")
        return 2

    source = Path(args.input).resolve()
    if not source.exists():
        logger.error("Input file not found: %s", source)
        return 2

    inp = str(source)
    # Outputs land next to the input unless --outdir says otherwise.
    outdir = Path(args.outdir or str(source.parent) or ".").resolve()
    outdir.mkdir(parents=True, exist_ok=True)
    base = source.stem

    device, compute_type = _resolve_device_and_compute(args)
    logger.info(
        "Loading model='%s', device='%s', compute_type='%s'",
        args.model,
        device,
        compute_type,
    )

    # A directory is used as a local model as-is; anything else is fetched.
    if Path(args.model).is_dir():
        model_path: str = args.model
    else:
        from _transcribe_model import download_model_with_progress

        model_path = download_model_with_progress(args.model)

    logging.getLogger("faster_whisper").setLevel(logging.INFO)

    logger.info("Initializing model...")
    model = fw.WhisperModel(model_path, device=device, compute_type=compute_type)
    logger.info("Model loaded successfully.")

    from _transcribe_diarize import get_media_duration
    from _transcribe_output import hhmmss

    total_duration = get_media_duration(inp)
    if total_duration:
        logger.info("Media duration: %s", hhmmss(total_duration))

    collected, info = _run_progress_loop(args, model, inp, total_duration)

    logger.info(
        "Detected language: %s (prob=%s)",
        getattr(info, "language", None),
        getattr(info, "language_probability", None),
    )
    logger.info("Segments: %d", len(collected))

    _write_diarized_outputs(args, inp, outdir, base, collected)

    from _transcribe_output import write_srt, write_txt

    txt_path = str(outdir / (base + ".txt"))
    srt_path = str(outdir / (base + ".srt"))
    write_txt(collected, txt_path)
    write_srt(collected, srt_path)
    for written in (txt_path, srt_path):
        logger.info("Wrote: %s", written)
    return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|