# Download the audio track of a video with yt-dlp and transcribe it to
# SubRip (.srt) subtitles with whisper-cli.
#
# Globals:
#   WHISPER_MODEL (read) - value passed to whisper-cli as --model.
# Arguments:
#   $1 - URL handed to yt-dlp.
#   $2 - output base path; the intermediate audio is written to "$2.wav" and
#        "$2" is passed to whisper-cli as --output-file.
#   $3 - language passed to whisper-cli (default: en).
# Returns:
#   0 on success, 2 on a usage error, 1 if WHISPER_MODEL is unset, otherwise
#   the exit status of the tool that failed.
yt_to_srt() {
  local url="${1-}"
  local output_base="${2-}"
  local language="${3:-en}"
  local wav_file
  local rc=0

  if [ -z "$url" ] || [ -z "$output_base" ]; then
    printf 'usage: yt_to_srt URL OUTPUT_BASE [LANGUAGE]\n' >&2
    return 2
  fi
  if [ -z "${WHISPER_MODEL-}" ]; then
    printf 'yt_to_srt: WHISPER_MODEL is not set\n' >&2
    return 1
  fi

  wav_file="$output_base.wav"

  # Only transcribe when the download succeeded; otherwise whisper-cli would
  # be pointed at a missing (or stale) WAV file.
  if yt-dlp -x --audio-format wav --postprocessor-args "-ar 16000" \
    -o "$wav_file" -- "$url"; then
    # --output-srt (not --output-vtt): the function promises SRT output.
    whisper-cli --language "$language" --model "$WHISPER_MODEL" \
      --split-on-word --max-len 65 --output-srt \
      --output-file "$output_base" --file "$wav_file" || rc=$?
  else
    rc=$?
  fi

  # The WAV is only an intermediate artifact; drop it on every exit path.
  rm -f -- "$wav_file"
  return "$rc"
}
# Transcribe a local media file to SubRip (.srt) subtitles with whisper-cli.
#
# ffmpeg first converts the input's audio to 16 kHz mono 16-bit PCM WAV. The
# WAV lives in a private temporary directory so it can never collide with
# (and later delete) the input file, e.g. when the input is itself
# "<name>.wav" in the current directory.
#
# Globals:
#   WHISPER_MODEL (read) - value passed to whisper-cli as --model.
# Arguments:
#   $1 - path of the media file given to ffmpeg.
#   $2 - language passed to whisper-cli (default: en).
# Outputs:
#   whisper-cli receives --output-file "<input file name without its last
#   extension>", i.e. a path relative to the current directory.
# Returns:
#   0 on success, 2 on a usage error, 1 if WHISPER_MODEL is unset, otherwise
#   the exit status of the step that failed.
file_to_srt() {
  local filepath="${1-}"
  local language="${2:-en}"
  local filename output_base tmp_dir temp_wav
  local rc=0

  if [ -z "$filepath" ]; then
    printf 'usage: file_to_srt FILE [LANGUAGE]\n' >&2
    return 2
  fi
  if [ -z "${WHISPER_MODEL-}" ]; then
    printf 'file_to_srt: WHISPER_MODEL is not set\n' >&2
    return 1
  fi

  # Declared separately above so a basename/mktemp failure is not masked by
  # the exit status of `local`.
  filename=$(basename -- "$filepath") || return
  output_base="${filename%.*}"
  # A dotfile such as ".clip" would otherwise produce an empty output name.
  if [ -z "$output_base" ]; then
    output_base="$filename"
  fi

  tmp_dir=$(mktemp -d) || return
  temp_wav="$tmp_dir/audio.wav"

  # Only transcribe when the conversion succeeded.
  if ffmpeg -i "$filepath" -vn -acodec pcm_s16le -ar 16000 -ac 1 "$temp_wav"; then
    # --output-srt (not --output-vtt): the function promises SRT output.
    whisper-cli --language "$language" --model "$WHISPER_MODEL" \
      --split-on-word --max-len 65 --output-srt \
      --output-file "$output_base" --file "$temp_wav" || rc=$?
  else
    rc=$?
  fi

  # Remove the intermediate WAV on every exit path.
  rm -rf -- "$tmp_dir"
  return "$rc"
}
Plus an additional bootstrap script for the large-v3-turbo model, from my chezmoi dotfiles:
#!/bin/bash
# Download whisper.cpp models from Hugging Face (runs once per machine).
#
# Each model listed in MODELS is fetched from BASE_URL into MODELS_DIR unless
# a file with that name is already present there.
set -euo pipefail

MODELS_DIR="$HOME/whisper-models"
BASE_URL="https://huggingface.co/ggerganov/whisper.cpp/resolve/main"
MODELS=("ggml-large-v3-turbo.bin" "ggml-tiny.bin")

mkdir -p "$MODELS_DIR"

for model in "${MODELS[@]}"; do
  target="$MODELS_DIR/$model"
  if [[ ! -f "$target" ]]; then
    echo "Downloading $model..."
    # Download to a side file and rename only on success. Otherwise an
    # interrupted transfer or an HTTP error page would be left under the
    # final name and skipped as "already exists" on the next run.
    partial="$target.part"
    # -f makes curl exit non-zero on HTTP errors instead of saving the
    # server's error body as the model.
    if curl -fL --progress-bar -o "$partial" "$BASE_URL/$model"; then
      mv -- "$partial" "$target"
    else
      rc=$?
      rm -f -- "$partial"
      echo "Failed to download $model (curl exit $rc)" >&2
      exit "$rc"
    fi
  else
    echo "$model already exists, skipping."
  fi
done
echo "Whisper models ready at $MODELS_DIR"
I think I’m gonna go read a book.
# Install yapsnap from a fresh git clone into the virtualenv ~/yapsnap-venv.
# Every step runs only if the previous one succeeded.
sudo apt update \
  && sudo apt install -y ffmpeg python3-pip python3-venv \
  && git clone https://github.com/kouhxp/yapsnap.git \
  && cd yapsnap \
  && python3 -m venv ~/yapsnap-venv \
  && source ~/yapsnap-venv/bin/activate \
  && pip install --upgrade pip \
  && pip install .
On a 32GB ThinkPad X13, yapsnap processed a 21-minute YouTube video in under 2 minutes.
Very well done!
I guess if it encourages you to install and figure out how to use ffmpeg, yt-dlp, kroko, numpy, and onnx that's a good thing. Sometimes just knowing a thing is possible is a huge benefit.
My biggest challenge is finding a language model that is both fast enough and accurate enough, since I have to caption about 600 hours of video per week and would prefer to run all of this on a tiny server (2 cores, 4 GB of memory). This tool could easily do that with the kroko model, but I'll have to test whether the accuracy is good enough.
Also, in my own scripts I'm using ffmpeg to download just the audio of the videos that I want to caption, which saves a lot of bandwidth and speeds up the whole process. As far as I can see, this tool doesn't do that. It would be a nice feature to add, along with an option to turn the output into a working .srt file.
yapsnap "https://www.youtube.com/watch?v=NzKJ-xO-VhE" --diarize
SPEAKER_00 [00:00]: Welcome to the show.
SPEAKER_01 [00:03]: Glad to be here, thanks for having me.
SPEAKER_00 [00:08]: Let's get started.