One of the easiest and cheapest ways to host an on-demand API is via Modal.
Once you've created a free account and installed the Python client, copy or download the base code below. It's on GitHub too: Code
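If you haven't set Modal up yet, installation and authentication are roughly two commands (package and command names current as of recent Modal releases; check their docs if they've changed):
$ pip install modal
$ modal token new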
from modal import Image, Stub, method, NetworkFileSystem, asgi_app
from fastapi import Request, FastAPI
import tempfile
import time
MODEL_DIR = "/model"
web_app = FastAPI()
def download_model():
    # Runs at image-build time so the weights are baked into the image
    # instead of being re-downloaded on every cold start.
    from huggingface_hub import snapshot_download

    snapshot_download("openai/whisper-large-v3", local_dir=MODEL_DIR)
image = (
    Image.from_registry("nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04", add_python="3.9")
    .apt_install("git", "ffmpeg")
    .pip_install(
        "transformers",
        "ninja",
        "packaging",
        "wheel",
        "torch",
        "hf-transfer~=0.1",
        "ffmpeg-python",
    )
    # flash-attn is compiled during the image build, on a GPU-attached builder
    .run_commands("python -m pip install flash-attn --no-build-isolation", gpu="A10G")
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster Hugging Face downloads
    .run_function(download_model)
)
stub = Stub("whisper-v3-demo", image=image)
stub.net_file_system = NetworkFileSystem.new()
@stub.cls(
    gpu="A10G",
    allow_concurrent_inputs=80,
    container_idle_timeout=40,
    network_file_systems={"/audio_files": stub.net_file_system},
)
class WhisperV3:
    def __enter__(self):
        # Runs once per container start: load the model and build the pipeline.
        import torch
        from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            MODEL_DIR,
            torch_dtype=self.torch_dtype,
            use_safetensors=True,
            use_flash_attention_2=True,
        )
        processor = AutoProcessor.from_pretrained(MODEL_DIR)
        model.to(self.device)
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=24,
            return_timestamps=True,
            torch_dtype=self.torch_dtype,
            model_kwargs={"use_flash_attention_2": True},
            device=0,
        )

    @method()
    def generate(self, audio: bytes):
        # Write the raw bytes to a temp file so the pipeline (via ffmpeg) can read it.
        fp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        fp.write(audio)
        fp.close()

        start = time.time()
        output = self.pipe(
            fp.name, chunk_length_s=30, batch_size=24, return_timestamps=True
        )
        elapsed = time.time() - start
        return output, elapsed
@stub.function()
@web_app.post("/")
async def transcribe(request: Request):
    # Pull the uploaded file out of the multipart form and hand it to the GPU class.
    form = await request.form()
    audio = await form["audio"].read()
    output, elapsed = WhisperV3().generate.remote(audio)
    return output, elapsed


@stub.function()
@asgi_app()
def entrypoint():
    return web_app
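Before wiring up HTTP, you can smoke-test the GPU class on its own. One way (not part of the original script, so treat it as a sketch) is to append a local entrypoint to the file and invoke it with modal run modal_app.py:

# Hypothetical addition to modal_app.py: lets `modal run` call the GPU class directly.
@stub.local_entrypoint()
def main(path: str = "sample.mp3"):  # "sample.mp3" is a placeholder local audio file
    with open(path, "rb") as f:
        output, elapsed = WhisperV3().generate.remote(f.read())
    print(f"Transcribed in {elapsed:.1f}s: {output['text'][:200]}")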
After authenticating with the Modal CLI, save the code above as modal_app.py and run this in your terminal:
$ modal deploy modal_app.py
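If you'd rather iterate before deploying for real, Modal can also serve the app at a temporary URL and hot-reload it as you edit the file:
$ modal serve modal_app.py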
Now you can make requests! Remember to fill in the missing info:
$ curl -X POST -F "audio=@<file>" https://<org_name>--whisper-v3-demo-entrypoint.modal.run
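For programmatic access, the same request from Python might look like the sketch below. The URL placeholder and "sample.mp3" are stand-ins for your own workspace and audio file, and it assumes the endpoint's (output, elapsed) return value is serialized as a two-element JSON array, which is FastAPI's default.

import requests

# Placeholder URL: substitute your Modal workspace name, as in the curl example above.
URL = "https://<org_name>--whisper-v3-demo-entrypoint.modal.run"

with open("sample.mp3", "rb") as f:  # any local audio file
    resp = requests.post(URL, files={"audio": ("sample.mp3", f, "audio/mpeg")})
resp.raise_for_status()

output, elapsed = resp.json()
print(f"Transcription took {elapsed:.1f}s on the server")
print(output["text"])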