This commit is contained in:
YannAhlgrim
2025-10-08 15:23:23 +02:00
parent b59f52cf86
commit 5e6eae61cc
8 changed files with 288 additions and 70 deletions
+1 -1
View File
@@ -1,6 +1,6 @@
fastapi==0.100.0
uvicorn[standard]==0.22.0
whisper==1.1.10
openai-whisper
pydub==0.25.1
aiofiles==23.1.0
python-multipart==0.0.6
+73 -12
View File
@@ -3,30 +3,91 @@ from fastapi.responses import JSONResponse
import whisper
import tempfile
import shutil
import os
import logging
from pydub import AudioSegment
logging.basicConfig(level=logging.INFO)

app = FastAPI()

# Load the Whisper model once at import time so every request reuses it.
# The load happens only inside this guard: a failure is logged with its
# traceback and then re-raised so the container fails fast instead of
# serving requests without a model.
try:
    model = whisper.load_model("small")
except Exception:
    logging.exception("Failed to load Whisper model")
    raise
def convert_to_wav(src_path: str) -> str:
    """Convert an audio file to a 16 kHz mono WAV file using pydub/ffmpeg.

    Args:
        src_path: Path to the source audio file (webm/ogg/mp3/...).

    Returns:
        Path to the newly created WAV file. The caller is responsible for
        deleting it.

    Raises:
        Exception: Whatever pydub/ffmpeg raises while decoding or exporting.
            If the export step fails, the temporary WAV file created here is
            removed before the error is re-raised.
    """
    # Decode before creating any output file, so an unreadable input
    # leaves nothing behind.
    audio = AudioSegment.from_file(src_path)
    audio = audio.set_frame_rate(16000).set_channels(1)

    # delete=False: the file must outlive this function. Only the reserved
    # name is needed, so the handle is closed right away.
    wav_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    wav_tmp.close()
    try:
        audio.export(wav_tmp.name, format="wav")
    except Exception:
        # The caller never learns this path when export fails, so it cannot
        # clean it up; remove it here to avoid leaking temp files.
        try:
            os.remove(wav_tmp.name)
        except OSError:
            logging.exception("Failed to remove temp file %s", wav_tmp.name)
        raise
    return wav_tmp.name
@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file with the module-level Whisper model.

    The upload is written to a temporary file. Unless that file's name ends
    in ``.wav`` it is first converted to WAV via ``convert_to_wav``. All
    temporary files are removed before the function returns or raises.

    Args:
        file: The uploaded file; its content type must start with ``audio``.

    Returns:
        A ``JSONResponse`` of the form ``{"text": <transcription>}``.

    Raises:
        HTTPException: 400 if the upload is not audio or cannot be converted
            to WAV; 500 if ``model.transcribe`` raises ``RuntimeError``.
    """
    content_type = file.content_type
    if not content_type or not content_type.startswith("audio"):
        raise HTTPException(status_code=400, detail="File must be audio")

    # preserve original extension if possible
    filename = file.filename or "upload"
    ext = os.path.splitext(filename)[1] or ""
    if not ext:
        # try to infer common extension from content-type
        if "webm" in content_type:
            ext = ".webm"
        elif "ogg" in content_type or "opus" in content_type:
            ext = ".ogg"
        elif "mpeg" in content_type or "mp3" in content_type:
            ext = ".mp3"
        else:
            ext = ".wav"

    # Both paths start as None so the cleanup below is safe no matter how
    # far the request got before failing.
    tmp_path = None
    wav_path = None
    try:
        # delete=False: the file is reopened by path by pydub/whisper after
        # this handle is closed. Creating it inside the try ensures a failed
        # read/write of the upload does not leak it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp_path = tmp.name
            contents = await file.read()
            tmp.write(contents)
            tmp.flush()

        logging.info("Received upload %s (%d bytes, content-type=%s)", filename, os.path.getsize(tmp_path), content_type)

        # If the uploaded file is not a WAV, convert it to WAV first to
        # ensure ffmpeg/pydub compatibility.
        wav_path = tmp_path
        if not tmp_path.lower().endswith('.wav'):
            try:
                wav_path = convert_to_wav(tmp_path)
                logging.info("Converted to wav: %s (size=%d)", wav_path, os.path.getsize(wav_path))
            except Exception as e:
                # conversion failed; return a helpful error including the
                # underlying ffmpeg/pydub message
                logging.exception("Failed to convert uploaded audio to wav")
                raise HTTPException(status_code=400, detail=f"Failed to convert audio: {e}") from e

        # NOTE(review): this is a synchronous call inside an async handler,
        # so the event loop is blocked while it runs - confirm acceptable.
        try:
            result = model.transcribe(wav_path, language=None)
            text = result.get("text", "")
        except RuntimeError as e:
            # likely ffmpeg failed while loading audio; include error message
            # for debugging
            logging.exception("Whisper failed to transcribe audio")
            raise HTTPException(status_code=500, detail=str(e)) from e

        return JSONResponse({"text": text})
    finally:
        # cleanup temp files; the set collapses the two names when no
        # conversion happened so the same file is not removed twice
        for path in {tmp_path, wav_path}:
            try:
                if path and os.path.exists(path):
                    os.remove(path)
            except Exception:
                logging.exception("Failed to remove temp file %s", path)