This commit is contained in:
YannAhlgrim
2025-10-07 18:00:20 +02:00
commit 4793f1b183
11 changed files with 262 additions and 0 deletions
+36
View File
@@ -0,0 +1,36 @@
# Local Voice Assistant (Docker Compose)
This repository contains a minimal multi-container voice assistant composed of:
- `whisper` - FastAPI service exposing POST /transcribe for speech-to-text using Whisper.
- `coquitts` - FastAPI service exposing POST /speak for text-to-speech using Coqui TTS.
- `ollama` - Placeholder container running an Ollama-compatible LLM (exposed on port 11434).
- `middleware` - FastAPI service exposing POST /chat that orchestrates the above services.
Quick notes & assumptions
- These services are a starting point. Models will be downloaded on first run and may require lots of disk and memory.
- `ollama` uses a placeholder public image; you must replace it with your own Ollama setup or run an Ollama server with the desired model.
- The Whisper service uses OpenAI's Whisper via the `openai-whisper` PyPI package. For better performance consider `faster-whisper` or running Whisper in GPU-enabled base images.
- The Coqui TTS service uses the `TTS` package and downloads German models on first run.
Run locally with Docker Compose
1. Build and start:
```bash
docker-compose up --build
```
2. Example request to the middleware:
```bash
curl -X POST "http://localhost:8000/chat" -F "file=@./sample.wav;type=audio/wav" --output response.wav
```
The `response.wav` will contain the German TTS response.
Next steps / improvements
- Add authentication between services.
- Add healthchecks and readiness probes.
- Add model selection, caching, and GPU support where available.
- Replace Ollama placeholder with a validated model name and response parsing.
+18
View File
@@ -0,0 +1,18 @@
# Coqui TTS (text-to-speech) service image; serves server.py with uvicorn on port 8002.
FROM python:3.11-slim
# Unbuffered stdout/stderr so logs appear immediately in `docker logs`.
ENV PYTHONUNBUFFERED=1
WORKDIR /app
# ffmpeg/libsndfile1 for audio I/O; build-essential/git/wget for pip packages
# that compile native extensions on install.
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
libsndfile1 \
build-essential \
git \
wget \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first so the dependency layer is cached across code-only changes.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
pip install --no-cache-dir -r requirements.txt
COPY server.py ./
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8002"]
+6
View File
@@ -0,0 +1,6 @@
fastapi==0.100.0
uvicorn[standard]==0.22.0
TTS==0.12.0
torch==2.2.0
soundfile==0.12.1
numpy==1.26.0
+28
View File
@@ -0,0 +1,28 @@
from fastapi import FastAPI, HTTPException
from fastapi import Body
from fastapi.responses import FileResponse, JSONResponse
from TTS.api import TTS
import tempfile
import os
app = FastAPI()
# Load a German-capable model. Model may be downloaded on first run.
# NOTE(review): "tts_models/de/thorsten_hsmm" does not look like a published
# Coqui model id (known German ids are e.g. "tts_models/de/thorsten/tacotron2-DDC"
# or "tts_models/de/thorsten/vits") — confirm the id resolves before deploying.
tts = TTS(model_name="tts_models/de/thorsten_hsmm")
@app.post("/speak")
def speak(payload: dict = Body(...)):
text = payload.get("text")
language = payload.get("language", "de")
if not text:
raise HTTPException(status_code=400, detail="Missing 'text' in body")
fd, path = tempfile.mkstemp(suffix=".wav")
os.close(fd)
try:
tts.tts_to_file(text=text, speaker=None, language=language, file_path=path)
return FileResponse(path, media_type="audio/wav", filename="response.wav")
finally:
# FileResponse will stream the file; don't remove immediately. Consumer can manage cleanup.
pass
+42
View File
@@ -0,0 +1,42 @@
# Compose stack for the local voice assistant: STT (whisper), TTS (coquitts),
# LLM backend (ollama) and the orchestrating middleware.
# NOTE(review): the top-level `version` key is obsolete in Compose V2 and is ignored.
version: '3.8'
services:
  whisper:
    build: ./whisper
    image: lva_whisper:latest
    ports:
      - "8001:8001"
    restart: unless-stopped
    # Presumably intended as a model cache; the whisper server loads into its
    # default cache directory — verify the mount path is actually used.
    volumes:
      - ./whisper/models:/models
  coquitts:
    build: ./coquitts
    image: lva_coquitts:latest
    ports:
      - "8002:8002"
    restart: unless-stopped
  ollama:
    # This is a placeholder image; ensure you have an Ollama-compatible image and models available.
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    restart: unless-stopped
    # Persist pulled models between container restarts.
    volumes:
      - ./ollama-data:/root/.ollama
  middleware:
    build: ./middleware
    image: lva_middleware:latest
    ports:
      - "8000:8000"
    # depends_on only orders container start; it does NOT wait for the services
    # to be ready (see README "Add healthchecks and readiness probes").
    depends_on:
      - whisper
      - coquitts
      - ollama
    restart: unless-stopped
networks:
  default:
    name: lva_network
+15
View File
@@ -0,0 +1,15 @@
# Middleware service image; serves server.py with uvicorn on port 8000.
FROM python:3.11-slim
# Unbuffered stdout/stderr so logs appear immediately in `docker logs`.
ENV PYTHONUNBUFFERED=1
WORKDIR /app
# ffmpeg/libsndfile1 for audio handling when proxying WAV payloads.
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
libsndfile1 \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first so the dependency layer is cached across code-only changes.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
pip install --no-cache-dir -r requirements.txt
COPY server.py ./
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
+4
View File
@@ -0,0 +1,4 @@
fastapi==0.100.0
uvicorn[standard]==0.22.0
httpx==0.24.1
python-multipart==0.0.6
+56
View File
@@ -0,0 +1,56 @@
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
import httpx
import tempfile
import shutil
import asyncio
app = FastAPI()

# Internal endpoints; Docker Compose service names resolve on the compose network.
WHISPER_URL = "http://whisper:8001/transcribe"
COQUITTS_URL = "http://coquitts:8002/speak"
# Ollama's native generation endpoint. The previous "/v1/complete" path does not
# exist in the Ollama HTTP API (the OpenAI-compatible path is "/v1/completions");
# /api/generate returns JSON with the answer under the "response" key, which the
# chat handler already extracts.
OLLAMA_URL = "http://ollama:11434/api/generate"
@app.post("/chat")
async def chat(file: UploadFile = File(...)):
if not file.content_type.startswith("audio"):
raise HTTPException(status_code=400, detail="File must be audio")
# save file to temp
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
contents = await file.read()
tmp.write(contents)
tmp.flush()
tmp_path = tmp.name
async with httpx.AsyncClient() as client:
# Send audio to whisper
with open(tmp_path, "rb") as f:
files = {"file": ("audio.wav", f, "audio/wav")}
r = await client.post(WHISPER_URL, files=files, timeout=120.0)
if r.status_code != 200:
raise HTTPException(status_code=502, detail=f"Whisper error: {r.status_code} {r.text}")
text = r.json().get("text", "")
# Send text to ollama for reasoning
# We assume Ollama HTTP API accepts JSON {"model":"<model>", "prompt":"..."}
ollama_payload = {"model": "llama2", "prompt": text}
ro = await client.post(OLLAMA_URL, json=ollama_payload, timeout=120.0)
if ro.status_code != 200:
raise HTTPException(status_code=502, detail=f"Ollama error: {ro.status_code} {ro.text}")
answer_json = ro.json()
# Depending on API shape, try to extract text
answer_text = answer_json.get("response") or answer_json.get("text") or answer_json.get("output") or str(answer_json)
# Send answer to coquitts to generate German audio
coquitts_payload = {"text": answer_text, "language": "de"}
co = await client.post(COQUITTS_URL, json=coquitts_payload, timeout=120.0)
if co.status_code != 200:
raise HTTPException(status_code=502, detail=f"CoquiTTS error: {co.status_code} {co.text}")
# stream the audio back
return StreamingResponse(co.aiter_bytes(), media_type="audio/wav")
+19
View File
@@ -0,0 +1,19 @@
# Whisper STT service image; serves server.py with uvicorn on port 8001.
FROM python:3.11-slim
# Unbuffered stdout/stderr so logs appear immediately in `docker logs`.
ENV PYTHONUNBUFFERED=1
WORKDIR /app
# Install system dependencies for audio handling and build tools
# (ffmpeg is required by Whisper for audio decoding).
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
libsndfile1 \
build-essential \
git \
wget \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first so the dependency layer is cached across code-only changes.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
pip install --no-cache-dir -r requirements.txt
COPY server.py ./
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8001"]
+6
View File
@@ -0,0 +1,6 @@
fastapi==0.100.0
uvicorn[standard]==0.22.0
openai-whisper==20230314
pydub==0.25.1
aiofiles==23.1.0
python-multipart==0.0.6
+32
View File
@@ -0,0 +1,32 @@
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import whisper
import tempfile
import shutil
app = FastAPI()
# The "small" Whisper model is loaded once at import time; weights are
# downloaded on first run, so the first container start can take a while.
model = whisper.load_model("small")
@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
if not file.content_type.startswith("audio"):
raise HTTPException(status_code=400, detail="File must be audio")
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
contents = await file.read()
tmp.write(contents)
tmp.flush()
tmp_path = tmp.name
try:
result = model.transcribe(tmp_path, language=None)
text = result.get("text", "")
finally:
try:
shutil.os.remove(tmp_path)
except Exception:
pass
return JSONResponse({"text": text})