commit 4793f1b18326e5a7ef6912e311fc70c26a8dfe1a Author: YannAhlgrim Date: Tue Oct 7 18:00:20 2025 +0200 init diff --git a/README.md b/README.md new file mode 100644 index 0000000..852557e --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# Local Voice Assistant (Docker Compose) + +This repository contains a minimal multi-container voice assistant composed of: + +- `whisper` - FastAPI service exposing POST /transcribe for speech-to-text using Whisper. +- `coquitts` - FastAPI service exposing POST /speak for text-to-speech using Coqui TTS. +- `ollama` - Placeholder container running an Ollama-compatible LLM (exposed on port 11434). +- `middleware` - FastAPI service exposing POST /chat that orchestrates the above services. + +Quick notes & assumptions +- These services are a starting point. Models will be downloaded on first run and may require lots of disk and memory. +- `ollama` uses a placeholder public image; you must replace it with your own Ollama setup or run an Ollama server with the desired model. +- The Whisper service uses the `whisper` Python package. For better performance consider `faster-whisper` or running Whisper in GPU-enabled base images. +- The Coqui TTS service uses the `TTS` package and downloads German models on first run. + +Run locally with Docker Compose + +1. Build and start: + +```bash +docker-compose up --build +``` + +2. Example request to the middleware: + +```bash +curl -X POST "http://localhost:8000/chat" -F "file=@./sample.wav;type=audio/wav" --output response.wav +``` + +The `response.wav` will contain the German TTS response. + +Next steps / improvements +- Add authentication between services. +- Add healthchecks and readiness probes. +- Add model selection, caching, and GPU support where available. +- Replace Ollama placeholder with a validated model name and response parsing. 
from fastapi import FastAPI, HTTPException
from fastapi import Body, BackgroundTasks
from fastapi.responses import FileResponse, JSONResponse
from TTS.api import TTS
import tempfile
import os

app = FastAPI()

# Load a German-capable model once at startup; weights download on first run.
# NOTE(review): "tts_models/de/thorsten_hsmm" does not appear in Coqui's published
# model catalog (known German models look like tts_models/de/thorsten/vits) --
# verify this name resolves before deploying.
tts = TTS(model_name="tts_models/de/thorsten_hsmm")


def _remove_quietly(path: str) -> None:
    """Best-effort removal of a temp file that has already been served."""
    try:
        os.remove(path)
    except OSError:
        pass


@app.post("/speak")
def speak(payload: dict = Body(...)):
    """Synthesize payload["text"] to a WAV file and return it as audio/wav.

    Body: {"text": str, "language": str (optional, default "de")}.
    Raises 400 when 'text' is missing, 500 when synthesis fails.
    """
    text = payload.get("text")
    language = payload.get("language", "de")
    if not text:
        raise HTTPException(status_code=400, detail="Missing 'text' in body")

    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # Coqui TTS rejects a `language` argument for single-language models,
        # so only forward it when the loaded model is actually multilingual.
        kwargs = {"language": language} if getattr(tts, "is_multi_lingual", False) else {}
        tts.tts_to_file(text=text, speaker=None, file_path=path, **kwargs)
    except Exception as exc:
        # Synthesis failed: don't leak the temp file, surface a server error.
        _remove_quietly(path)
        raise HTTPException(status_code=500, detail=f"TTS failure: {exc}") from exc

    # FileResponse streams the file after this function returns, so deletion
    # must be deferred to a background task rather than done in `finally`
    # (the original leaked one WAV file per request).
    cleanup = BackgroundTasks()
    cleanup.add_task(_remove_quietly, path)
    return FileResponse(path, media_type="audio/wav", filename="response.wav", background=cleanup)
WHISPER_URL = "http://whisper:8001/transcribe"
COQUITTS_URL = "http://coquitts:8002/speak"
OLLAMA_URL = "http://ollama:11434/v1/complete"


@app.post("/chat")
async def chat(file: UploadFile = File(...)):
    """Voice round-trip: audio in -> Whisper STT -> Ollama LLM -> Coqui TTS -> audio out.

    Accepts a multipart audio upload and returns audio/wav containing the
    spoken (German) answer. Raises 400 for non-audio uploads and 502 when any
    downstream service fails.
    """
    import os  # local import: cleanup only; module header is outside this block

    # content_type can be None on malformed uploads; guard before startswith.
    if not file.content_type or not file.content_type.startswith("audio"):
        raise HTTPException(status_code=400, detail="File must be audio")

    # Spill the upload to disk so it can be re-opened for the Whisper request.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        async with httpx.AsyncClient() as client:
            # 1) Speech-to-text.
            with open(tmp_path, "rb") as f:
                r = await client.post(
                    WHISPER_URL,
                    files={"file": ("audio.wav", f, "audio/wav")},
                    timeout=120.0,
                )
            if r.status_code != 200:
                raise HTTPException(status_code=502, detail=f"Whisper error: {r.status_code} {r.text}")
            text = r.json().get("text", "")

            # 2) LLM reasoning.
            # NOTE(review): payload assumes an Ollama-compatible
            # {"model", "prompt"} JSON API at /v1/complete -- confirm against
            # the deployed Ollama version.
            ro = await client.post(OLLAMA_URL, json={"model": "llama2", "prompt": text}, timeout=120.0)
            if ro.status_code != 200:
                raise HTTPException(status_code=502, detail=f"Ollama error: {ro.status_code} {ro.text}")
            answer_json = ro.json()
            # Response key varies across API shapes; probe the known candidates.
            answer_text = (
                answer_json.get("response")
                or answer_json.get("text")
                or answer_json.get("output")
                or str(answer_json)
            )

            # 3) Text-to-speech (German).
            co = await client.post(
                COQUITTS_URL,
                json={"text": answer_text, "language": "de"},
                timeout=120.0,
            )
            if co.status_code != 200:
                raise HTTPException(status_code=502, detail=f"CoquiTTS error: {co.status_code} {co.text}")
            # Buffer the audio BEFORE the client context closes; the original
            # returned co.aiter_bytes(), which Starlette consumes only after
            # this handler returns -- i.e. after the AsyncClient has exited.
            audio = co.content

        return StreamingResponse(iter([audio]), media_type="audio/wav")
    finally:
        # The upload temp file is ours to clean up on every path
        # (the original never removed it -- one leaked file per request).
        try:
            os.remove(tmp_path)
        except OSError:
            pass
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import whisper
import tempfile
import shutil
import os

app = FastAPI()

# Loaded once at startup; "small" trades accuracy for memory footprint.
model = whisper.load_model("small")


@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file with Whisper.

    Returns JSON {"text": "..."}; the spoken language is auto-detected.
    Raises 400 for non-audio uploads.
    """
    # content_type can be None on malformed uploads; guard before startswith.
    if not file.content_type or not file.content_type.startswith("audio"):
        raise HTTPException(status_code=400, detail="File must be audio")

    # Whisper's transcribe() takes a filesystem path, so spill the upload
    # to a temp file first.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        # language=None lets Whisper auto-detect the spoken language.
        result = model.transcribe(tmp_path, language=None)
        text = result.get("text", "")
    finally:
        try:
            # os.remove, not the accidental shutil.os.remove (a private
            # transitive import of shutil, not a public API).
            os.remove(tmp_path)
        except OSError:
            pass

    return JSONResponse({"text": text})