This commit is contained in:
YannAhlgrim
2025-10-07 18:00:20 +02:00
commit 4793f1b183
11 changed files with 262 additions and 0 deletions
+36
View File
@@ -0,0 +1,36 @@
# Local Voice Assistant (Docker Compose)
This repository contains a minimal multi-container voice assistant composed of:
- `whisper` - FastAPI service exposing POST /transcribe for speech-to-text using Whisper.
- `coquitts` - FastAPI service exposing POST /speak for text-to-speech using Coqui TTS.
- `ollama` - Placeholder container running an Ollama-compatible LLM (exposed on port 11434).
- `middleware` - FastAPI service exposing POST /chat that orchestrates the above services.
Quick notes & assumptions
- These services are a starting point. Models will be downloaded on first run and may require lots of disk and memory.
- `ollama` uses a placeholder public image; you must replace it with your own Ollama setup or run an Ollama server with the desired model.
- The Whisper service uses OpenAI's Whisper via the `openai-whisper` PyPI package. For better performance consider `faster-whisper` or running Whisper in GPU-enabled base images.
- The Coqui TTS service uses the `TTS` package and downloads German models on first run.
Run locally with Docker Compose
1. Build and start:
```bash
docker-compose up --build
```
2. Example request to the middleware:
```bash
curl -X POST "http://localhost:8000/chat" -F "file=@./sample.wav;type=audio/wav" --output response.wav
```
The `response.wav` will contain the German TTS response.
Next steps / improvements
- Add authentication between services.
- Add healthchecks and readiness probes.
- Add model selection, caching, and GPU support where available.
- Replace Ollama placeholder with a validated model name and response parsing.
+18
View File
@@ -0,0 +1,18 @@
# Coqui TTS (text-to-speech) service image; serves server.py with uvicorn on port 8002.
FROM python:3.11-slim
# Unbuffered stdout/stderr so logs appear immediately in `docker logs`.
ENV PYTHONUNBUFFERED=1
WORKDIR /app
# ffmpeg/libsndfile1 for audio I/O; build-essential/git/wget for pip packages
# that compile native extensions on install.
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
libsndfile1 \
build-essential \
git \
wget \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first so the dependency layer is cached across code-only changes.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
pip install --no-cache-dir -r requirements.txt
COPY server.py ./
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8002"]
+6
View File
@@ -0,0 +1,6 @@
fastapi==0.100.0
uvicorn[standard]==0.22.0
TTS==0.12.0
torch==2.2.0
soundfile==0.12.1
numpy==1.26.0
+28
View File
@@ -0,0 +1,28 @@
from fastapi import FastAPI, HTTPException
from fastapi import Body
from fastapi.responses import FileResponse, JSONResponse
from TTS.api import TTS
import tempfile
import os
app = FastAPI()
# Load a German-capable model. Model may be downloaded on first run.
# NOTE(review): "tts_models/de/thorsten_hsmm" does not look like a published
# Coqui model id (known German ids are e.g. "tts_models/de/thorsten/tacotron2-DDC"
# or "tts_models/de/thorsten/vits") — confirm the id resolves before deploying.
tts = TTS(model_name="tts_models/de/thorsten_hsmm")
@app.post("/speak")
def speak(payload: dict = Body(...)):
text = payload.get("text")
language = payload.get("language", "de")
if not text:
raise HTTPException(status_code=400, detail="Missing 'text' in body")
fd, path = tempfile.mkstemp(suffix=".wav")
os.close(fd)
try:
tts.tts_to_file(text=text, speaker=None, language=language, file_path=path)
return FileResponse(path, media_type="audio/wav", filename="response.wav")
finally:
# FileResponse will stream the file; don't remove immediately. Consumer can manage cleanup.
pass
+42
View File
@@ -0,0 +1,42 @@
# Compose stack for the local voice assistant: STT (whisper), TTS (coquitts),
# LLM backend (ollama) and the orchestrating middleware.
# NOTE(review): the top-level `version` key is obsolete in Compose V2 and is ignored.
version: '3.8'
services:
  whisper:
    build: ./whisper
    image: lva_whisper:latest
    ports:
      - "8001:8001"
    restart: unless-stopped
    # Presumably intended as a model cache; the whisper server loads into its
    # default cache directory — verify the mount path is actually used.
    volumes:
      - ./whisper/models:/models
  coquitts:
    build: ./coquitts
    image: lva_coquitts:latest
    ports:
      - "8002:8002"
    restart: unless-stopped
  ollama:
    # This is a placeholder image; ensure you have an Ollama-compatible image and models available.
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    restart: unless-stopped
    # Persist pulled models between container restarts.
    volumes:
      - ./ollama-data:/root/.ollama
  middleware:
    build: ./middleware
    image: lva_middleware:latest
    ports:
      - "8000:8000"
    # depends_on only orders container start; it does NOT wait for the services
    # to be ready (see README "Add healthchecks and readiness probes").
    depends_on:
      - whisper
      - coquitts
      - ollama
    restart: unless-stopped
networks:
  default:
    name: lva_network
+15
View File
@@ -0,0 +1,15 @@
# Middleware service image; serves server.py with uvicorn on port 8000.
FROM python:3.11-slim
# Unbuffered stdout/stderr so logs appear immediately in `docker logs`.
ENV PYTHONUNBUFFERED=1
WORKDIR /app
# ffmpeg/libsndfile1 for audio handling when proxying WAV payloads.
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
libsndfile1 \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first so the dependency layer is cached across code-only changes.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
pip install --no-cache-dir -r requirements.txt
COPY server.py ./
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
+4
View File
@@ -0,0 +1,4 @@
fastapi==0.100.0
uvicorn[standard]==0.22.0
httpx==0.24.1
python-multipart==0.0.6
+56
View File
@@ -0,0 +1,56 @@
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
import httpx
import tempfile
import shutil
import asyncio
app = FastAPI()

# Internal endpoints; Docker Compose service names resolve on the compose network.
WHISPER_URL = "http://whisper:8001/transcribe"
COQUITTS_URL = "http://coquitts:8002/speak"
# Ollama's native generation endpoint. The previous "/v1/complete" path does not
# exist in the Ollama HTTP API (the OpenAI-compatible path is "/v1/completions");
# /api/generate returns JSON with the answer under the "response" key, which the
# chat handler already extracts.
OLLAMA_URL = "http://ollama:11434/api/generate"
@app.post("/chat")
async def chat(file: UploadFile = File(...)):
if not file.content_type.startswith("audio"):
raise HTTPException(status_code=400, detail="File must be audio")
# save file to temp
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
contents = await file.read()
tmp.write(contents)
tmp.flush()
tmp_path = tmp.name
async with httpx.AsyncClient() as client:
# Send audio to whisper
with open(tmp_path, "rb") as f:
files = {"file": ("audio.wav", f, "audio/wav")}
r = await client.post(WHISPER_URL, files=files, timeout=120.0)
if r.status_code != 200:
raise HTTPException(status_code=502, detail=f"Whisper error: {r.status_code} {r.text}")
text = r.json().get("text", "")
# Send text to ollama for reasoning
# We assume Ollama HTTP API accepts JSON {"model":"<model>", "prompt":"..."}
ollama_payload = {"model": "llama2", "prompt": text}
ro = await client.post(OLLAMA_URL, json=ollama_payload, timeout=120.0)
if ro.status_code != 200:
raise HTTPException(status_code=502, detail=f"Ollama error: {ro.status_code} {ro.text}")
answer_json = ro.json()
# Depending on API shape, try to extract text
answer_text = answer_json.get("response") or answer_json.get("text") or answer_json.get("output") or str(answer_json)
# Send answer to coquitts to generate German audio
coquitts_payload = {"text": answer_text, "language": "de"}
co = await client.post(COQUITTS_URL, json=coquitts_payload, timeout=120.0)
if co.status_code != 200:
raise HTTPException(status_code=502, detail=f"CoquiTTS error: {co.status_code} {co.text}")
# stream the audio back
return StreamingResponse(co.aiter_bytes(), media_type="audio/wav")
+19
View File
@@ -0,0 +1,19 @@
# Whisper STT service image; serves server.py with uvicorn on port 8001.
FROM python:3.11-slim
# Unbuffered stdout/stderr so logs appear immediately in `docker logs`.
ENV PYTHONUNBUFFERED=1
WORKDIR /app
# Install system dependencies for audio handling and build tools
# (ffmpeg is required by Whisper for audio decoding).
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
libsndfile1 \
build-essential \
git \
wget \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first so the dependency layer is cached across code-only changes.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
pip install --no-cache-dir -r requirements.txt
COPY server.py ./
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8001"]
+6
View File
@@ -0,0 +1,6 @@
fastapi==0.100.0
uvicorn[standard]==0.22.0
openai-whisper==20230314
pydub==0.25.1
aiofiles==23.1.0
python-multipart==0.0.6
+32
View File
@@ -0,0 +1,32 @@
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import whisper
import tempfile
import shutil
app = FastAPI()
# The "small" Whisper model is loaded once at import time; weights are
# downloaded on first run, so the first container start can take a while.
model = whisper.load_model("small")
@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
if not file.content_type.startswith("audio"):
raise HTTPException(status_code=400, detail="File must be audio")
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
contents = await file.read()
tmp.write(contents)
tmp.flush()
tmp_path = tmp.name
try:
result = model.transcribe(tmp_path, language=None)
text = result.get("text", "")
finally:
try:
shutil.os.remove(tmp_path)
except Exception:
pass
return JSONResponse({"text": text})