init
This commit is contained in:
@@ -0,0 +1,36 @@
|
|||||||
|
# Local Voice Assistant (Docker Compose)
|
||||||
|
|
||||||
|
This repository contains a minimal multi-container voice assistant composed of:
|
||||||
|
|
||||||
|
- `whisper` - FastAPI service exposing POST /transcribe for speech-to-text using Whisper.
|
||||||
|
- `coquitts` - FastAPI service exposing POST /speak for text-to-speech using Coqui TTS.
|
||||||
|
- `ollama` - Placeholder container running an Ollama-compatible LLM (exposed on port 11434).
|
||||||
|
- `middleware` - FastAPI service exposing POST /chat that orchestrates the above services.
|
||||||
|
|
||||||
|
Quick notes & assumptions
|
||||||
|
- These services are a starting point. Models are downloaded on first run and may require substantial disk space and memory.
|
||||||
|
- `ollama` uses a placeholder public image; you must replace it with your own Ollama setup or run an Ollama server with the desired model.
|
||||||
|
- The Whisper service uses the `whisper` Python package. For better performance consider `faster-whisper` or running Whisper in GPU-enabled base images.
|
||||||
|
- The Coqui TTS service uses the `TTS` package and downloads German models on first run.
|
||||||
|
|
||||||
|
Run locally with Docker Compose
|
||||||
|
|
||||||
|
1. Build and start:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose up --build
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Example request to the middleware:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST "http://localhost:8000/chat" -F "file=@./sample.wav;type=audio/wav" --output response.wav
|
||||||
|
```
|
||||||
|
|
||||||
|
The `response.wav` will contain the German TTS response.
|
||||||
|
|
||||||
|
Next steps / improvements
|
||||||
|
- Add authentication between services.
|
||||||
|
- Add healthchecks and readiness probes.
|
||||||
|
- Add model selection, caching, and GPU support where available.
|
||||||
|
- Replace Ollama placeholder with a validated model name and response parsing.
|
||||||
@@ -0,0 +1,18 @@
|
|||||||
|
# Coqui TTS service image: FastAPI app serving POST /speak (see server.py).
FROM python:3.11-slim
# Flush stdout/stderr immediately so container logs appear in real time.
ENV PYTHONUNBUFFERED=1
WORKDIR /app

# System packages: ffmpeg and libsndfile1 for audio I/O; build-essential,
# git and wget are presumably needed to build TTS wheels and fetch models
# on first run — TODO confirm they are all still required.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    build-essential \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is cached across code edits.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r requirements.txt

COPY server.py ./
# Port 8002 matches the coquitts service mapping in docker-compose.yml.
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8002"]
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
fastapi==0.100.0
|
||||||
|
uvicorn[standard]==0.22.0
|
||||||
|
TTS==0.12.0
|
||||||
|
torch==2.2.0
|
||||||
|
soundfile==0.12.1
|
||||||
|
numpy==1.26.0
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
from fastapi import FastAPI, HTTPException
from fastapi import Body
from fastapi.responses import FileResponse, JSONResponse
from starlette.background import BackgroundTask
from TTS.api import TTS
import tempfile
import os


app = FastAPI()


# Load a German-capable model once at startup. The model may be downloaded
# on first run, so the first container start can take a while.
# NOTE(review): "tts_models/de/thorsten_hsmm" does not match the usual Coqui
# model-id layout (tts_models/<lang>/<dataset>/<type>) — verify against
# `TTS.list_models()` (e.g. "tts_models/de/thorsten/vits").
tts = TTS(model_name="tts_models/de/thorsten_hsmm")


@app.post("/speak")
def speak(payload: dict = Body(...)):
    """Synthesize speech for the given text and return it as a WAV file.

    Body: {"text": str, "language": str (optional, default "de")}.
    Raises 400 when 'text' is missing; synthesis errors surface as 500.
    """
    text = payload.get("text")
    language = payload.get("language", "de")
    if not text:
        raise HTTPException(status_code=400, detail="Missing 'text' in body")

    # mkstemp so the path survives this function; the fd itself is not needed.
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # Single-language models reject an explicit `language` argument, so
        # only forward it when the loaded model is multi-lingual.
        if getattr(tts, "is_multi_lingual", False):
            tts.tts_to_file(text=text, speaker=None, language=language, file_path=path)
        else:
            tts.tts_to_file(text=text, speaker=None, file_path=path)
    except Exception:
        # Synthesis failed: the file will never be streamed, so clean it up
        # here instead of leaking it, then let FastAPI report the error.
        try:
            os.remove(path)
        except OSError:
            pass
        raise

    # Delete the temp file only after the response body has been sent; the
    # previous version leaked one WAV per request.
    return FileResponse(
        path,
        media_type="audio/wav",
        filename="response.wav",
        background=BackgroundTask(os.remove, path),
    )
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
# Compose topology for the local voice assistant: three model services plus
# a middleware orchestrator, all attached to one shared network.
# NOTE(review): the top-level `version` key is obsolete under Compose v2
# (it only triggers a warning) — safe to drop once legacy docker-compose
# support is no longer needed.
version: '3.8'
services:
  # Speech-to-text service (POST /transcribe).
  whisper:
    build: ./whisper
    image: lva_whisper:latest
    ports:
      - "8001:8001"
    restart: unless-stopped
    volumes:
      # Persist downloaded model files across container rebuilds.
      - ./whisper/models:/models

  # Text-to-speech service (POST /speak).
  coquitts:
    build: ./coquitts
    image: lva_coquitts:latest
    ports:
      - "8002:8002"
    restart: unless-stopped

  ollama:
    # This is a placeholder image; ensure you have an Ollama-compatible image and models available.
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    restart: unless-stopped
    volumes:
      # Persist pulled LLM weights across restarts.
      - ./ollama-data:/root/.ollama

  # Orchestrator exposing POST /chat; talks to the three services above.
  middleware:
    build: ./middleware
    image: lva_middleware:latest
    ports:
      - "8000:8000"
    # Start-order hint only — does not wait for the services to be ready.
    depends_on:
      - whisper
      - coquitts
      - ollama
    restart: unless-stopped

networks:
  default:
    name: lva_network
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
# Middleware image: FastAPI orchestrator serving POST /chat (see server.py).
FROM python:3.11-slim
# Flush stdout/stderr immediately so container logs appear in real time.
ENV PYTHONUNBUFFERED=1
WORKDIR /app

# ffmpeg and libsndfile1 cover audio handling for the forwarded WAV files.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is cached across code edits.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r requirements.txt

COPY server.py ./
# Port 8000 matches the middleware service mapping in docker-compose.yml.
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
fastapi==0.100.0
|
||||||
|
uvicorn[standard]==0.22.0
|
||||||
|
httpx==0.24.1
|
||||||
|
python-multipart==0.0.6
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse, Response
import httpx
import tempfile
import os
import shutil
import asyncio


app = FastAPI()


WHISPER_URL = "http://whisper:8001/transcribe"
COQUITTS_URL = "http://coquitts:8002/speak"
# NOTE(review): Ollama's HTTP API is usually served at /api/generate; this
# /v1/complete path is the placeholder assumed below — verify against the
# deployed Ollama version.
OLLAMA_URL = "http://ollama:11434/v1/complete"


@app.post("/chat")
async def chat(file: UploadFile = File(...)):
    """Voice round-trip: audio in -> Whisper -> Ollama -> Coqui TTS -> audio out.

    Returns the synthesized German reply as a WAV body; raises 400 for a
    non-audio upload and 502 when any downstream service fails.
    """
    # content_type can be None when the client omits it; guard before startswith.
    if not (file.content_type or "").startswith("audio"):
        raise HTTPException(status_code=400, detail="File must be audio")

    # Buffer the upload to a temp file so it can be re-opened for the
    # multipart forward to Whisper.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        async with httpx.AsyncClient() as client:
            # 1) Speech-to-text.
            with open(tmp_path, "rb") as f:
                files = {"file": ("audio.wav", f, "audio/wav")}
                r = await client.post(WHISPER_URL, files=files, timeout=120.0)
            if r.status_code != 200:
                raise HTTPException(status_code=502, detail=f"Whisper error: {r.status_code} {r.text}")
            text = r.json().get("text", "")

            # 2) LLM reasoning.
            # We assume Ollama HTTP API accepts JSON {"model":"<model>", "prompt":"..."}
            ollama_payload = {"model": "llama2", "prompt": text}
            ro = await client.post(OLLAMA_URL, json=ollama_payload, timeout=120.0)
            if ro.status_code != 200:
                raise HTTPException(status_code=502, detail=f"Ollama error: {ro.status_code} {ro.text}")
            answer_json = ro.json()
            # Depending on API shape, try the common keys to extract text.
            answer_text = answer_json.get("response") or answer_json.get("text") or answer_json.get("output") or str(answer_json)

            # 3) Text-to-speech (German).
            coquitts_payload = {"text": answer_text, "language": "de"}
            co = await client.post(COQUITTS_URL, json=coquitts_payload, timeout=120.0)
            if co.status_code != 200:
                raise HTTPException(status_code=502, detail=f"CoquiTTS error: {co.status_code} {co.text}")

            # client.post() already buffered the full body, so hand it back
            # directly. (The previous StreamingResponse over co.aiter_bytes()
            # was consumed only after the AsyncClient had been closed.)
            audio = co.content
    finally:
        # Don't leak one temp WAV per request.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    return Response(content=audio, media_type="audio/wav")
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
# Whisper STT service image: FastAPI app serving POST /transcribe (see server.py).
FROM python:3.11-slim
# Flush stdout/stderr immediately so container logs appear in real time.
ENV PYTHONUNBUFFERED=1
WORKDIR /app

# Install system dependencies for audio handling and build tools
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    build-essential \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is cached across code edits.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r requirements.txt

COPY server.py ./
# Port 8001 matches the whisper service mapping in docker-compose.yml.
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8001"]
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
fastapi==0.100.0
|
||||||
|
uvicorn[standard]==0.22.0
|
||||||
|
openai-whisper==20230314
|
||||||
|
pydub==0.25.1
|
||||||
|
aiofiles==23.1.0
|
||||||
|
python-multipart==0.0.6
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import whisper
import tempfile
import os
import shutil


app = FastAPI()


# Loaded once at startup; the "small" checkpoint is downloaded on first run.
model = whisper.load_model("small")


@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return {"text": "..."}.

    Raises 400 when the upload does not declare an audio content type.
    """
    # content_type can be None when the client omits it; guard before startswith.
    if not (file.content_type or "").startswith("audio"):
        raise HTTPException(status_code=400, detail="File must be audio")

    # Whisper's transcribe() takes a file path, so buffer the upload to disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        # language=None lets Whisper auto-detect the spoken language.
        result = model.transcribe(tmp_path, language=None)
        text = result.get("text", "")
    finally:
        # Always remove the temp file. (Was `shutil.os.remove`, which only
        # worked because shutil happens to re-export the os module.)
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    return JSONResponse({"text": text})
|
||||||
Reference in New Issue
Block a user