init
This commit is contained in:
@@ -0,0 +1,36 @@
|
||||
# Local Voice Assistant (Docker Compose)
|
||||
|
||||
This repository contains a minimal multi-container voice assistant composed of:
|
||||
|
||||
- `whisper` - FastAPI service exposing POST /transcribe for speech-to-text using Whisper.
|
||||
- `coquitts` - FastAPI service exposing POST /speak for text-to-speech using Coqui TTS.
|
||||
- `ollama` - Placeholder container running an Ollama-compatible LLM (exposed on port 11434).
|
||||
- `middleware` - FastAPI service exposing POST /chat that orchestrates the above services.
|
||||
|
||||
Quick notes & assumptions
|
||||
- These services are a starting point. Models will be downloaded on first run and may require lots of disk and memory.
|
||||
- `ollama` uses a placeholder public image; you must replace it with your own Ollama setup or run an Ollama server with the desired model.
|
||||
- The Whisper service uses the `whisper` Python package. For better performance consider `faster-whisper` or running Whisper in GPU-enabled base images.
|
||||
- The Coqui TTS service uses the `TTS` package and downloads German models on first run.
|
||||
|
||||
Run locally with Docker Compose
|
||||
|
||||
1. Build and start:
|
||||
|
||||
```bash
|
||||
docker-compose up --build
|
||||
```
|
||||
|
||||
2. Example request to the middleware:
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:8000/chat" -F "file=@./sample.wav;type=audio/wav" --output response.wav
|
||||
```
|
||||
|
||||
The `response.wav` will contain the German TTS response.
|
||||
|
||||
Next steps / improvements
|
||||
- Add authentication between services.
|
||||
- Add healthchecks and readiness probes.
|
||||
- Add model selection, caching, and GPU support where available.
|
||||
- Replace Ollama placeholder with a validated model name and response parsing.
|
||||
@@ -0,0 +1,18 @@
|
||||
# Dockerfile for the Coqui TTS (text-to-speech) FastAPI service.
FROM python:3.11-slim
# Send Python output straight to the container log (no buffering).
ENV PYTHONUNBUFFERED=1
WORKDIR /app

# System packages: ffmpeg/libsndfile1 for audio decoding/encoding;
# build-essential/git/wget because some TTS dependencies build native
# extensions at install time.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    build-essential \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying code so this layer is cached
# across application-code changes.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r requirements.txt

COPY server.py ./
# Serve the FastAPI app on port 8002 (matches the docker-compose port mapping).
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8002"]
|
||||
@@ -0,0 +1,6 @@
|
||||
# Python dependencies for the Coqui TTS service.
fastapi==0.100.0
uvicorn[standard]==0.22.0
# NOTE(review): TTS 0.12.x is an older Coqui release that pins its own torch
# range — verify it resolves cleanly against torch==2.2.0 before relying on
# these pins together.
TTS==0.12.0
torch==2.2.0
# soundfile/numpy: audio array I/O used by the TTS stack.
soundfile==0.12.1
numpy==1.26.0
|
||||
@@ -0,0 +1,28 @@
|
||||
from fastapi import FastAPI, HTTPException
from fastapi import Body
from fastapi.background import BackgroundTask
from fastapi.responses import FileResponse, JSONResponse
from TTS.api import TTS
import tempfile
import os

app = FastAPI()

# Load a German-capable model once at startup; weights are downloaded on
# first run. NOTE(review): "tts_models/de/thorsten_hsmm" does not look like a
# standard Coqui model id (cf. "tts_models/de/thorsten/tacotron2-DDC") —
# verify against `TTS.list_models()`.
tts = TTS(model_name="tts_models/de/thorsten_hsmm")


@app.post("/speak")
def speak(payload: dict = Body(...)):
    """Synthesize speech for ``payload["text"]`` and return it as a WAV file.

    Body: ``{"text": "<required>", "language": "<optional, default 'de'>"}``.
    Raises 400 when ``"text"`` is missing or empty.
    """
    text = payload.get("text")
    language = payload.get("language", "de")
    if not text:
        raise HTTPException(status_code=400, detail="Missing 'text' in body")

    # mkstemp so the path survives after we close our handle; TTS writes to it.
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # Coqui rejects a `language` argument on single-language models, so
        # only forward it when the loaded model is multilingual.
        kwargs = {"language": language} if getattr(tts, "is_multi_lingual", False) else {}
        tts.tts_to_file(text=text, speaker=None, file_path=path, **kwargs)
    except Exception:
        # FIX: the original leaked the temp file when synthesis failed.
        os.remove(path)
        raise
    # FIX: the original never deleted the temp file at all (disk leak per
    # request); delete it after the response body has been sent.
    return FileResponse(
        path,
        media_type="audio/wav",
        filename="response.wav",
        background=BackgroundTask(os.remove, path),
    )
|
||||
@@ -0,0 +1,42 @@
|
||||
# Compose stack for the local voice assistant.
# NOTE(review): the top-level `version` key is obsolete under Compose v2 and
# is ignored with a warning; kept for compatibility with older docker-compose.
version: '3.8'
services:
  # Speech-to-text service (FastAPI + Whisper) on host port 8001.
  whisper:
    build: ./whisper
    image: lva_whisper:latest
    ports:
      - "8001:8001"
    restart: unless-stopped
    volumes:
      # Persist downloaded models between container rebuilds.
      - ./whisper/models:/models

  # Text-to-speech service (FastAPI + Coqui TTS) on host port 8002.
  coquitts:
    build: ./coquitts
    image: lva_coquitts:latest
    ports:
      - "8002:8002"
    restart: unless-stopped

  ollama:
    # This is a placeholder image; ensure you have an Ollama-compatible image and models available.
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    restart: unless-stopped
    volumes:
      # Persist pulled models on the host across restarts.
      - ./ollama-data:/root/.ollama

  # Orchestrator: accepts audio, chains whisper -> ollama -> coquitts.
  middleware:
    build: ./middleware
    image: lva_middleware:latest
    ports:
      - "8000:8000"
    # NOTE: depends_on only orders container startup; it does not wait for
    # the dependencies to be ready (no healthchecks are defined yet).
    depends_on:
      - whisper
      - coquitts
      - ollama
    restart: unless-stopped

networks:
  default:
    name: lva_network
|
||||
@@ -0,0 +1,15 @@
|
||||
# Dockerfile for the middleware orchestration FastAPI service.
FROM python:3.11-slim
# Send Python output straight to the container log (no buffering).
ENV PYTHONUNBUFFERED=1
WORKDIR /app

# ffmpeg/libsndfile1 for handling the audio payloads passed between services.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying code so this layer is cached
# across application-code changes.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r requirements.txt

COPY server.py ./
# Serve the FastAPI app on port 8000 (matches the docker-compose port mapping).
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
@@ -0,0 +1,4 @@
|
||||
# Python dependencies for the middleware orchestration service.
fastapi==0.100.0
uvicorn[standard]==0.22.0
# Async HTTP client used to call the whisper/ollama/coquitts services.
httpx==0.24.1
# Required by FastAPI for multipart file uploads (UploadFile).
python-multipart==0.0.6
|
||||
@@ -0,0 +1,56 @@
|
||||
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
import httpx
import tempfile
import shutil
import asyncio
import os

app = FastAPI()

# Service endpoints resolved via the docker-compose network aliases.
WHISPER_URL = "http://whisper:8001/transcribe"
COQUITTS_URL = "http://coquitts:8002/speak"
OLLAMA_URL = "http://ollama:11434/v1/complete"


@app.post("/chat")
async def chat(file: UploadFile = File(...)):
    """Full voice round-trip: audio in -> STT -> LLM -> TTS -> audio out.

    Pipeline: whisper transcribes the upload, ollama generates a reply to
    the transcript, coquitts renders the reply as German speech (WAV bytes).
    Raises 400 for non-audio uploads and 502 when a downstream service fails.
    """
    # FIX: content_type can be None, which made startswith() raise AttributeError.
    if not (file.content_type or "").startswith("audio"):
        raise HTTPException(status_code=400, detail="File must be audio")

    # Spool the upload to a temp file so it can be re-read for the whisper call.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        async with httpx.AsyncClient() as client:
            # 1) Speech-to-text.
            with open(tmp_path, "rb") as f:
                files = {"file": ("audio.wav", f, "audio/wav")}
                r = await client.post(WHISPER_URL, files=files, timeout=120.0)
            if r.status_code != 200:
                raise HTTPException(status_code=502, detail=f"Whisper error: {r.status_code} {r.text}")
            text = r.json().get("text", "")

            # 2) LLM reasoning. We assume the Ollama HTTP API accepts
            # JSON {"model": "<model>", "prompt": "..."}.
            ollama_payload = {"model": "llama2", "prompt": text}
            ro = await client.post(OLLAMA_URL, json=ollama_payload, timeout=120.0)
            if ro.status_code != 200:
                raise HTTPException(status_code=502, detail=f"Ollama error: {ro.status_code} {ro.text}")
            answer_json = ro.json()
            # Response shape differs across Ollama versions; try common keys.
            answer_text = answer_json.get("response") or answer_json.get("text") or answer_json.get("output") or str(answer_json)

            # 3) Text-to-speech (German).
            coquitts_payload = {"text": answer_text, "language": "de"}
            co = await client.post(COQUITTS_URL, json=coquitts_payload, timeout=120.0)
            if co.status_code != 200:
                raise HTTPException(status_code=502, detail=f"CoquiTTS error: {co.status_code} {co.text}")

            # FIX: the original returned StreamingResponse(co.aiter_bytes()),
            # which is consumed only after this handler returns — i.e. after
            # the AsyncClient context has closed. `client.post` already
            # buffered the body, so capture it here instead.
            audio = co.content
    finally:
        # FIX: the original never removed the temp file (disk leak per request).
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    return StreamingResponse(iter([audio]), media_type="audio/wav")
|
||||
@@ -0,0 +1,19 @@
|
||||
# Dockerfile for the Whisper speech-to-text FastAPI service.
FROM python:3.11-slim
# Send Python output straight to the container log (no buffering).
ENV PYTHONUNBUFFERED=1
WORKDIR /app

# Install system dependencies for audio handling and build tools
# (ffmpeg is required by Whisper itself for decoding audio files).
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    build-essential \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying code so this layer is cached
# across application-code changes.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r requirements.txt

COPY server.py ./
# Serve the FastAPI app on port 8001 (matches the docker-compose port mapping).
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8001"]
|
||||
@@ -0,0 +1,6 @@
|
||||
# Python dependencies for the Whisper speech-to-text service.
fastapi==0.100.0
uvicorn[standard]==0.22.0
# FIX: the PyPI package "whisper" is an unrelated project (Graphite's
# time-series database) with no 20230314 release, so the original pin could
# never install. OpenAI's Whisper — which provides `import whisper` with
# load_model()/transcribe() as server.py expects — is published as
# "openai-whisper".
openai-whisper==20230314
pydub==0.25.1
aiofiles==23.1.0
# Required by FastAPI for multipart file uploads (UploadFile).
python-multipart==0.0.6
|
||||
@@ -0,0 +1,32 @@
|
||||
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import whisper
import tempfile
import shutil
import os

app = FastAPI()

# Load the Whisper model once at startup ("small" trades accuracy for
# memory/CPU); weights are downloaded on first run.
model = whisper.load_model("small")


@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return ``{"text": "..."}``.

    Language is auto-detected (``language=None``) rather than forced.
    Raises 400 for uploads whose content type is not audio.
    """
    # FIX: content_type can be None, which made startswith() raise AttributeError.
    if not (file.content_type or "").startswith("audio"):
        raise HTTPException(status_code=400, detail="File must be audio")

    # Whisper reads from a filesystem path (via ffmpeg), so spool the upload
    # to a temp file first.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        result = model.transcribe(tmp_path, language=None)
        text = result.get("text", "")
    finally:
        # FIX: call os.remove directly instead of reaching through the
        # undocumented `shutil.os` alias, and swallow only OSError so
        # unrelated failures still propagate.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    return JSONResponse({"text": text})
|
||||
Reference in New Issue
Block a user