init
This commit is contained in:
@@ -0,0 +1,36 @@
|
|||||||
|
# Local Voice Assistant (Docker Compose)
|
||||||
|
|
||||||
|
This repository contains a minimal multi-container voice assistant composed of:
|
||||||
|
|
||||||
|
- `whisper` - FastAPI service exposing POST /transcribe for speech-to-text using Whisper.
|
||||||
|
- `coquitts` - FastAPI service exposing POST /speak for text-to-speech using Coqui TTS.
|
||||||
|
- `ollama` - Placeholder container running an Ollama-compatible LLM (exposed on port 11434).
|
||||||
|
- `middleware` - FastAPI service exposing POST /chat that orchestrates the above services.
|
||||||
|
|
||||||
|
Quick notes & assumptions
|
||||||
|
- These services are a starting point. Models are downloaded on first run and may require substantial disk space and memory.
|
||||||
|
- `ollama` uses a placeholder public image; you must replace it with your own Ollama setup or run an Ollama server with the desired model.
|
||||||
|
- The Whisper service uses the `whisper` Python package. For better performance consider `faster-whisper` or running Whisper in GPU-enabled base images.
|
||||||
|
- The Coqui TTS service uses the `TTS` package and downloads German models on first run.
|
||||||
|
|
||||||
|
Run locally with Docker Compose
|
||||||
|
|
||||||
|
1. Build and start:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose up --build
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Example request to the middleware:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST "http://localhost:8000/chat" -F "file=@./sample.wav;type=audio/wav" --output response.wav
|
||||||
|
```
|
||||||
|
|
||||||
|
The `response.wav` will contain the German TTS response.
|
||||||
|
|
||||||
|
Next steps / improvements
|
||||||
|
- Add authentication between services.
|
||||||
|
- Add healthchecks and readiness probes.
|
||||||
|
- Add model selection, caching, and GPU support where available.
|
||||||
|
- Replace Ollama placeholder with a validated model name and response parsing.
|
||||||
@@ -0,0 +1,18 @@
|
|||||||
|
# Coqui TTS service image: FastAPI app serving POST /speak (see server.py).
FROM python:3.11-slim
# Flush stdout/stderr immediately so container logs appear in real time.
ENV PYTHONUNBUFFERED=1
WORKDIR /app

# System packages: ffmpeg and libsndfile1 for audio I/O; build-essential,
# git and wget are presumably needed to build TTS wheels and fetch models
# on first run — TODO confirm they are all still required.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    build-essential \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is cached across code edits.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r requirements.txt

COPY server.py ./
# Port 8002 matches the coquitts service mapping in docker-compose.yml.
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8002"]
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
fastapi==0.100.0
|
||||||
|
uvicorn[standard]==0.22.0
|
||||||
|
TTS==0.12.0
|
||||||
|
torch==2.2.0
|
||||||
|
soundfile==0.12.1
|
||||||
|
numpy==1.26.0
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
from fastapi import FastAPI, HTTPException
from fastapi import Body
from fastapi.responses import FileResponse, JSONResponse
from starlette.background import BackgroundTask
from TTS.api import TTS
import tempfile
import os


app = FastAPI()


# Load a German-capable model once at startup. The model may be downloaded
# on first run, so the first container start can take a while.
# NOTE(review): "tts_models/de/thorsten_hsmm" does not match the usual Coqui
# model-id layout (tts_models/<lang>/<dataset>/<type>) — verify against
# `TTS.list_models()` (e.g. "tts_models/de/thorsten/vits").
tts = TTS(model_name="tts_models/de/thorsten_hsmm")


@app.post("/speak")
def speak(payload: dict = Body(...)):
    """Synthesize speech for the given text and return it as a WAV file.

    Body: {"text": str, "language": str (optional, default "de")}.
    Raises 400 when 'text' is missing; synthesis errors surface as 500.
    """
    text = payload.get("text")
    language = payload.get("language", "de")
    if not text:
        raise HTTPException(status_code=400, detail="Missing 'text' in body")

    # mkstemp so the path survives this function; the fd itself is not needed.
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # Single-language models reject an explicit `language` argument, so
        # only forward it when the loaded model is multi-lingual.
        if getattr(tts, "is_multi_lingual", False):
            tts.tts_to_file(text=text, speaker=None, language=language, file_path=path)
        else:
            tts.tts_to_file(text=text, speaker=None, file_path=path)
    except Exception:
        # Synthesis failed: the file will never be streamed, so clean it up
        # here instead of leaking it, then let FastAPI report the error.
        try:
            os.remove(path)
        except OSError:
            pass
        raise

    # Delete the temp file only after the response body has been sent; the
    # previous version leaked one WAV per request.
    return FileResponse(
        path,
        media_type="audio/wav",
        filename="response.wav",
        background=BackgroundTask(os.remove, path),
    )
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
# Compose topology for the local voice assistant: three model services plus
# a middleware orchestrator, all attached to one shared network.
# NOTE(review): the top-level `version` key is obsolete under Compose v2
# (it only triggers a warning) — safe to drop once legacy docker-compose
# support is no longer needed.
version: '3.8'
services:
  # Speech-to-text service (POST /transcribe).
  whisper:
    build: ./whisper
    image: lva_whisper:latest
    ports:
      - "8001:8001"
    restart: unless-stopped
    volumes:
      # Persist downloaded model files across container rebuilds.
      - ./whisper/models:/models

  # Text-to-speech service (POST /speak).
  coquitts:
    build: ./coquitts
    image: lva_coquitts:latest
    ports:
      - "8002:8002"
    restart: unless-stopped

  ollama:
    # This is a placeholder image; ensure you have an Ollama-compatible image and models available.
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    restart: unless-stopped
    volumes:
      # Persist pulled LLM weights across restarts.
      - ./ollama-data:/root/.ollama

  # Orchestrator exposing POST /chat; talks to the three services above.
  middleware:
    build: ./middleware
    image: lva_middleware:latest
    ports:
      - "8000:8000"
    # Start-order hint only — does not wait for the services to be ready.
    depends_on:
      - whisper
      - coquitts
      - ollama
    restart: unless-stopped

networks:
  default:
    name: lva_network
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
# Middleware image: FastAPI orchestrator serving POST /chat (see server.py).
FROM python:3.11-slim
# Flush stdout/stderr immediately so container logs appear in real time.
ENV PYTHONUNBUFFERED=1
WORKDIR /app

# ffmpeg and libsndfile1 cover audio handling for the forwarded WAV files.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is cached across code edits.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r requirements.txt

COPY server.py ./
# Port 8000 matches the middleware service mapping in docker-compose.yml.
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
fastapi==0.100.0
|
||||||
|
uvicorn[standard]==0.22.0
|
||||||
|
httpx==0.24.1
|
||||||
|
python-multipart==0.0.6
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse, Response
import httpx
import tempfile
import os
import shutil
import asyncio


app = FastAPI()


WHISPER_URL = "http://whisper:8001/transcribe"
COQUITTS_URL = "http://coquitts:8002/speak"
# NOTE(review): Ollama's HTTP API is usually served at /api/generate; this
# /v1/complete path is the placeholder assumed below — verify against the
# deployed Ollama version.
OLLAMA_URL = "http://ollama:11434/v1/complete"


@app.post("/chat")
async def chat(file: UploadFile = File(...)):
    """Voice round-trip: audio in -> Whisper -> Ollama -> Coqui TTS -> audio out.

    Returns the synthesized German reply as a WAV body; raises 400 for a
    non-audio upload and 502 when any downstream service fails.
    """
    # content_type can be None when the client omits it; guard before startswith.
    if not (file.content_type or "").startswith("audio"):
        raise HTTPException(status_code=400, detail="File must be audio")

    # Buffer the upload to a temp file so it can be re-opened for the
    # multipart forward to Whisper.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        async with httpx.AsyncClient() as client:
            # 1) Speech-to-text.
            with open(tmp_path, "rb") as f:
                files = {"file": ("audio.wav", f, "audio/wav")}
                r = await client.post(WHISPER_URL, files=files, timeout=120.0)
            if r.status_code != 200:
                raise HTTPException(status_code=502, detail=f"Whisper error: {r.status_code} {r.text}")
            text = r.json().get("text", "")

            # 2) LLM reasoning.
            # We assume Ollama HTTP API accepts JSON {"model":"<model>", "prompt":"..."}
            ollama_payload = {"model": "llama2", "prompt": text}
            ro = await client.post(OLLAMA_URL, json=ollama_payload, timeout=120.0)
            if ro.status_code != 200:
                raise HTTPException(status_code=502, detail=f"Ollama error: {ro.status_code} {ro.text}")
            answer_json = ro.json()
            # Depending on API shape, try the common keys to extract text.
            answer_text = answer_json.get("response") or answer_json.get("text") or answer_json.get("output") or str(answer_json)

            # 3) Text-to-speech (German).
            coquitts_payload = {"text": answer_text, "language": "de"}
            co = await client.post(COQUITTS_URL, json=coquitts_payload, timeout=120.0)
            if co.status_code != 200:
                raise HTTPException(status_code=502, detail=f"CoquiTTS error: {co.status_code} {co.text}")

            # client.post() already buffered the full body, so hand it back
            # directly. (The previous StreamingResponse over co.aiter_bytes()
            # was consumed only after the AsyncClient had been closed.)
            audio = co.content
    finally:
        # Don't leak one temp WAV per request.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    return Response(content=audio, media_type="audio/wav")
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
# Whisper STT service image: FastAPI app serving POST /transcribe (see server.py).
FROM python:3.11-slim
# Flush stdout/stderr immediately so container logs appear in real time.
ENV PYTHONUNBUFFERED=1
WORKDIR /app

# Install system dependencies for audio handling and build tools
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    build-essential \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is cached across code edits.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r requirements.txt

COPY server.py ./
# Port 8001 matches the whisper service mapping in docker-compose.yml.
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8001"]
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
fastapi==0.100.0
|
||||||
|
uvicorn[standard]==0.22.0
|
||||||
|
openai-whisper==20230314
|
||||||
|
pydub==0.25.1
|
||||||
|
aiofiles==23.1.0
|
||||||
|
python-multipart==0.0.6
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import whisper
import tempfile
import os
import shutil


app = FastAPI()


# Loaded once at startup; the "small" checkpoint is downloaded on first run.
model = whisper.load_model("small")


@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return {"text": "..."}.

    Raises 400 when the upload does not declare an audio content type.
    """
    # content_type can be None when the client omits it; guard before startswith.
    if not (file.content_type or "").startswith("audio"):
        raise HTTPException(status_code=400, detail="File must be audio")

    # Whisper's transcribe() takes a file path, so buffer the upload to disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        # language=None lets Whisper auto-detect the spoken language.
        result = model.transcribe(tmp_path, language=None)
        text = result.get("text", "")
    finally:
        # Always remove the temp file. (Was `shutil.os.remove`, which only
        # worked because shutil happens to re-export the os module.)
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    return JSONResponse({"text": text})
|
||||||
Reference in New Issue
Block a user