Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9c8849c6cd | ||
|
|
b30aa23d64 | ||
|
|
fea09e3e23 |
@@ -36,7 +36,12 @@ COPY --from=builder /out/backend /backend
|
||||
ENTRYPOINT ["/backend"]
|
||||
|
||||
# ── runner service ───────────────────────────────────────────────────────────
|
||||
FROM gcr.io/distroless/static:nonroot AS runner
|
||||
# Uses Alpine (not distroless) so ffmpeg is available for WAV→MP3 transcoding
|
||||
# when pocket-tts voices are used.
|
||||
FROM alpine:3.21 AS runner
|
||||
RUN apk add --no-cache ffmpeg ca-certificates && \
|
||||
addgroup -S appgroup && adduser -S appuser -G appgroup
|
||||
COPY --from=builder /out/healthcheck /healthcheck
|
||||
COPY --from=builder /out/runner /runner
|
||||
USER appuser
|
||||
ENTRYPOINT ["/runner"]
|
||||
|
||||
@@ -26,6 +26,7 @@ import (
|
||||
"github.com/libnovel/backend/internal/meili"
|
||||
"github.com/libnovel/backend/internal/novelfire"
|
||||
"github.com/libnovel/backend/internal/otelsetup"
|
||||
"github.com/libnovel/backend/internal/pockettts"
|
||||
"github.com/libnovel/backend/internal/runner"
|
||||
"github.com/libnovel/backend/internal/storage"
|
||||
)
|
||||
@@ -112,10 +113,19 @@ func run() error {
|
||||
kokoroClient = kokoro.New(cfg.Kokoro.URL)
|
||||
log.Info("kokoro TTS enabled", "url", cfg.Kokoro.URL)
|
||||
} else {
|
||||
log.Warn("KOKORO_URL not set — audio tasks will fail")
|
||||
log.Warn("KOKORO_URL not set — kokoro voice tasks will fail")
|
||||
kokoroClient = &noopKokoro{}
|
||||
}
|
||||
|
||||
// ── pocket-tts ──────────────────────────────────────────────────────────
|
||||
var pocketTTSClient pockettts.Client
|
||||
if cfg.PocketTTS.URL != "" {
|
||||
pocketTTSClient = pockettts.New(cfg.PocketTTS.URL)
|
||||
log.Info("pocket-tts enabled", "url", cfg.PocketTTS.URL)
|
||||
} else {
|
||||
log.Warn("POCKET_TTS_URL not set — pocket-tts voice tasks will fail")
|
||||
}
|
||||
|
||||
// ── Meilisearch ─────────────────────────────────────────────────────────
|
||||
var searchIndex meili.Client
|
||||
if cfg.Meilisearch.URL != "" {
|
||||
@@ -151,6 +161,7 @@ func run() error {
|
||||
SearchIndex: searchIndex,
|
||||
Novel: novel,
|
||||
Kokoro: kokoroClient,
|
||||
PocketTTS: pocketTTSClient,
|
||||
Log: log,
|
||||
}
|
||||
r := runner.New(rCfg, deps)
|
||||
|
||||
@@ -50,13 +50,20 @@ type MinIO struct {
|
||||
|
||||
// Kokoro holds connection settings for the Kokoro-FastAPI TTS service.
|
||||
type Kokoro struct {
|
||||
// URL is the base URL of the Kokoro service, e.g. https://kokoro.libnovel.cc
|
||||
// An empty string disables TTS generation.
|
||||
// URL is the base URL of the Kokoro service, e.g. https://tts.libnovel.cc
|
||||
// An empty string disables Kokoro TTS generation.
|
||||
URL string
|
||||
// DefaultVoice is the voice used when none is specified.
|
||||
DefaultVoice string
|
||||
}
|
||||
|
||||
// PocketTTS groups the connection settings for the kyutai-labs/pocket-tts
// service.
type PocketTTS struct {
	// URL is the pocket-tts base URL, e.g. https://pocket-tts.libnovel.cc.
	// Leaving it empty disables pocket-tts generation.
	URL string
}
|
||||
|
||||
// HTTP holds settings for the HTTP server (backend only).
|
||||
type HTTP struct {
|
||||
// Addr is the listen address, e.g. ":8080"
|
||||
@@ -113,6 +120,7 @@ type Config struct {
|
||||
PocketBase PocketBase
|
||||
MinIO MinIO
|
||||
Kokoro Kokoro
|
||||
PocketTTS PocketTTS
|
||||
HTTP HTTP
|
||||
Runner Runner
|
||||
Meilisearch Meilisearch
|
||||
@@ -156,6 +164,10 @@ func Load() Config {
|
||||
DefaultVoice: envOr("KOKORO_VOICE", "af_bella"),
|
||||
},
|
||||
|
||||
PocketTTS: PocketTTS{
|
||||
URL: envOr("POCKET_TTS_URL", ""),
|
||||
},
|
||||
|
||||
HTTP: HTTP{
|
||||
Addr: envOr("BACKEND_HTTP_ADDR", ":8080"),
|
||||
},
|
||||
|
||||
@@ -2,7 +2,10 @@
|
||||
//
|
||||
// It reads two environment variables:
|
||||
//
|
||||
// OTEL_EXPORTER_OTLP_ENDPOINT — OTLP/HTTP endpoint, e.g. http://otel-collector:4318
|
||||
// OTEL_EXPORTER_OTLP_ENDPOINT — OTLP/HTTP endpoint; accepts either a full
|
||||
// URL ("https://otel.example.com") or a bare
|
||||
// host[:port] ("otel-collector:4318").
|
||||
// TLS is used when the value starts with "https://".
|
||||
// OTEL_SERVICE_NAME — service name reported in traces (default: "backend")
|
||||
//
|
||||
// When OTEL_EXPORTER_OTLP_ENDPOINT is empty the function is a no-op: it
|
||||
@@ -21,6 +24,7 @@ import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"go.opentelemetry.io/contrib/bridges/otelslog"
|
||||
@@ -41,11 +45,17 @@ import (
|
||||
// - logger: an slog.Logger bridged to OTel logs (falls back to default when disabled).
|
||||
// - err: non-nil only on SDK initialisation failure.
|
||||
func Init(ctx context.Context, version string) (shutdown func(), logger *slog.Logger, err error) {
|
||||
endpoint := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
|
||||
if endpoint == "" {
|
||||
rawEndpoint := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
|
||||
if rawEndpoint == "" {
|
||||
return nil, slog.Default(), nil // OTel disabled — not an error
|
||||
}
|
||||
|
||||
// WithEndpoint expects a host[:port] value — no scheme.
|
||||
// Support both "https://otel.example.com" and "otel-collector:4318".
|
||||
useTLS := strings.HasPrefix(rawEndpoint, "https://")
|
||||
endpoint := strings.TrimPrefix(rawEndpoint, "https://")
|
||||
endpoint = strings.TrimPrefix(endpoint, "http://")
|
||||
|
||||
serviceName := os.Getenv("OTEL_SERVICE_NAME")
|
||||
if serviceName == "" {
|
||||
serviceName = "backend"
|
||||
@@ -63,10 +73,11 @@ func Init(ctx context.Context, version string) (shutdown func(), logger *slog.Lo
|
||||
}
|
||||
|
||||
// ── Trace provider ────────────────────────────────────────────────────────
|
||||
traceExp, err := otlptracehttp.New(ctx,
|
||||
otlptracehttp.WithEndpoint(endpoint),
|
||||
otlptracehttp.WithInsecure(), // collector is on the internal Docker network
|
||||
)
|
||||
traceOpts := []otlptracehttp.Option{otlptracehttp.WithEndpoint(endpoint)}
|
||||
if !useTLS {
|
||||
traceOpts = append(traceOpts, otlptracehttp.WithInsecure())
|
||||
}
|
||||
traceExp, err := otlptracehttp.New(ctx, traceOpts...)
|
||||
if err != nil {
|
||||
return nil, slog.Default(), fmt.Errorf("otelsetup: create OTLP trace exporter: %w", err)
|
||||
}
|
||||
@@ -79,10 +90,11 @@ func Init(ctx context.Context, version string) (shutdown func(), logger *slog.Lo
|
||||
otel.SetTracerProvider(tp)
|
||||
|
||||
// ── Log provider ──────────────────────────────────────────────────────────
|
||||
logExp, err := otlploghttp.New(ctx,
|
||||
otlploghttp.WithEndpoint(endpoint),
|
||||
otlploghttp.WithInsecure(),
|
||||
)
|
||||
logOpts := []otlploghttp.Option{otlploghttp.WithEndpoint(endpoint)}
|
||||
if !useTLS {
|
||||
logOpts = append(logOpts, otlploghttp.WithInsecure())
|
||||
}
|
||||
logExp, err := otlploghttp.New(ctx, logOpts...)
|
||||
if err != nil {
|
||||
return nil, slog.Default(), fmt.Errorf("otelsetup: create OTLP log exporter: %w", err)
|
||||
}
|
||||
|
||||
159
backend/internal/pockettts/client.go
Normal file
159
backend/internal/pockettts/client.go
Normal file
@@ -0,0 +1,159 @@
|
||||
// Package pockettts provides a client for the kyutai-labs/pocket-tts TTS service.
|
||||
//
|
||||
// pocket-tts exposes a non-OpenAI API:
|
||||
//
|
||||
// POST /tts (multipart form: text, voice_url) → streaming WAV
|
||||
// GET /health → {"status":"healthy"}
|
||||
//
|
||||
// GenerateAudio streams the WAV response and transcodes it to MP3 using ffmpeg,
|
||||
// so callers receive MP3 bytes — the same format as the kokoro client — and the
|
||||
// rest of the pipeline does not need to care which TTS engine was used.
|
||||
//
|
||||
// Predefined voices (pass the bare name as the voice parameter):
|
||||
//
|
||||
// alba, marius, javert, jean, fantine, cosette, eponine, azelma,
|
||||
// anna, vera, charles, paul, george, mary, jane, michael, eve,
|
||||
// bill_boerst, peter_yearsley, stuart_bell
|
||||
package pockettts
|
||||
|
||||
import (
	"bytes"
	"context"
	"fmt"
	"io"
	"mime/multipart"
	"net/http"
	"os/exec"
	"sort"
	"strings"
	"time"
)
|
||||
|
||||
// PredefinedVoices holds the voice names that ship with pocket-tts, used as a
// set (the values carry no data). The runner consults it to decide which TTS
// engine a task is routed to.
var PredefinedVoices = map[string]struct{}{
	"alba":           {},
	"marius":         {},
	"javert":         {},
	"jean":           {},
	"fantine":        {},
	"cosette":        {},
	"eponine":        {},
	"azelma":         {},
	"anna":           {},
	"vera":           {},
	"charles":        {},
	"paul":           {},
	"george":         {},
	"mary":           {},
	"jane":           {},
	"michael":        {},
	"eve":            {},
	"bill_boerst":    {},
	"peter_yearsley": {},
	"stuart_bell":    {},
}
|
||||
|
||||
// IsPocketTTSVoice reports whether voice is served by pocket-tts.
|
||||
func IsPocketTTSVoice(voice string) bool {
|
||||
_, ok := PredefinedVoices[voice]
|
||||
return ok
|
||||
}
|
||||
|
||||
// Client describes the operations available against a pocket-tts service.
type Client interface {
	// ListVoices reports the predefined voice names that are available.
	ListVoices(ctx context.Context) ([]string, error)

	// GenerateAudio synthesises text with the given voice and returns the
	// result as MP3 bytes. Voice must be one of the predefined pocket-tts
	// voice names.
	GenerateAudio(ctx context.Context, text, voice string) ([]byte, error)
}
|
||||
|
||||
// httpClient talks to a pocket-tts server over HTTP.
type httpClient struct {
	// baseURL is the service root that request paths such as "/tts" are
	// appended to.
	baseURL string
	// http is the underlying client used to send every request.
	http *http.Client
}
|
||||
|
||||
// New returns a Client targeting baseURL (e.g. "https://pocket-tts.libnovel.cc").
|
||||
func New(baseURL string) Client {
|
||||
return &httpClient{
|
||||
baseURL: strings.TrimRight(baseURL, "/"),
|
||||
http: &http.Client{Timeout: 10 * time.Minute},
|
||||
}
|
||||
}
|
||||
|
||||
// GenerateAudio posts to POST /tts and transcodes the WAV response to MP3
|
||||
// using the system ffmpeg binary. Requires ffmpeg to be on PATH (available in
|
||||
// the runner Docker image via Alpine's ffmpeg package).
|
||||
func (c *httpClient) GenerateAudio(ctx context.Context, text, voice string) ([]byte, error) {
|
||||
if text == "" {
|
||||
return nil, fmt.Errorf("pockettts: empty text")
|
||||
}
|
||||
if voice == "" {
|
||||
voice = "alba"
|
||||
}
|
||||
|
||||
// ── Build multipart form ──────────────────────────────────────────────────
|
||||
var body bytes.Buffer
|
||||
mw := multipart.NewWriter(&body)
|
||||
|
||||
if err := mw.WriteField("text", text); err != nil {
|
||||
return nil, fmt.Errorf("pockettts: write text field: %w", err)
|
||||
}
|
||||
// pocket-tts accepts a predefined voice name as voice_url.
|
||||
if err := mw.WriteField("voice_url", voice); err != nil {
|
||||
return nil, fmt.Errorf("pockettts: write voice_url field: %w", err)
|
||||
}
|
||||
if err := mw.Close(); err != nil {
|
||||
return nil, fmt.Errorf("pockettts: close multipart writer: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost,
|
||||
c.baseURL+"/tts", &body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("pockettts: build request: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", mw.FormDataContentType())
|
||||
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("pockettts: request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
_, _ = io.Copy(io.Discard, resp.Body)
|
||||
return nil, fmt.Errorf("pockettts: server returned %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
wavData, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("pockettts: read response body: %w", err)
|
||||
}
|
||||
|
||||
// ── Transcode WAV → MP3 via ffmpeg ────────────────────────────────────────
|
||||
mp3Data, err := wavToMP3(ctx, wavData)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("pockettts: transcode to mp3: %w", err)
|
||||
}
|
||||
return mp3Data, nil
|
||||
}
|
||||
|
||||
// ListVoices returns the statically known predefined voice names.
|
||||
// pocket-tts has no REST endpoint for listing voices.
|
||||
func (c *httpClient) ListVoices(_ context.Context) ([]string, error) {
|
||||
voices := make([]string, 0, len(PredefinedVoices))
|
||||
for v := range PredefinedVoices {
|
||||
voices = append(voices, v)
|
||||
}
|
||||
return voices, nil
|
||||
}
|
||||
|
||||
// wavToMP3 converts WAV bytes to MP3 by piping them through the ffmpeg binary
// found on PATH: ffmpeg reads from stdin (pipe:0) and writes to stdout (pipe:1).
//
// Empty input is rejected without starting ffmpeg. If ffmpeg fails and ctx has
// already been cancelled or has expired, the returned error wraps ctx.Err() so
// callers can match it with errors.Is; otherwise it wraps the process error.
// In both cases ffmpeg's stderr output is included in the message.
func wavToMP3(ctx context.Context, wav []byte) ([]byte, error) {
	if len(wav) == 0 {
		return nil, fmt.Errorf("ffmpeg: empty WAV input")
	}

	cmd := exec.CommandContext(ctx,
		"ffmpeg",
		"-hide_banner", "-loglevel", "error",
		"-i", "pipe:0", // read WAV from stdin
		"-f", "mp3", // output format
		"-q:a", "2", // VBR quality ~190 kbps
		"pipe:1", // write MP3 to stdout
	)
	cmd.Stdin = bytes.NewReader(wav)

	var out, stderr bytes.Buffer
	cmd.Stdout = &out
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Prefer the context error: a cancelled run otherwise surfaces as an
		// opaque process error that callers cannot match.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return nil, fmt.Errorf("ffmpeg: %w (stderr: %s)", ctxErr, stderr.String())
		}
		return nil, fmt.Errorf("ffmpeg: %w (stderr: %s)", err, stderr.String())
	}
	return out.Bytes(), nil
}
|
||||
@@ -31,6 +31,7 @@ import (
|
||||
"github.com/libnovel/backend/internal/kokoro"
|
||||
"github.com/libnovel/backend/internal/meili"
|
||||
"github.com/libnovel/backend/internal/orchestrator"
|
||||
"github.com/libnovel/backend/internal/pockettts"
|
||||
"github.com/libnovel/backend/internal/scraper"
|
||||
"github.com/libnovel/backend/internal/taskqueue"
|
||||
)
|
||||
@@ -84,8 +85,11 @@ type Dependencies struct {
|
||||
SearchIndex meili.Client
|
||||
// Novel is the scraper implementation.
|
||||
Novel scraper.NovelScraper
|
||||
// Kokoro is the TTS client.
|
||||
// Kokoro is the Kokoro-FastAPI TTS client (GPU, OpenAI-compatible voices).
|
||||
Kokoro kokoro.Client
|
||||
// PocketTTS is the pocket-tts client (CPU, kyutai voices: alba, marius, etc.).
|
||||
// If nil, pocket-tts voice tasks will fail with a clear error.
|
||||
PocketTTS pockettts.Client
|
||||
// Log is the structured logger.
|
||||
Log *slog.Logger
|
||||
}
|
||||
@@ -448,14 +452,31 @@ func (r *Runner) runAudioTask(ctx context.Context, task domain.AudioTask) {
|
||||
return
|
||||
}
|
||||
|
||||
if r.deps.Kokoro == nil {
|
||||
fail("kokoro client not configured")
|
||||
return
|
||||
}
|
||||
audioData, err := r.deps.Kokoro.GenerateAudio(ctx, text, task.Voice)
|
||||
if err != nil {
|
||||
fail(fmt.Sprintf("kokoro generate: %v", err))
|
||||
return
|
||||
var audioData []byte
|
||||
if pockettts.IsPocketTTSVoice(task.Voice) {
|
||||
if r.deps.PocketTTS == nil {
|
||||
fail("pocket-tts client not configured (POCKET_TTS_URL is empty)")
|
||||
return
|
||||
}
|
||||
var genErr error
|
||||
audioData, genErr = r.deps.PocketTTS.GenerateAudio(ctx, text, task.Voice)
|
||||
if genErr != nil {
|
||||
fail(fmt.Sprintf("pocket-tts generate: %v", genErr))
|
||||
return
|
||||
}
|
||||
log.Info("runner: audio generated via pocket-tts", "voice", task.Voice)
|
||||
} else {
|
||||
if r.deps.Kokoro == nil {
|
||||
fail("kokoro client not configured (KOKORO_URL is empty)")
|
||||
return
|
||||
}
|
||||
var genErr error
|
||||
audioData, genErr = r.deps.Kokoro.GenerateAudio(ctx, text, task.Voice)
|
||||
if genErr != nil {
|
||||
fail(fmt.Sprintf("kokoro generate: %v", genErr))
|
||||
return
|
||||
}
|
||||
log.Info("runner: audio generated via kokoro-fastapi", "voice", task.Voice)
|
||||
}
|
||||
|
||||
key := r.deps.AudioStore.AudioObjectKey(task.Slug, task.Chapter, task.Voice)
|
||||
|
||||
@@ -61,6 +61,8 @@ services:
|
||||
KOKORO_URL: "http://kokoro-fastapi:8880"
|
||||
KOKORO_VOICE: "${KOKORO_VOICE}"
|
||||
|
||||
POCKET_TTS_URL: "http://pocket-tts:8000"
|
||||
|
||||
RUNNER_WORKER_ID: "${RUNNER_WORKER_ID}"
|
||||
RUNNER_POLL_INTERVAL: "${RUNNER_POLL_INTERVAL}"
|
||||
RUNNER_MAX_CONCURRENT_SCRAPE: "${RUNNER_MAX_CONCURRENT_SCRAPE}"
|
||||
@@ -412,21 +414,27 @@ services:
|
||||
|
||||
# ── pocket-tts (CPU TTS) ────────────────────────────────────────────────────
|
||||
# Lightweight CPU-only TTS using kyutai-labs/pocket-tts.
|
||||
# OpenAI-compatible: POST /v1/audio/speech on port 8000.
|
||||
# Voices: alba, marius, javert, jean, fantine, cosette, eponine, azelma.
|
||||
# Image is built locally on homelab from https://github.com/kyutai-labs/pocket-tts
|
||||
# (no prebuilt image published): cd /tmp && git clone --depth=1 https://github.com/kyutai-labs/pocket-tts.git && docker build -t pocket-tts:latest /tmp/pocket-tts
|
||||
# Not OpenAI-compatible: POST /tts (multipart form: text, voice_url) on port 8000.
|
||||
# Voices: alba, marius, javert, jean, fantine, cosette, eponine, azelma, etc.
|
||||
# Used by the runner for the pocket-tts voices listed above; all other voices
# are routed to kokoro-fastapi.
|
||||
pocket-tts:
|
||||
image: ghcr.io/kyutai-labs/pocket-tts:latest
|
||||
image: pocket-tts:latest
|
||||
restart: unless-stopped
|
||||
command: ["uv", "run", "pocket-tts", "serve", "--host", "0.0.0.0"]
|
||||
expose:
|
||||
- "8000"
|
||||
volumes:
|
||||
- pocket_tts_cache:/root/.cache/pocket_tts
|
||||
- hf_cache:/root/.cache/huggingface
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-sf", "http://localhost:8000/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
start_period: 60s
|
||||
start_period: 120s
|
||||
|
||||
# ── Watchtower ──────────────────────────────────────────────────────────────
|
||||
# Auto-updates runner image when CI pushes a new tag.
|
||||
@@ -451,3 +459,5 @@ volumes:
|
||||
prometheus_data:
|
||||
loki_data:
|
||||
grafana_data:
|
||||
pocket_tts_cache:
|
||||
hf_cache:
|
||||
|
||||
Reference in New Issue
Block a user