Compare commits

...

3 Commits

Author SHA1 Message Date
Admin
9c8849c6cd fix(otel): accept full https:// URL in OTEL_EXPORTER_OTLP_ENDPOINT
Some checks failed
Release / Check ui (push) Successful in 39s
Release / Docker / caddy (push) Successful in 1m4s
Release / Test backend (push) Successful in 1m38s
Release / Docker / runner (push) Successful in 39s
Release / Docker / backend (push) Successful in 2m12s
Release / Docker / ui (push) Successful in 1m0s
Release / Upload source maps (push) Failing after 5m19s
Release / Gitea Release (push) Has been skipped
WithEndpoint expects host[:port] with no scheme. When Doppler has
https://otel.libnovel.cc the backend was crashing with 'invalid port'.
Now strip the scheme and enable TLS when prefix is https://.
2026-03-27 19:38:50 +05:00
Admin
b30aa23d64 fix(homelab): pocket-tts uses locally-built image, correct start command and volumes
Some checks failed
CI / Check ui (pull_request) Successful in 35s
CI / Test backend (pull_request) Successful in 39s
CI / Docker / ui (pull_request) Successful in 1m27s
CI / Docker / backend (pull_request) Successful in 1m57s
CI / Docker / runner (pull_request) Failing after 35s
CI / Docker / caddy (pull_request) Successful in 6m14s
- pocket-tts has no prebuilt image on ghcr.io — must be built from source on homelab
- Updated image to pocket-tts:latest (local tag), add command with --host 0.0.0.0
- Add model cache volumes (pocket_tts_cache, hf_cache) so model weights survive restarts
- start_period increased to 120s (first startup downloads weights)
2026-03-27 16:29:36 +05:00
Admin
fea09e3e23 feat(tts): add pocket-tts engine alongside kokoro-fastapi
Some checks failed
Release / Test backend (push) Successful in 36s
Release / Check ui (push) Successful in 50s
Release / Docker / caddy (push) Failing after 45s
CI / Test backend (pull_request) Successful in 39s
CI / Check ui (pull_request) Failing after 11s
CI / Docker / ui (pull_request) Has been skipped
CI / Docker / caddy (pull_request) Failing after 11s
Release / Docker / backend (push) Failing after 48s
Release / Docker / runner (push) Failing after 58s
Release / Upload source maps (push) Failing after 39s
CI / Docker / backend (pull_request) Failing after 34s
CI / Docker / runner (pull_request) Failing after 30s
Release / Docker / ui (push) Successful in 2m42s
Release / Gitea Release (push) Has been skipped
- New backend/internal/pockettts package: POST /tts client for
  kyutai-labs/pocket-tts; streams WAV response and transcodes to MP3
  via ffmpeg so the rest of the pipeline stays format-agnostic
- config.PocketTTS struct + POCKET_TTS_URL env var
- runner.Dependencies.PocketTTS field; runAudioTask routes by voice name:
  pockettts.IsPocketTTSVoice (alba, marius, javert, …) → pocket-tts,
  everything else → kokoro-fastapi
- Dockerfile: runner stage switched from distroless/static to alpine:3.21
  with ffmpeg + ca-certificates installed
- homelab compose: runner gets POCKET_TTS_URL=http://pocket-tts:8000
- Doppler prd + prd_homelab: KOKORO_URL=https://tts.libnovel.cc,
  POCKET_TTS_URL=https://pocket-tts.libnovel.cc
2026-03-27 16:17:13 +05:00
7 changed files with 258 additions and 28 deletions

View File

@@ -36,7 +36,12 @@ COPY --from=builder /out/backend /backend
ENTRYPOINT ["/backend"]
# ── runner service ───────────────────────────────────────────────────────────
FROM gcr.io/distroless/static:nonroot AS runner
# Uses Alpine (not distroless) so ffmpeg is available for WAV→MP3 transcoding
# when pocket-tts voices are used.
FROM alpine:3.21 AS runner
RUN apk add --no-cache ffmpeg ca-certificates && \
addgroup -S appgroup && adduser -S appuser -G appgroup
COPY --from=builder /out/healthcheck /healthcheck
COPY --from=builder /out/runner /runner
USER appuser
ENTRYPOINT ["/runner"]

View File

@@ -26,6 +26,7 @@ import (
"github.com/libnovel/backend/internal/meili"
"github.com/libnovel/backend/internal/novelfire"
"github.com/libnovel/backend/internal/otelsetup"
"github.com/libnovel/backend/internal/pockettts"
"github.com/libnovel/backend/internal/runner"
"github.com/libnovel/backend/internal/storage"
)
@@ -112,10 +113,19 @@ func run() error {
kokoroClient = kokoro.New(cfg.Kokoro.URL)
log.Info("kokoro TTS enabled", "url", cfg.Kokoro.URL)
} else {
log.Warn("KOKORO_URL not set — audio tasks will fail")
log.Warn("KOKORO_URL not set — kokoro voice tasks will fail")
kokoroClient = &noopKokoro{}
}
// ── pocket-tts ──────────────────────────────────────────────────────────
var pocketTTSClient pockettts.Client
if cfg.PocketTTS.URL != "" {
pocketTTSClient = pockettts.New(cfg.PocketTTS.URL)
log.Info("pocket-tts enabled", "url", cfg.PocketTTS.URL)
} else {
log.Warn("POCKET_TTS_URL not set — pocket-tts voice tasks will fail")
}
// ── Meilisearch ─────────────────────────────────────────────────────────
var searchIndex meili.Client
if cfg.Meilisearch.URL != "" {
@@ -151,6 +161,7 @@ func run() error {
SearchIndex: searchIndex,
Novel: novel,
Kokoro: kokoroClient,
PocketTTS: pocketTTSClient,
Log: log,
}
r := runner.New(rCfg, deps)

View File

@@ -50,13 +50,20 @@ type MinIO struct {
// Kokoro holds connection settings for the Kokoro-FastAPI TTS service.
type Kokoro struct {
// URL is the base URL of the Kokoro service, e.g. https://kokoro.libnovel.cc
// An empty string disables TTS generation.
// URL is the base URL of the Kokoro service, e.g. https://tts.libnovel.cc
// An empty string disables Kokoro TTS generation.
URL string
// DefaultVoice is the voice used when none is specified.
DefaultVoice string
}
// PocketTTS holds connection settings for the kyutai-labs/pocket-tts service.
type PocketTTS struct {
// URL is the base URL of the pocket-tts service, e.g. https://pocket-tts.libnovel.cc
// An empty string disables pocket-tts generation.
URL string
}
// HTTP holds settings for the HTTP server (backend only).
type HTTP struct {
// Addr is the listen address, e.g. ":8080"
@@ -113,6 +120,7 @@ type Config struct {
PocketBase PocketBase
MinIO MinIO
Kokoro Kokoro
PocketTTS PocketTTS
HTTP HTTP
Runner Runner
Meilisearch Meilisearch
@@ -156,6 +164,10 @@ func Load() Config {
DefaultVoice: envOr("KOKORO_VOICE", "af_bella"),
},
PocketTTS: PocketTTS{
URL: envOr("POCKET_TTS_URL", ""),
},
HTTP: HTTP{
Addr: envOr("BACKEND_HTTP_ADDR", ":8080"),
},

View File

@@ -2,7 +2,10 @@
//
// It reads two environment variables:
//
// OTEL_EXPORTER_OTLP_ENDPOINT — OTLP/HTTP endpoint, e.g. http://otel-collector:4318
// OTEL_EXPORTER_OTLP_ENDPOINT — OTLP/HTTP endpoint; accepts either a full
// URL ("https://otel.example.com") or a bare
// host[:port] ("otel-collector:4318").
// TLS is used when the value starts with "https://".
// OTEL_SERVICE_NAME — service name reported in traces (default: "backend")
//
// When OTEL_EXPORTER_OTLP_ENDPOINT is empty the function is a no-op: it
@@ -21,6 +24,7 @@ import (
"fmt"
"log/slog"
"os"
"strings"
"time"
"go.opentelemetry.io/contrib/bridges/otelslog"
@@ -41,11 +45,17 @@ import (
// - logger: an slog.Logger bridged to OTel logs (falls back to default when disabled).
// - err: non-nil only on SDK initialisation failure.
func Init(ctx context.Context, version string) (shutdown func(), logger *slog.Logger, err error) {
endpoint := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
if endpoint == "" {
rawEndpoint := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
if rawEndpoint == "" {
return nil, slog.Default(), nil // OTel disabled — not an error
}
// WithEndpoint expects a host[:port] value — no scheme.
// Support both "https://otel.example.com" and "otel-collector:4318".
useTLS := strings.HasPrefix(rawEndpoint, "https://")
endpoint := strings.TrimPrefix(rawEndpoint, "https://")
endpoint = strings.TrimPrefix(endpoint, "http://")
serviceName := os.Getenv("OTEL_SERVICE_NAME")
if serviceName == "" {
serviceName = "backend"
@@ -63,10 +73,11 @@ func Init(ctx context.Context, version string) (shutdown func(), logger *slog.Lo
}
// ── Trace provider ────────────────────────────────────────────────────────
traceExp, err := otlptracehttp.New(ctx,
otlptracehttp.WithEndpoint(endpoint),
otlptracehttp.WithInsecure(), // collector is on the internal Docker network
)
traceOpts := []otlptracehttp.Option{otlptracehttp.WithEndpoint(endpoint)}
if !useTLS {
traceOpts = append(traceOpts, otlptracehttp.WithInsecure())
}
traceExp, err := otlptracehttp.New(ctx, traceOpts...)
if err != nil {
return nil, slog.Default(), fmt.Errorf("otelsetup: create OTLP trace exporter: %w", err)
}
@@ -79,10 +90,11 @@ func Init(ctx context.Context, version string) (shutdown func(), logger *slog.Lo
otel.SetTracerProvider(tp)
// ── Log provider ──────────────────────────────────────────────────────────
logExp, err := otlploghttp.New(ctx,
otlploghttp.WithEndpoint(endpoint),
otlploghttp.WithInsecure(),
)
logOpts := []otlploghttp.Option{otlploghttp.WithEndpoint(endpoint)}
if !useTLS {
logOpts = append(logOpts, otlploghttp.WithInsecure())
}
logExp, err := otlploghttp.New(ctx, logOpts...)
if err != nil {
return nil, slog.Default(), fmt.Errorf("otelsetup: create OTLP log exporter: %w", err)
}

View File

@@ -0,0 +1,159 @@
// Package pockettts provides a client for the kyutai-labs/pocket-tts TTS service.
//
// pocket-tts exposes a non-OpenAI API:
//
// POST /tts (multipart form: text, voice_url) → streaming WAV
// GET /health → {"status":"healthy"}
//
// GenerateAudio streams the WAV response and transcodes it to MP3 using ffmpeg,
// so callers receive MP3 bytes — the same format as the kokoro client — and the
// rest of the pipeline does not need to care which TTS engine was used.
//
// Predefined voices (pass the bare name as the voice parameter):
//
// alba, marius, javert, jean, fantine, cosette, eponine, azelma,
// anna, vera, charles, paul, george, mary, jane, michael, eve,
// bill_boerst, peter_yearsley, stuart_bell
package pockettts
import (
"bytes"
"context"
"fmt"
"io"
"mime/multipart"
"net/http"
"os/exec"
"strings"
"time"
)
// PredefinedVoices is the set of voice names built into pocket-tts.
// The runner uses this to decide which TTS engine to route a task to.
var PredefinedVoices = map[string]struct{}{
"alba": {}, "marius": {}, "javert": {}, "jean": {},
"fantine": {}, "cosette": {}, "eponine": {}, "azelma": {},
"anna": {}, "vera": {}, "charles": {}, "paul": {},
"george": {}, "mary": {}, "jane": {}, "michael": {},
"eve": {}, "bill_boerst": {}, "peter_yearsley": {}, "stuart_bell": {},
}
// IsPocketTTSVoice reports whether voice is served by pocket-tts.
func IsPocketTTSVoice(voice string) bool {
_, ok := PredefinedVoices[voice]
return ok
}
// Client is the interface for interacting with the pocket-tts service.
type Client interface {
// GenerateAudio synthesises text using the given voice and returns MP3 bytes.
// Voice must be one of the predefined pocket-tts voice names.
GenerateAudio(ctx context.Context, text, voice string) ([]byte, error)
// ListVoices returns the available predefined voice names.
ListVoices(ctx context.Context) ([]string, error)
}
// httpClient is the concrete pocket-tts HTTP client.
type httpClient struct {
baseURL string
http *http.Client
}
// New returns a Client targeting baseURL (e.g. "https://pocket-tts.libnovel.cc").
func New(baseURL string) Client {
return &httpClient{
baseURL: strings.TrimRight(baseURL, "/"),
http: &http.Client{Timeout: 10 * time.Minute},
}
}
// GenerateAudio posts to POST /tts and transcodes the WAV response to MP3
// using the system ffmpeg binary. Requires ffmpeg to be on PATH (available in
// the runner Docker image via Alpine's ffmpeg package).
func (c *httpClient) GenerateAudio(ctx context.Context, text, voice string) ([]byte, error) {
if text == "" {
return nil, fmt.Errorf("pockettts: empty text")
}
if voice == "" {
voice = "alba"
}
// ── Build multipart form ──────────────────────────────────────────────────
var body bytes.Buffer
mw := multipart.NewWriter(&body)
if err := mw.WriteField("text", text); err != nil {
return nil, fmt.Errorf("pockettts: write text field: %w", err)
}
// pocket-tts accepts a predefined voice name as voice_url.
if err := mw.WriteField("voice_url", voice); err != nil {
return nil, fmt.Errorf("pockettts: write voice_url field: %w", err)
}
if err := mw.Close(); err != nil {
return nil, fmt.Errorf("pockettts: close multipart writer: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost,
c.baseURL+"/tts", &body)
if err != nil {
return nil, fmt.Errorf("pockettts: build request: %w", err)
}
req.Header.Set("Content-Type", mw.FormDataContentType())
resp, err := c.http.Do(req)
if err != nil {
return nil, fmt.Errorf("pockettts: request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
_, _ = io.Copy(io.Discard, resp.Body)
return nil, fmt.Errorf("pockettts: server returned %d", resp.StatusCode)
}
wavData, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("pockettts: read response body: %w", err)
}
// ── Transcode WAV → MP3 via ffmpeg ────────────────────────────────────────
mp3Data, err := wavToMP3(ctx, wavData)
if err != nil {
return nil, fmt.Errorf("pockettts: transcode to mp3: %w", err)
}
return mp3Data, nil
}
// ListVoices returns the statically known predefined voice names.
// pocket-tts has no REST endpoint for listing voices.
func (c *httpClient) ListVoices(_ context.Context) ([]string, error) {
voices := make([]string, 0, len(PredefinedVoices))
for v := range PredefinedVoices {
voices = append(voices, v)
}
return voices, nil
}
// wavToMP3 converts raw WAV bytes to MP3 using ffmpeg.
// ffmpeg reads from stdin (pipe:0) and writes to stdout (pipe:1).
func wavToMP3(ctx context.Context, wav []byte) ([]byte, error) {
cmd := exec.CommandContext(ctx,
"ffmpeg",
"-hide_banner", "-loglevel", "error",
"-i", "pipe:0", // read WAV from stdin
"-f", "mp3", // output format
"-q:a", "2", // VBR quality ~190 kbps
"pipe:1", // write MP3 to stdout
)
cmd.Stdin = bytes.NewReader(wav)
var out, stderr bytes.Buffer
cmd.Stdout = &out
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
return nil, fmt.Errorf("ffmpeg: %w (stderr: %s)", err, stderr.String())
}
return out.Bytes(), nil
}

View File

@@ -31,6 +31,7 @@ import (
"github.com/libnovel/backend/internal/kokoro"
"github.com/libnovel/backend/internal/meili"
"github.com/libnovel/backend/internal/orchestrator"
"github.com/libnovel/backend/internal/pockettts"
"github.com/libnovel/backend/internal/scraper"
"github.com/libnovel/backend/internal/taskqueue"
)
@@ -84,8 +85,11 @@ type Dependencies struct {
SearchIndex meili.Client
// Novel is the scraper implementation.
Novel scraper.NovelScraper
// Kokoro is the TTS client.
// Kokoro is the Kokoro-FastAPI TTS client (GPU, OpenAI-compatible voices).
Kokoro kokoro.Client
// PocketTTS is the pocket-tts client (CPU, kyutai voices: alba, marius, etc.).
// If nil, pocket-tts voice tasks will fail with a clear error.
PocketTTS pockettts.Client
// Log is the structured logger.
Log *slog.Logger
}
@@ -448,14 +452,31 @@ func (r *Runner) runAudioTask(ctx context.Context, task domain.AudioTask) {
return
}
if r.deps.Kokoro == nil {
fail("kokoro client not configured")
return
}
audioData, err := r.deps.Kokoro.GenerateAudio(ctx, text, task.Voice)
if err != nil {
fail(fmt.Sprintf("kokoro generate: %v", err))
return
var audioData []byte
if pockettts.IsPocketTTSVoice(task.Voice) {
if r.deps.PocketTTS == nil {
fail("pocket-tts client not configured (POCKET_TTS_URL is empty)")
return
}
var genErr error
audioData, genErr = r.deps.PocketTTS.GenerateAudio(ctx, text, task.Voice)
if genErr != nil {
fail(fmt.Sprintf("pocket-tts generate: %v", genErr))
return
}
log.Info("runner: audio generated via pocket-tts", "voice", task.Voice)
} else {
if r.deps.Kokoro == nil {
fail("kokoro client not configured (KOKORO_URL is empty)")
return
}
var genErr error
audioData, genErr = r.deps.Kokoro.GenerateAudio(ctx, text, task.Voice)
if genErr != nil {
fail(fmt.Sprintf("kokoro generate: %v", genErr))
return
}
log.Info("runner: audio generated via kokoro-fastapi", "voice", task.Voice)
}
key := r.deps.AudioStore.AudioObjectKey(task.Slug, task.Chapter, task.Voice)

View File

@@ -61,6 +61,8 @@ services:
KOKORO_URL: "http://kokoro-fastapi:8880"
KOKORO_VOICE: "${KOKORO_VOICE}"
POCKET_TTS_URL: "http://pocket-tts:8000"
RUNNER_WORKER_ID: "${RUNNER_WORKER_ID}"
RUNNER_POLL_INTERVAL: "${RUNNER_POLL_INTERVAL}"
RUNNER_MAX_CONCURRENT_SCRAPE: "${RUNNER_MAX_CONCURRENT_SCRAPE}"
@@ -412,21 +414,27 @@ services:
# ── pocket-tts (CPU TTS) ────────────────────────────────────────────────────
# Lightweight CPU-only TTS using kyutai-labs/pocket-tts.
# OpenAI-compatible: POST /v1/audio/speech on port 8000.
# Voices: alba, marius, javert, jean, fantine, cosette, eponine, azelma.
# Image is built locally on homelab from https://github.com/kyutai-labs/pocket-tts
# (no prebuilt image published): cd /tmp && git clone --depth=1 https://github.com/kyutai-labs/pocket-tts.git && docker build -t pocket-tts:latest /tmp/pocket-tts
# OpenAI-compatible: POST /tts (multipart form) on port 8000.
# Voices: alba, marius, javert, jean, fantine, cosette, eponine, azelma, etc.
# Not currently used by the runner (runner uses kokoro-fastapi), but available
# for experimentation / fallback.
pocket-tts:
image: ghcr.io/kyutai-labs/pocket-tts:latest
image: pocket-tts:latest
restart: unless-stopped
command: ["uv", "run", "pocket-tts", "serve", "--host", "0.0.0.0"]
expose:
- "8000"
volumes:
- pocket_tts_cache:/root/.cache/pocket_tts
- hf_cache:/root/.cache/huggingface
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:8000/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 60s
start_period: 120s
# ── Watchtower ──────────────────────────────────────────────────────────────
# Auto-updates runner image when CI pushes a new tag.
@@ -451,3 +459,5 @@ volumes:
prometheus_data:
loki_data:
grafana_data:
pocket_tts_cache:
hf_cache: