Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fea09e3e23 |
@@ -36,7 +36,12 @@ COPY --from=builder /out/backend /backend
|
||||
ENTRYPOINT ["/backend"]
|
||||
|
||||
# ── runner service ───────────────────────────────────────────────────────────
|
||||
FROM gcr.io/distroless/static:nonroot AS runner
|
||||
# Uses Alpine (not distroless) so ffmpeg is available for WAV→MP3 transcoding
|
||||
# when pocket-tts voices are used.
|
||||
FROM alpine:3.21 AS runner
|
||||
RUN apk add --no-cache ffmpeg ca-certificates && \
|
||||
addgroup -S appgroup && adduser -S appuser -G appgroup
|
||||
COPY --from=builder /out/healthcheck /healthcheck
|
||||
COPY --from=builder /out/runner /runner
|
||||
USER appuser
|
||||
ENTRYPOINT ["/runner"]
|
||||
|
||||
@@ -26,6 +26,7 @@ import (
|
||||
"github.com/libnovel/backend/internal/meili"
|
||||
"github.com/libnovel/backend/internal/novelfire"
|
||||
"github.com/libnovel/backend/internal/otelsetup"
|
||||
"github.com/libnovel/backend/internal/pockettts"
|
||||
"github.com/libnovel/backend/internal/runner"
|
||||
"github.com/libnovel/backend/internal/storage"
|
||||
)
|
||||
@@ -112,10 +113,19 @@ func run() error {
|
||||
kokoroClient = kokoro.New(cfg.Kokoro.URL)
|
||||
log.Info("kokoro TTS enabled", "url", cfg.Kokoro.URL)
|
||||
} else {
|
||||
log.Warn("KOKORO_URL not set — audio tasks will fail")
|
||||
log.Warn("KOKORO_URL not set — kokoro voice tasks will fail")
|
||||
kokoroClient = &noopKokoro{}
|
||||
}
|
||||
|
||||
// ── pocket-tts ──────────────────────────────────────────────────────────
|
||||
var pocketTTSClient pockettts.Client
|
||||
if cfg.PocketTTS.URL != "" {
|
||||
pocketTTSClient = pockettts.New(cfg.PocketTTS.URL)
|
||||
log.Info("pocket-tts enabled", "url", cfg.PocketTTS.URL)
|
||||
} else {
|
||||
log.Warn("POCKET_TTS_URL not set — pocket-tts voice tasks will fail")
|
||||
}
|
||||
|
||||
// ── Meilisearch ─────────────────────────────────────────────────────────
|
||||
var searchIndex meili.Client
|
||||
if cfg.Meilisearch.URL != "" {
|
||||
@@ -151,6 +161,7 @@ func run() error {
|
||||
SearchIndex: searchIndex,
|
||||
Novel: novel,
|
||||
Kokoro: kokoroClient,
|
||||
PocketTTS: pocketTTSClient,
|
||||
Log: log,
|
||||
}
|
||||
r := runner.New(rCfg, deps)
|
||||
|
||||
@@ -50,13 +50,20 @@ type MinIO struct {
|
||||
|
||||
// Kokoro holds connection settings for the Kokoro-FastAPI TTS service.
|
||||
type Kokoro struct {
|
||||
// URL is the base URL of the Kokoro service, e.g. https://kokoro.libnovel.cc
|
||||
// An empty string disables TTS generation.
|
||||
// URL is the base URL of the Kokoro service, e.g. https://tts.libnovel.cc
|
||||
// An empty string disables Kokoro TTS generation.
|
||||
URL string
|
||||
// DefaultVoice is the voice used when none is specified.
|
||||
DefaultVoice string
|
||||
}
|
||||
|
||||
// PocketTTS holds connection settings for the kyutai-labs/pocket-tts service.
|
||||
type PocketTTS struct {
|
||||
// URL is the base URL of the pocket-tts service, e.g. https://pocket-tts.libnovel.cc
|
||||
// An empty string disables pocket-tts generation.
|
||||
URL string
|
||||
}
|
||||
|
||||
// HTTP holds settings for the HTTP server (backend only).
|
||||
type HTTP struct {
|
||||
// Addr is the listen address, e.g. ":8080"
|
||||
@@ -113,6 +120,7 @@ type Config struct {
|
||||
PocketBase PocketBase
|
||||
MinIO MinIO
|
||||
Kokoro Kokoro
|
||||
PocketTTS PocketTTS
|
||||
HTTP HTTP
|
||||
Runner Runner
|
||||
Meilisearch Meilisearch
|
||||
@@ -156,6 +164,10 @@ func Load() Config {
|
||||
DefaultVoice: envOr("KOKORO_VOICE", "af_bella"),
|
||||
},
|
||||
|
||||
PocketTTS: PocketTTS{
|
||||
URL: envOr("POCKET_TTS_URL", ""),
|
||||
},
|
||||
|
||||
HTTP: HTTP{
|
||||
Addr: envOr("BACKEND_HTTP_ADDR", ":8080"),
|
||||
},
|
||||
|
||||
159
backend/internal/pockettts/client.go
Normal file
159
backend/internal/pockettts/client.go
Normal file
@@ -0,0 +1,159 @@
|
||||
// Package pockettts provides a client for the kyutai-labs/pocket-tts TTS service.
|
||||
//
|
||||
// pocket-tts exposes a non-OpenAI API:
|
||||
//
|
||||
// POST /tts (multipart form: text, voice_url) → streaming WAV
|
||||
// GET /health → {"status":"healthy"}
|
||||
//
|
||||
// GenerateAudio streams the WAV response and transcodes it to MP3 using ffmpeg,
|
||||
// so callers receive MP3 bytes — the same format as the kokoro client — and the
|
||||
// rest of the pipeline does not need to care which TTS engine was used.
|
||||
//
|
||||
// Predefined voices (pass the bare name as the voice parameter):
|
||||
//
|
||||
// alba, marius, javert, jean, fantine, cosette, eponine, azelma,
|
||||
// anna, vera, charles, paul, george, mary, jane, michael, eve,
|
||||
// bill_boerst, peter_yearsley, stuart_bell
|
||||
package pockettts
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PredefinedVoices is the set of voice names built into pocket-tts.
|
||||
// The runner uses this to decide which TTS engine to route a task to.
|
||||
var PredefinedVoices = map[string]struct{}{
|
||||
"alba": {}, "marius": {}, "javert": {}, "jean": {},
|
||||
"fantine": {}, "cosette": {}, "eponine": {}, "azelma": {},
|
||||
"anna": {}, "vera": {}, "charles": {}, "paul": {},
|
||||
"george": {}, "mary": {}, "jane": {}, "michael": {},
|
||||
"eve": {}, "bill_boerst": {}, "peter_yearsley": {}, "stuart_bell": {},
|
||||
}
|
||||
|
||||
// IsPocketTTSVoice reports whether voice is served by pocket-tts.
|
||||
func IsPocketTTSVoice(voice string) bool {
|
||||
_, ok := PredefinedVoices[voice]
|
||||
return ok
|
||||
}
|
||||
|
||||
// Client is the interface for interacting with the pocket-tts service.
|
||||
type Client interface {
|
||||
// GenerateAudio synthesises text using the given voice and returns MP3 bytes.
|
||||
// Voice must be one of the predefined pocket-tts voice names.
|
||||
GenerateAudio(ctx context.Context, text, voice string) ([]byte, error)
|
||||
|
||||
// ListVoices returns the available predefined voice names.
|
||||
ListVoices(ctx context.Context) ([]string, error)
|
||||
}
|
||||
|
||||
// httpClient is the concrete pocket-tts HTTP client.
|
||||
type httpClient struct {
|
||||
baseURL string
|
||||
http *http.Client
|
||||
}
|
||||
|
||||
// New returns a Client targeting baseURL (e.g. "https://pocket-tts.libnovel.cc").
|
||||
func New(baseURL string) Client {
|
||||
return &httpClient{
|
||||
baseURL: strings.TrimRight(baseURL, "/"),
|
||||
http: &http.Client{Timeout: 10 * time.Minute},
|
||||
}
|
||||
}
|
||||
|
||||
// GenerateAudio posts to POST /tts and transcodes the WAV response to MP3
|
||||
// using the system ffmpeg binary. Requires ffmpeg to be on PATH (available in
|
||||
// the runner Docker image via Alpine's ffmpeg package).
|
||||
func (c *httpClient) GenerateAudio(ctx context.Context, text, voice string) ([]byte, error) {
|
||||
if text == "" {
|
||||
return nil, fmt.Errorf("pockettts: empty text")
|
||||
}
|
||||
if voice == "" {
|
||||
voice = "alba"
|
||||
}
|
||||
|
||||
// ── Build multipart form ──────────────────────────────────────────────────
|
||||
var body bytes.Buffer
|
||||
mw := multipart.NewWriter(&body)
|
||||
|
||||
if err := mw.WriteField("text", text); err != nil {
|
||||
return nil, fmt.Errorf("pockettts: write text field: %w", err)
|
||||
}
|
||||
// pocket-tts accepts a predefined voice name as voice_url.
|
||||
if err := mw.WriteField("voice_url", voice); err != nil {
|
||||
return nil, fmt.Errorf("pockettts: write voice_url field: %w", err)
|
||||
}
|
||||
if err := mw.Close(); err != nil {
|
||||
return nil, fmt.Errorf("pockettts: close multipart writer: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost,
|
||||
c.baseURL+"/tts", &body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("pockettts: build request: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", mw.FormDataContentType())
|
||||
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("pockettts: request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
_, _ = io.Copy(io.Discard, resp.Body)
|
||||
return nil, fmt.Errorf("pockettts: server returned %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
wavData, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("pockettts: read response body: %w", err)
|
||||
}
|
||||
|
||||
// ── Transcode WAV → MP3 via ffmpeg ────────────────────────────────────────
|
||||
mp3Data, err := wavToMP3(ctx, wavData)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("pockettts: transcode to mp3: %w", err)
|
||||
}
|
||||
return mp3Data, nil
|
||||
}
|
||||
|
||||
// ListVoices returns the statically known predefined voice names.
|
||||
// pocket-tts has no REST endpoint for listing voices.
|
||||
func (c *httpClient) ListVoices(_ context.Context) ([]string, error) {
|
||||
voices := make([]string, 0, len(PredefinedVoices))
|
||||
for v := range PredefinedVoices {
|
||||
voices = append(voices, v)
|
||||
}
|
||||
return voices, nil
|
||||
}
|
||||
|
||||
// wavToMP3 converts raw WAV bytes to MP3 using ffmpeg.
|
||||
// ffmpeg reads from stdin (pipe:0) and writes to stdout (pipe:1).
|
||||
func wavToMP3(ctx context.Context, wav []byte) ([]byte, error) {
|
||||
cmd := exec.CommandContext(ctx,
|
||||
"ffmpeg",
|
||||
"-hide_banner", "-loglevel", "error",
|
||||
"-i", "pipe:0", // read WAV from stdin
|
||||
"-f", "mp3", // output format
|
||||
"-q:a", "2", // VBR quality ~190 kbps
|
||||
"pipe:1", // write MP3 to stdout
|
||||
)
|
||||
cmd.Stdin = bytes.NewReader(wav)
|
||||
|
||||
var out, stderr bytes.Buffer
|
||||
cmd.Stdout = &out
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
return nil, fmt.Errorf("ffmpeg: %w (stderr: %s)", err, stderr.String())
|
||||
}
|
||||
return out.Bytes(), nil
|
||||
}
|
||||
@@ -31,6 +31,7 @@ import (
|
||||
"github.com/libnovel/backend/internal/kokoro"
|
||||
"github.com/libnovel/backend/internal/meili"
|
||||
"github.com/libnovel/backend/internal/orchestrator"
|
||||
"github.com/libnovel/backend/internal/pockettts"
|
||||
"github.com/libnovel/backend/internal/scraper"
|
||||
"github.com/libnovel/backend/internal/taskqueue"
|
||||
)
|
||||
@@ -84,8 +85,11 @@ type Dependencies struct {
|
||||
SearchIndex meili.Client
|
||||
// Novel is the scraper implementation.
|
||||
Novel scraper.NovelScraper
|
||||
// Kokoro is the TTS client.
|
||||
// Kokoro is the Kokoro-FastAPI TTS client (GPU, OpenAI-compatible voices).
|
||||
Kokoro kokoro.Client
|
||||
// PocketTTS is the pocket-tts client (CPU, kyutai voices: alba, marius, etc.).
|
||||
// If nil, pocket-tts voice tasks will fail with a clear error.
|
||||
PocketTTS pockettts.Client
|
||||
// Log is the structured logger.
|
||||
Log *slog.Logger
|
||||
}
|
||||
@@ -448,14 +452,31 @@ func (r *Runner) runAudioTask(ctx context.Context, task domain.AudioTask) {
|
||||
return
|
||||
}
|
||||
|
||||
if r.deps.Kokoro == nil {
|
||||
fail("kokoro client not configured")
|
||||
return
|
||||
}
|
||||
audioData, err := r.deps.Kokoro.GenerateAudio(ctx, text, task.Voice)
|
||||
if err != nil {
|
||||
fail(fmt.Sprintf("kokoro generate: %v", err))
|
||||
return
|
||||
var audioData []byte
|
||||
if pockettts.IsPocketTTSVoice(task.Voice) {
|
||||
if r.deps.PocketTTS == nil {
|
||||
fail("pocket-tts client not configured (POCKET_TTS_URL is empty)")
|
||||
return
|
||||
}
|
||||
var genErr error
|
||||
audioData, genErr = r.deps.PocketTTS.GenerateAudio(ctx, text, task.Voice)
|
||||
if genErr != nil {
|
||||
fail(fmt.Sprintf("pocket-tts generate: %v", genErr))
|
||||
return
|
||||
}
|
||||
log.Info("runner: audio generated via pocket-tts", "voice", task.Voice)
|
||||
} else {
|
||||
if r.deps.Kokoro == nil {
|
||||
fail("kokoro client not configured (KOKORO_URL is empty)")
|
||||
return
|
||||
}
|
||||
var genErr error
|
||||
audioData, genErr = r.deps.Kokoro.GenerateAudio(ctx, text, task.Voice)
|
||||
if genErr != nil {
|
||||
fail(fmt.Sprintf("kokoro generate: %v", genErr))
|
||||
return
|
||||
}
|
||||
log.Info("runner: audio generated via kokoro-fastapi", "voice", task.Voice)
|
||||
}
|
||||
|
||||
key := r.deps.AudioStore.AudioObjectKey(task.Slug, task.Chapter, task.Voice)
|
||||
|
||||
@@ -61,6 +61,8 @@ services:
|
||||
KOKORO_URL: "http://kokoro-fastapi:8880"
|
||||
KOKORO_VOICE: "${KOKORO_VOICE}"
|
||||
|
||||
POCKET_TTS_URL: "http://pocket-tts:8000"
|
||||
|
||||
RUNNER_WORKER_ID: "${RUNNER_WORKER_ID}"
|
||||
RUNNER_POLL_INTERVAL: "${RUNNER_POLL_INTERVAL}"
|
||||
RUNNER_MAX_CONCURRENT_SCRAPE: "${RUNNER_MAX_CONCURRENT_SCRAPE}"
|
||||
|
||||
Reference in New Issue
Block a user