Compare commits

...

3 Commits

Author SHA1 Message Date
Admin
d2a4edba43 fix: strip paraglide's 'export * as m' after compile in prepare script
Some checks failed
CI / Backend (pull_request) Successful in 54s
CI / UI (pull_request) Successful in 35s
Release / Test backend (push) Successful in 24s
CI / Backend (push) Successful in 1m2s
CI / UI (push) Successful in 1m0s
Release / Check ui (push) Failing after 29s
Release / Docker / ui (push) Has been skipped
Release / Docker / runner (push) Failing after 1m27s
Release / Docker / caddy (push) Successful in 1m31s
Release / Docker / backend (push) Failing after 1m31s
Release / Gitea Release (push) Has been skipped
paraglide-js unconditionally emits 'export * as m from ...' in messages.js
which causes Vite/Rollup SSR to tree-shake all named message imports,
replacing every m.*() call with (void 0)() and crashing every page.
Strip the two offending lines via a Node.js one-liner in the prepare script
so the fix survives every npm ci run in CI.

Also stop tracking messages.js in git since it is always regenerated.
2026-03-30 22:16:02 +05:00
Admin
4e7f8c6266 feat: streaming audio endpoint with MinIO write-through cache
Some checks failed
CI / Backend (push) Failing after 11s
Release / Check ui (push) Failing after 55s
Release / Test backend (push) Successful in 56s
Release / Docker / ui (push) Has been skipped
CI / UI (push) Failing after 1m6s
Release / Docker / caddy (push) Successful in 41s
CI / Backend (pull_request) Successful in 58s
CI / UI (pull_request) Successful in 55s
Release / Docker / runner (push) Failing after 55s
Release / Docker / backend (push) Failing after 1m23s
Release / Gitea Release (push) Has been skipped
Add GET /api/audio-stream/{slug}/{n}?voice= that streams MP3 audio to the
client as TTS generates it, while simultaneously uploading to MinIO. On
subsequent requests the endpoint redirects to the presigned MinIO URL,
skipping generation entirely.

- PocketTTS: StreamAudioMP3 pipes live WAV response body through ffmpeg
  (streaming transcode — no full-buffer wait)
- Kokoro: StreamAudioMP3 uses stream:true mode, returning MP3 frames
  directly without the two-step download-link flow
- AudioStore: PutAudioStream added for multipart MinIO upload from reader
- WriteTimeout bumped 60s → 15min to accommodate full-chapter streams
- X-Accel-Buffering: no header disables Caddy/nginx response buffering
2026-03-30 22:02:36 +05:00
Admin
b0a4cb8b3d fix: remove spurious 'export * as m' from messages.js causing all pages to 500
Some checks failed
CI / Backend (push) Successful in 1m9s
CI / UI (push) Successful in 34s
Release / Test backend (push) Successful in 39s
Release / Docker / caddy (push) Successful in 46s
CI / Backend (pull_request) Successful in 43s
Release / Check ui (push) Successful in 1m21s
CI / UI (pull_request) Successful in 34s
Release / Docker / ui (push) Successful in 2m2s
Release / Docker / backend (push) Failing after 3m8s
Release / Docker / runner (push) Successful in 3m46s
Release / Gitea Release (push) Has been skipped
The paraglide messages.js had an extra 'export * as m from ...' line which
caused Rollup/Vite to tree-shake all actual message function imports in the
SSR bundle. Every m.* call compiled to (void 0)(), crashing every page
server-side with TypeError. Removed the duplicate namespace re-export.
2026-03-30 21:23:37 +05:00
12 changed files with 331 additions and 29 deletions

View File

@@ -15,6 +15,7 @@ package main
import (
"context"
"fmt"
"io"
"log/slog"
"os"
"os/signal"
@@ -195,6 +196,10 @@ func (n *noopKokoro) GenerateAudio(_ context.Context, _, _ string) ([]byte, erro
return nil, fmt.Errorf("kokoro not configured (KOKORO_URL is empty)")
}
func (n *noopKokoro) StreamAudioMP3(_ context.Context, _, _ string) (io.ReadCloser, error) {
return nil, fmt.Errorf("kokoro not configured (KOKORO_URL is empty)")
}
func (n *noopKokoro) ListVoices(_ context.Context) ([]string, error) {
return nil, nil
}

View File

@@ -12,6 +12,7 @@ package main
import (
"context"
"fmt"
"io"
"log/slog"
"os"
"os/signal"
@@ -222,6 +223,10 @@ func (n *noopKokoro) GenerateAudio(_ context.Context, _, _ string) ([]byte, erro
return nil, fmt.Errorf("kokoro not configured (KOKORO_URL is empty)")
}
func (n *noopKokoro) StreamAudioMP3(_ context.Context, _, _ string) (io.ReadCloser, error) {
return nil, fmt.Errorf("kokoro not configured (KOKORO_URL is empty)")
}
func (n *noopKokoro) ListVoices(_ context.Context) ([]string, error) {
return nil, nil
}

View File

@@ -8,7 +8,7 @@ package backend
// handleBrowse, handleSearch
// handleGetRanking, handleGetCover
// handleBookPreview, handleChapterText, handleChapterTextPreview, handleChapterMarkdown, handleReindex
// handleAudioGenerate, handleAudioStatus, handleAudioProxy
// handleAudioGenerate, handleAudioStatus, handleAudioProxy, handleAudioStream
// handleVoices
// handlePresignChapter, handlePresignAudio, handlePresignVoiceSample
// handlePresignAvatarUpload, handlePresignAvatar
@@ -703,6 +703,139 @@ func (s *Server) handleAudioProxy(w http.ResponseWriter, r *http.Request) {
http.Redirect(w, r, presignURL, http.StatusFound)
}
// handleAudioStream handles GET /api/audio-stream/{slug}/{n}.
//
// Fast path: if audio already exists in MinIO, redirects to the presigned URL
// (same as handleAudioProxy) — the client plays from storage immediately.
//
// Slow path (first request): streams MP3 audio directly to the client while
// simultaneously uploading it to MinIO. After the stream completes, any
// pending audio_jobs task for this key is marked done. Subsequent requests hit
// the fast path and skip TTS generation entirely.
//
// Query params: voice (optional, defaults to DefaultVoice)
func (s *Server) handleAudioStream(w http.ResponseWriter, r *http.Request) {
slug := r.PathValue("slug")
n, err := strconv.Atoi(r.PathValue("n"))
if err != nil || n < 1 {
jsonError(w, http.StatusBadRequest, "invalid chapter")
return
}
voice := r.URL.Query().Get("voice")
if voice == "" {
voice = s.cfg.DefaultVoice
}
audioKey := s.deps.AudioStore.AudioObjectKey(slug, n, voice)
// ── Fast path: already in MinIO ───────────────────────────────────────────
if s.deps.AudioStore.AudioExists(r.Context(), audioKey) {
presignURL, err := s.deps.PresignStore.PresignAudio(r.Context(), audioKey, 1*time.Hour)
if err != nil {
s.deps.Log.Error("handleAudioStream: PresignAudio failed", "slug", slug, "n", n, "err", err)
jsonError(w, http.StatusInternalServerError, "presign failed")
return
}
http.Redirect(w, r, presignURL, http.StatusFound)
return
}
// ── Slow path: generate + stream + save ───────────────────────────────────
// Read the chapter text.
raw, err := s.deps.BookReader.ReadChapter(r.Context(), slug, n)
if err != nil {
s.deps.Log.Error("handleAudioStream: ReadChapter failed", "slug", slug, "n", n, "err", err)
jsonError(w, http.StatusNotFound, "chapter not found")
return
}
text := stripMarkdown(raw)
if text == "" {
jsonError(w, http.StatusUnprocessableEntity, "chapter text is empty")
return
}
// Open the TTS stream.
var audioStream io.ReadCloser
if pockettts.IsPocketTTSVoice(voice) {
if s.deps.PocketTTS == nil {
jsonError(w, http.StatusServiceUnavailable, "pocket-tts not configured")
return
}
audioStream, err = s.deps.PocketTTS.StreamAudioMP3(r.Context(), text, voice)
} else {
if s.deps.Kokoro == nil {
jsonError(w, http.StatusServiceUnavailable, "kokoro not configured")
return
}
audioStream, err = s.deps.Kokoro.StreamAudioMP3(r.Context(), text, voice)
}
if err != nil {
s.deps.Log.Error("handleAudioStream: TTS stream failed", "slug", slug, "n", n, "voice", voice, "err", err)
jsonError(w, http.StatusInternalServerError, "tts stream failed")
return
}
defer audioStream.Close()
// Tee: every byte read from audioStream is written to both the HTTP
// response and a pipe that feeds the MinIO upload goroutine.
pr, pw := io.Pipe()
// MinIO upload runs concurrently. Size -1 triggers multipart upload.
uploadDone := make(chan error, 1)
go func() {
uploadDone <- s.deps.AudioStore.PutAudioStream(
context.Background(), // use background — request ctx may cancel after client disconnects
audioKey, pr, -1, "audio/mpeg",
)
}()
w.Header().Set("Content-Type", "audio/mpeg")
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("X-Accel-Buffering", "no") // disable nginx/caddy buffering
w.WriteHeader(http.StatusOK)
flusher, canFlush := w.(http.Flusher)
tee := io.TeeReader(audioStream, pw)
buf := make([]byte, 32*1024)
for {
nr, readErr := tee.Read(buf)
if nr > 0 {
if _, writeErr := w.Write(buf[:nr]); writeErr != nil {
// Client disconnected — abort upload pipe so goroutine exits.
pw.CloseWithError(writeErr)
<-uploadDone
return
}
if canFlush {
flusher.Flush()
}
}
if readErr != nil {
if readErr == io.EOF {
break
}
s.deps.Log.Warn("handleAudioStream: read error mid-stream", "err", readErr)
pw.CloseWithError(readErr)
<-uploadDone
return
}
}
// Signal end of stream to the MinIO upload goroutine.
pw.Close()
if uploadErr := <-uploadDone; uploadErr != nil {
s.deps.Log.Error("handleAudioStream: MinIO upload failed", "key", audioKey, "err", uploadErr)
// Audio was already streamed to the client — just log; don't error.
// The next request will re-stream since the object is absent.
}
// Note: we do not call FinishAudioTask here — the backend has no Consumer.
// handleAudioStatus fast-paths on AudioExists, so the UI will see "done"
// on its next poll as soon as the MinIO object is present.
}
// ── Translation ────────────────────────────────────────────────────────────────
// supportedTranslationLangs is the set of target locales the backend accepts.

View File

@@ -161,6 +161,9 @@ func (s *Server) ListenAndServe(ctx context.Context) error {
mux.HandleFunc("POST /api/audio/{slug}/{n}", s.handleAudioGenerate)
mux.HandleFunc("GET /api/audio/status/{slug}/{n}", s.handleAudioStatus)
mux.HandleFunc("GET /api/audio-proxy/{slug}/{n}", s.handleAudioProxy)
// Streaming audio: serves from MinIO if cached, else streams live TTS
// while simultaneously uploading to MinIO for future requests.
mux.HandleFunc("GET /api/audio-stream/{slug}/{n}", s.handleAudioStream)
// Translation task creation (backend creates task; runner executes via LibreTranslate)
mux.HandleFunc("POST /api/translation/{slug}/{n}", s.handleTranslationGenerate)
@@ -199,7 +202,7 @@ func (s *Server) ListenAndServe(ctx context.Context) error {
Addr: s.cfg.Addr,
Handler: handler,
ReadTimeout: 15 * time.Second,
WriteTimeout: 60 * time.Second,
WriteTimeout: 15 * time.Minute, // audio-stream can take several minutes for a full chapter
IdleTimeout: 60 * time.Second,
}

View File

@@ -14,6 +14,7 @@ package bookstore
import (
"context"
"io"
"time"
"github.com/libnovel/backend/internal/domain"
@@ -87,6 +88,11 @@ type AudioStore interface {
// PutAudio stores raw audio bytes under the given MinIO object key.
PutAudio(ctx context.Context, key string, data []byte) error
// PutAudioStream uploads audio from r to MinIO under key.
// size must be the exact byte length of r, or -1 to use multipart upload.
// contentType should be "audio/mpeg".
PutAudioStream(ctx context.Context, key string, r io.Reader, size int64, contentType string) error
}
// PresignStore generates short-lived URLs — used exclusively by the backend.

View File

@@ -2,6 +2,7 @@ package bookstore_test
import (
"context"
"io"
"testing"
"time"
@@ -54,6 +55,9 @@ func (m *mockStore) RankingFreshEnough(_ context.Context, _ time.Duration) (bool
func (m *mockStore) AudioObjectKey(_ string, _ int, _ string) string { return "" }
func (m *mockStore) AudioExists(_ context.Context, _ string) bool { return false }
func (m *mockStore) PutAudio(_ context.Context, _ string, _ []byte) error { return nil }
func (m *mockStore) PutAudioStream(_ context.Context, _ string, _ io.Reader, _ int64, _ string) error {
return nil
}
// PresignStore
func (m *mockStore) PresignChapter(_ context.Context, _ string, _ int, _ time.Duration) (string, error) {

View File

@@ -21,6 +21,12 @@ type Client interface {
// GenerateAudio synthesises text using voice and returns raw MP3 bytes.
GenerateAudio(ctx context.Context, text, voice string) ([]byte, error)
// StreamAudioMP3 synthesises text and returns an io.ReadCloser that streams
// MP3-encoded audio incrementally. Uses the kokoro-fastapi streaming mode
// (stream:true), which delivers MP3 frames as they are generated without
// waiting for the full output. The caller must always close the ReadCloser.
StreamAudioMP3(ctx context.Context, text, voice string) (io.ReadCloser, error)
// ListVoices returns the available voice IDs. Falls back to an empty slice
// on error — callers should treat an empty list as "service unavailable".
ListVoices(ctx context.Context) ([]string, error)
@@ -118,6 +124,49 @@ func (c *httpClient) GenerateAudio(ctx context.Context, text, voice string) ([]b
return data, nil
}
// StreamAudioMP3 calls POST /v1/audio/speech with stream:true and returns an
// io.ReadCloser that delivers MP3 frames as kokoro generates them.
// kokoro-fastapi emits raw MP3 bytes when stream mode is enabled — no download
// redirect; the response body IS the audio stream.
func (c *httpClient) StreamAudioMP3(ctx context.Context, text, voice string) (io.ReadCloser, error) {
if text == "" {
return nil, fmt.Errorf("kokoro: empty text")
}
if voice == "" {
voice = "af_bella"
}
reqBody, err := json.Marshal(map[string]any{
"model": "kokoro",
"input": text,
"voice": voice,
"response_format": "mp3",
"speed": 1.0,
"stream": true,
})
if err != nil {
return nil, fmt.Errorf("kokoro: marshal stream request: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost,
c.baseURL+"/v1/audio/speech", bytes.NewReader(reqBody))
if err != nil {
return nil, fmt.Errorf("kokoro: build stream request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := c.http.Do(req)
if err != nil {
return nil, fmt.Errorf("kokoro: stream request: %w", err)
}
if resp.StatusCode != http.StatusOK {
_, _ = io.Copy(io.Discard, resp.Body)
resp.Body.Close()
return nil, fmt.Errorf("kokoro: stream returned %d", resp.StatusCode)
}
return resp.Body, nil
}
// ListVoices calls GET /v1/audio/voices and returns the list of voice IDs.
func (c *httpClient) ListVoices(ctx context.Context) ([]string, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet,

View File

@@ -9,6 +9,10 @@
// so callers receive MP3 bytes — the same format as the kokoro client — and the
// rest of the pipeline does not need to care which TTS engine was used.
//
// StreamAudioMP3 is the streaming variant: it returns an io.ReadCloser that
// yields MP3-encoded audio incrementally as pocket-tts generates it, without
// buffering the full output.
//
// Predefined voices (pass the bare name as the voice parameter):
//
// alba, marius, javert, jean, fantine, cosette, eponine, azelma,
@@ -50,6 +54,11 @@ type Client interface {
// Voice must be one of the predefined pocket-tts voice names.
GenerateAudio(ctx context.Context, text, voice string) ([]byte, error)
// StreamAudioMP3 synthesises text and returns an io.ReadCloser that streams
// MP3-encoded audio incrementally via a live ffmpeg transcode pipe.
// The caller must always close the returned ReadCloser.
StreamAudioMP3(ctx context.Context, text, voice string) (io.ReadCloser, error)
// ListVoices returns the available predefined voice names.
ListVoices(ctx context.Context) ([]string, error)
}
@@ -79,14 +88,97 @@ func (c *httpClient) GenerateAudio(ctx context.Context, text, voice string) ([]b
voice = "alba"
}
// ── Build multipart form ──────────────────────────────────────────────────
resp, err := c.postTTS(ctx, text, voice)
if err != nil {
return nil, err
}
defer resp.Body.Close()
wavData, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("pockettts: read response body: %w", err)
}
// ── Transcode WAV → MP3 via ffmpeg ────────────────────────────────────────
mp3Data, err := wavToMP3(ctx, wavData)
if err != nil {
return nil, fmt.Errorf("pockettts: transcode to mp3: %w", err)
}
return mp3Data, nil
}
// StreamAudioMP3 posts to POST /tts and returns an io.ReadCloser that delivers
// MP3 bytes as pocket-tts generates WAV frames. ffmpeg runs as a subprocess
// with stdin connected to the live WAV stream and stdout piped to the caller.
// The caller must always close the returned ReadCloser.
func (c *httpClient) StreamAudioMP3(ctx context.Context, text, voice string) (io.ReadCloser, error) {
if text == "" {
return nil, fmt.Errorf("pockettts: empty text")
}
if voice == "" {
voice = "alba"
}
resp, err := c.postTTS(ctx, text, voice)
if err != nil {
return nil, err
}
// Start ffmpeg: read WAV from stdin (the live HTTP body), write MP3 to stdout.
cmd := exec.CommandContext(ctx,
"ffmpeg",
"-hide_banner", "-loglevel", "error",
"-i", "pipe:0", // WAV from stdin
"-f", "mp3", // output format
"-q:a", "2", // VBR ~190 kbps
"pipe:1", // MP3 to stdout
)
cmd.Stdin = resp.Body
pr, pw := io.Pipe()
cmd.Stdout = pw
var stderrBuf bytes.Buffer
cmd.Stderr = &stderrBuf
if err := cmd.Start(); err != nil {
resp.Body.Close()
return nil, fmt.Errorf("pockettts: start ffmpeg: %w", err)
}
// Close the write end of the pipe when ffmpeg exits, propagating any error.
go func() {
waitErr := cmd.Wait()
resp.Body.Close()
if waitErr != nil {
pw.CloseWithError(fmt.Errorf("ffmpeg: %w (stderr: %s)", waitErr, stderrBuf.String()))
} else {
pw.Close()
}
}()
return pr, nil
}
// ListVoices returns the statically known predefined voice names.
// pocket-tts has no REST endpoint for listing voices.
func (c *httpClient) ListVoices(_ context.Context) ([]string, error) {
voices := make([]string, 0, len(PredefinedVoices))
for v := range PredefinedVoices {
voices = append(voices, v)
}
return voices, nil
}
// postTTS sends a multipart POST /tts request and returns the raw response.
// The caller is responsible for closing resp.Body.
func (c *httpClient) postTTS(ctx context.Context, text, voice string) (*http.Response, error) {
var body bytes.Buffer
mw := multipart.NewWriter(&body)
if err := mw.WriteField("text", text); err != nil {
return nil, fmt.Errorf("pockettts: write text field: %w", err)
}
// pocket-tts accepts a predefined voice name as voice_url.
if err := mw.WriteField("voice_url", voice); err != nil {
return nil, fmt.Errorf("pockettts: write voice_url field: %w", err)
}
@@ -105,34 +197,12 @@ func (c *httpClient) GenerateAudio(ctx context.Context, text, voice string) ([]b
if err != nil {
return nil, fmt.Errorf("pockettts: request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
_, _ = io.Copy(io.Discard, resp.Body)
resp.Body.Close()
return nil, fmt.Errorf("pockettts: server returned %d", resp.StatusCode)
}
wavData, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("pockettts: read response body: %w", err)
}
// ── Transcode WAV → MP3 via ffmpeg ────────────────────────────────────────
mp3Data, err := wavToMP3(ctx, wavData)
if err != nil {
return nil, fmt.Errorf("pockettts: transcode to mp3: %w", err)
}
return mp3Data, nil
}
// ListVoices returns the statically known predefined voice names.
// pocket-tts has no REST endpoint for listing voices.
func (c *httpClient) ListVoices(_ context.Context) ([]string, error) {
voices := make([]string, 0, len(PredefinedVoices))
for v := range PredefinedVoices {
voices = append(voices, v)
}
return voices, nil
return resp, nil
}
// wavToMP3 converts raw WAV bytes to MP3 using ffmpeg.

View File

@@ -1,8 +1,10 @@
package runner_test
import (
"bytes"
"context"
"errors"
"io"
"sync/atomic"
"testing"
"time"
@@ -129,6 +131,10 @@ func (s *stubAudioStore) PutAudio(_ context.Context, _ string, _ []byte) error {
s.putCalled.Add(1)
return s.putErr
}
func (s *stubAudioStore) PutAudioStream(_ context.Context, _ string, _ io.Reader, _ int64, _ string) error {
s.putCalled.Add(1)
return s.putErr
}
// stubNovelScraper satisfies scraper.NovelScraper minimally.
type stubNovelScraper struct {
@@ -185,6 +191,14 @@ func (s *stubKokoro) GenerateAudio(_ context.Context, _, _ string) ([]byte, erro
return s.data, s.genErr
}
func (s *stubKokoro) StreamAudioMP3(_ context.Context, _, _ string) (io.ReadCloser, error) {
s.called.Add(1)
if s.genErr != nil {
return nil, s.genErr
}
return io.NopCloser(bytes.NewReader(s.data)), nil
}
func (s *stubKokoro) ListVoices(_ context.Context) ([]string, error) {
return []string{"af_bella"}, nil
}

View File

@@ -155,6 +155,14 @@ func (m *minioClient) putObject(ctx context.Context, bucket, key, contentType st
return err
}
// putObjectStream uploads from r with known size (or -1 for multipart).
func (m *minioClient) putObjectStream(ctx context.Context, bucket, key, contentType string, r io.Reader, size int64) error {
_, err := m.client.PutObject(ctx, bucket, key, r, size,
minio.PutObjectOptions{ContentType: contentType},
)
return err
}
func (m *minioClient) getObject(ctx context.Context, bucket, key string) ([]byte, error) {
obj, err := m.client.GetObject(ctx, bucket, key, minio.GetObjectOptions{})
if err != nil {

View File

@@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"strings"
"time"
@@ -383,6 +384,10 @@ func (s *Store) PutAudio(ctx context.Context, key string, data []byte) error {
return s.mc.putObject(ctx, s.mc.bucketAudio, key, "audio/mpeg", data)
}
func (s *Store) PutAudioStream(ctx context.Context, key string, r io.Reader, size int64, contentType string) error {
return s.mc.putObjectStream(ctx, s.mc.bucketAudio, key, contentType, r, size)
}
// ── PresignStore ──────────────────────────────────────────────────────────────
func (s *Store) PresignChapter(ctx context.Context, slug string, n int, expires time.Duration) (string, error) {

View File

@@ -7,7 +7,7 @@
"dev": "vite dev",
"build": "vite build",
"preview": "vite preview",
"prepare": "paraglide-js compile --project ./project.inlang --outdir ./src/lib/paraglide && svelte-kit sync || echo ''",
"prepare": "paraglide-js compile --project ./project.inlang --outdir ./src/lib/paraglide && node -e \"const fs=require('fs'),f='./src/lib/paraglide/messages.js',c=fs.readFileSync(f,'utf8').split('\\n').filter(l=>!l.includes('export * as m')&&!l.includes('enabling auto-import')).join('\\n');fs.writeFileSync(f,c)\" && svelte-kit sync || echo ''",
"check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json",
"check:watch": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json --watch"
},