Compare commits

...

1 Commits

Author SHA1 Message Date
Admin
6559a8c015 fix: split long text into chunks before sending to Cloudflare AI TTS
Some checks failed
Release / Test backend (push) Successful in 41s
Release / Check ui (push) Successful in 1m5s
Release / Docker / caddy (push) Successful in 37s
Release / Docker / backend (push) Has been cancelled
Release / Docker / runner (push) Has been cancelled
Release / Docker / ui (push) Has been cancelled
Release / Gitea Release (push) Has been cancelled
The aura-2-en model enforces a hard 2 000-character limit per request.
Chapters routinely exceed this, producing 413 errors.

GenerateAudio now splits the stripped text into ≤1 800-char chunks at
paragraph → sentence → space → hard-cut boundaries, calls the API once
per chunk, and concatenates the MP3 frames. Callers (runner, streaming
handler) are unchanged. StreamAudioMP3/WAV inherit the fix automatically
since they delegate to GenerateAudio.
2026-04-04 20:45:22 +05:00

View File

@@ -17,6 +17,10 @@
// response. There is no 100-second Cloudflare proxy timeout because we are
// calling the Cloudflare API directly, not routing through a Cloudflare-proxied
// homelab tunnel.
//
// The aura-2-en model enforces a hard 2 000-character limit per request.
// GenerateAudio transparently splits longer texts into sentence-boundary chunks
// and concatenates the resulting MP3 frames.
package cfai
import (
@@ -145,6 +149,8 @@ func New(accountID, apiToken, model string) Client {
}
// GenerateAudio calls the Cloudflare Workers AI TTS endpoint and returns MP3 bytes.
// The aura-2-en model rejects inputs longer than 2 000 characters, so this method
// splits the text into sentence-bounded chunks and concatenates the MP3 responses.
func (c *httpClient) GenerateAudio(ctx context.Context, text, voice string) ([]byte, error) {
if text == "" {
return nil, fmt.Errorf("cfai: empty text")
@@ -154,6 +160,20 @@ func (c *httpClient) GenerateAudio(ctx context.Context, text, voice string) ([]b
speaker = "luna"
}
chunks := splitText(text, 1800) // stay comfortably under the 2 000-char limit
var combined []byte
for _, chunk := range chunks {
part, err := c.generateChunk(ctx, chunk, speaker)
if err != nil {
return nil, err
}
combined = append(combined, part...)
}
return combined, nil
}
// generateChunk sends a single ≤2 000-character request and returns MP3 bytes.
func (c *httpClient) generateChunk(ctx context.Context, text, speaker string) ([]byte, error) {
body, err := json.Marshal(map[string]any{
"text": text,
"speaker": speaker,
@@ -189,6 +209,87 @@ func (c *httpClient) GenerateAudio(ctx context.Context, text, voice string) ([]b
return mp3, nil
}
// splitText splits src into chunks of at most maxChars characters each.
// It tries to break at paragraph boundaries first, then at sentence-ending
// punctuation (. ! ?), and falls back to the nearest space.
func splitText(src string, maxChars int) []string {
if len(src) <= maxChars {
return []string{src}
}
var chunks []string
remaining := src
for len(remaining) > 0 {
if len(remaining) <= maxChars {
chunks = append(chunks, strings.TrimSpace(remaining))
break
}
// Search window: the first maxChars bytes of remaining.
// Use byte length here because the API limit is in bytes/chars for ASCII;
// for safety we operate on rune-aware slices.
window := remaining
if len(window) > maxChars {
// Trim to maxChars runes (not bytes), ensuring we don't split a multi-byte char.
window = runeSlice(remaining, maxChars)
}
cut := -1
// 1. Prefer paragraph break (\n\n or \n).
if i := strings.LastIndex(window, "\n\n"); i > 0 {
cut = i + 2
} else if i := strings.LastIndex(window, "\n"); i > 0 {
cut = i + 1
}
// 2. Fall back to sentence-ending punctuation followed by a space.
if cut < 0 {
for _, punct := range []string{". ", "! ", "? ", ".\n", "!\n", "?\n"} {
if i := strings.LastIndex(window, punct); i > 0 {
candidate := i + len(punct)
if cut < 0 || candidate > cut {
cut = candidate
}
}
}
}
// 3. Last resort: nearest space.
if cut < 0 {
if i := strings.LastIndex(window, " "); i > 0 {
cut = i + 1
}
}
// 4. Hard cut at maxChars runes if no boundary found.
if cut < 0 {
cut = len(window)
}
chunk := strings.TrimSpace(remaining[:cut])
if chunk != "" {
chunks = append(chunks, chunk)
}
remaining = remaining[cut:]
}
return chunks
}
// runeSlice returns the first n runes of s as a string.
func runeSlice(s string, n int) string {
count := 0
for i := range s {
if count == n {
return s[:i]
}
count++
}
return s
}
// StreamAudioMP3 generates audio and wraps the MP3 bytes as an io.ReadCloser.
func (c *httpClient) StreamAudioMP3(ctx context.Context, text, voice string) (io.ReadCloser, error) {
mp3, err := c.GenerateAudio(ctx, text, voice)