Compare commits

...

1 Commits

Author SHA1 Message Date
root
e0dec05885 fix: chunk large chapter text for Kokoro TTS to prevent EOF on big inputs
All checks were successful
Release / Test backend (push) Successful in 49s
Release / Check ui (push) Successful in 1m48s
Release / Docker (push) Successful in 5m58s
Release / Gitea Release (push) Successful in 34s
Split chapter text into ~1000-char sentence-boundary chunks before sending
to kokoro-fastapi. Each chunk is generated individually and the raw MP3 bytes
are concatenated. This prevents the EOF / timeout failures that occur when
the server receives a very large single request (e.g. a full PDF 'Full Text'
chapter). chunkText() breaks at sentence endings (. ! ? newlines) to preserve
natural speech flow.
2026-04-10 09:24:37 +05:00
2 changed files with 76 additions and 1 deletions

View File

@@ -19,3 +19,53 @@ func stripMarkdown(src string) string {
src = regexp.MustCompile(`\n{3,}`).ReplaceAllString(src, "\n\n")
return strings.TrimSpace(src)
}
// chunkText splits text into chunks of roughly maxChars bytes, preferring to
// break right after a sentence boundary (". ", "! ", "? " or a newline) so
// that the TTS service receives natural prose fragments rather than
// mid-sentence cuts.
//
// Note that maxChars is compared against len(), i.e. it is a byte budget, not
// a rune count.
//
// Behaviour:
//   - Text that already fits in maxChars is returned unchanged as one chunk.
//   - A non-positive maxChars disables chunking (the text is returned as one
//     chunk) instead of looping forever or panicking on a negative slice bound.
//   - Chunks produced by the splitting loop are whitespace-trimmed; chunks that
//     are empty after trimming are dropped.
//   - When no boundary exists inside the window, the text is hard-broken at
//     maxChars. The break is moved back to the start of the UTF-8 sequence it
//     would otherwise split, so a multi-byte character is never cut in half.
//     If maxChars is smaller than a single character, that character is kept
//     whole and the chunk exceeds maxChars by a few bytes.
//
// Apart from surrounding whitespace, no input is ever discarded.
func chunkText(text string, maxChars int) []string {
	if maxChars <= 0 || len(text) <= maxChars {
		return []string{text}
	}

	// We split AFTER these sequences. The loop below keeps the furthest cut
	// point over all delimiters, so their order is irrelevant. A bare "\n"
	// already yields the same cut as ".\n", "!\n", "?\n" or "\n\n" would, so
	// those longer forms do not need to be listed.
	delimiters := []string{". ", "! ", "? ", "\n"}

	var chunks []string
	remaining := text
	for len(remaining) > 0 {
		if len(remaining) <= maxChars {
			chunks = append(chunks, strings.TrimSpace(remaining))
			break
		}

		// Find the last sentence boundary within the maxChars window. Every
		// delimiter is ASCII, so a window that ends mid-character cannot
		// produce a false match.
		window := remaining[:maxChars]
		cutAt := -1
		for _, delim := range delimiters {
			idx := strings.LastIndex(window, delim)
			// idx == 0 is ignored: cutting there would yield an empty chunk.
			if idx > 0 && idx+len(delim) > cutAt {
				cutAt = idx + len(delim)
			}
		}

		if cutAt <= 0 {
			// No boundary found: hard-break near maxChars. Bytes of the form
			// 10xxxxxx are UTF-8 continuation bytes; step back past them so
			// the cut lands on the first byte of a character.
			// remaining[cutAt] is always in range here because
			// len(remaining) > maxChars.
			cutAt = maxChars
			for cutAt > 0 && remaining[cutAt]&0xC0 == 0x80 {
				cutAt--
			}
			if cutAt == 0 {
				// The first character alone is wider than maxChars. Take it
				// whole so the loop always makes progress.
				cutAt = maxChars
				for cutAt < len(remaining) && remaining[cutAt]&0xC0 == 0x80 {
					cutAt++
				}
			}
		}

		if chunk := strings.TrimSpace(remaining[:cutAt]); chunk != "" {
			chunks = append(chunks, chunk)
		}
		remaining = strings.TrimSpace(remaining[cutAt:])
	}
	return chunks
}

View File

@@ -656,7 +656,7 @@ func (r *Runner) runAudioTask(ctx context.Context, task domain.AudioTask) {
return
}
var genErr error
audioData, genErr = r.deps.Kokoro.GenerateAudio(ctx, text, task.Voice)
audioData, genErr = kokoroGenerateChunked(ctx, r.deps.Kokoro, text, task.Voice, log)
if genErr != nil {
fail(fmt.Sprintf("kokoro generate: %v", genErr))
return
@@ -685,6 +685,31 @@ func (r *Runner) runAudioTask(ctx context.Context, task domain.AudioTask) {
log.Info("runner: audio task finished", "key", key)
}
// kokoroGenerateChunked synthesizes text piece by piece instead of in a single
// request. The text is split with chunkText using a 1000-byte limit, each
// piece is passed to k.GenerateAudio in order with the given voice, and the
// returned bytes are appended into one buffer.
//
// The first failing piece aborts the whole operation: the result is nil and
// the error is wrapped with the 1-based piece index and the piece count. If
// chunkText yields no pieces, the result is a nil slice with a nil error.
//
// Progress is reported through log: one entry before generation starts and
// one after every finished piece.
//
// NOTE(review): plain byte concatenation assumes the service returns a
// frame-based stream (MP3) that players accept when segments are joined
// back to back — confirm against the Kokoro output format.
func kokoroGenerateChunked(ctx context.Context, k kokoro.Client, text, voice string, log *slog.Logger) ([]byte, error) {
	const chunkSize = 1000

	parts := chunkText(text, chunkSize)
	total := len(parts)
	log.Info("runner: kokoro chunked generation", "chunks", total, "total_chars", len(text))

	var audio []byte
	for idx := 0; idx < total; idx++ {
		ordinal := idx + 1 // human-friendly 1-based position for logs and errors

		segment, err := k.GenerateAudio(ctx, parts[idx], voice)
		if err != nil {
			return nil, fmt.Errorf("chunk %d/%d: %w", ordinal, total, err)
		}

		audio = append(audio, segment...)
		log.Info("runner: kokoro chunk done", "chunk", ordinal, "of", total, "bytes", len(segment))
	}
	return audio, nil
}
// runImportTask executes one PDF/EPUB import task.
// Preferred path: when task.ChaptersKey is set, it reads pre-parsed chapters
// JSON from MinIO (written by the backend at upload time) and ingests them.