fix: chunk large chapter text for Kokoro TTS to prevent EOF on big inputs

Split chapter text into ~1000-char sentence-boundary chunks before sending to kokoro-fastapi. Each chunk is generated individually and the raw MP3 bytes are concatenated. This prevents the EOF / timeout failures that occur when the server receives a very large single request (e.g. a full PDF 'Full Text' chapter). chunkText() breaks at sentence endings (. ! ? newlines) to preserve natural speech flow.
2026-04-10 09:24:37 +05:00
2 changed files with 76 additions and 1 deletions
--- a/backend/internal/runner/helpers.go
+++ b/backend/internal/runner/helpers.go
@@ -19,3 +19,53 @@ func stripMarkdown(src string) string {
 	src = regexp.MustCompile(`\n{3,}`).ReplaceAllString(src, "\n\n")
 	return strings.TrimSpace(src)
 }
+
+// chunkText splits text into chunks of at most maxChars bytes (len-based, not
+// runes), cutting after the last sentence boundary (". ", "! ", "? ", "\n")
+// that fits, so the TTS service receives natural prose fragments.
+//
+// A sentence longer than maxChars is hard-split (never dropped) on a UTF-8 rune
+// boundary. Text that fits, or maxChars <= 0, is returned as-is in one chunk.
+func chunkText(text string, maxChars int) []string {
+	if maxChars <= 0 || len(text) <= maxChars {
+		return []string{text}
+	}
+
+	// Sentence-boundary delimiters — we split AFTER these sequences.
+	// Order is irrelevant: the latest cut position inside the window wins.
+	delimiters := []string{".\n", "!\n", "?\n", ". ", "! ", "? ", "\n\n", "\n"}
+	var chunks []string
+	remaining := text
+
+	for len(remaining) > 0 {
+		if len(remaining) <= maxChars {
+			chunks = append(chunks, strings.TrimSpace(remaining))
+			break
+		}
+
+		// Find the last sentence boundary within the maxChars window.
+		window := remaining[:maxChars]
+		cutAt := -1
+		for _, delim := range delimiters {
+			idx := strings.LastIndex(window, delim)
+			if idx > 0 && idx+len(delim) > cutAt {
+				cutAt = idx + len(delim)
+			}
+		}
+
+		if cutAt <= 0 {
+			// No boundary found — hard-break at maxChars, backing off any UTF-8
+			// continuation bytes; cutAt stays >= 1 so the loop always advances.
+			for cutAt = maxChars; cutAt > 1 && remaining[cutAt]&0xC0 == 0x80; cutAt-- {
+			}
+		}
+
+		chunk := strings.TrimSpace(remaining[:cutAt])
+		if chunk != "" {
+			chunks = append(chunks, chunk)
+		}
+		remaining = strings.TrimSpace(remaining[cutAt:])
+	}
+	return chunks
+}
+
--- a/backend/internal/runner/runner.go
+++ b/backend/internal/runner/runner.go
@@ -656,7 +656,7 @@ func (r *Runner) runAudioTask(ctx context.Context, task domain.AudioTask) {
 			return
 		}
 		var genErr error
-		audioData, genErr = r.deps.Kokoro.GenerateAudio(ctx, text, task.Voice)
+		audioData, genErr = kokoroGenerateChunked(ctx, r.deps.Kokoro, text, task.Voice, log)
 		if genErr != nil {
 			fail(fmt.Sprintf("kokoro generate: %v", genErr))
 			return
@@ -685,6 +685,31 @@ func (r *Runner) runAudioTask(ctx context.Context, task domain.AudioTask) {
 	log.Info("runner: audio task finished", "key", key)
 }

+// kokoroGenerateChunked cuts text with chunkText(text, chunkSize), sends each
+// chunk to k.GenerateAudio with the given voice, and appends the returned
+// bytes in order. Smaller requests are meant to avoid the EOF / timeout
+// failures seen when the Kokoro FastAPI server receives very large inputs.
+// The first failing chunk aborts the run; its error is wrapped with "i/n".
+// NOTE(review): assumes concatenated MP3 segments play back cleanly — confirm.
+func kokoroGenerateChunked(ctx context.Context, k kokoro.Client, text, voice string, log *slog.Logger) ([]byte, error) {
+	const chunkSize = 1000
+
+	parts := chunkText(text, chunkSize)
+	total := len(parts)
+	log.Info("runner: kokoro chunked generation", "chunks", total, "total_chars", len(text))
+
+	var audio []byte
+	for idx := 0; idx < total; idx++ {
+		segment, err := k.GenerateAudio(ctx, parts[idx], voice)
+		if err != nil {
+			return nil, fmt.Errorf("chunk %d/%d: %w", idx+1, total, err)
+		}
+		audio = append(audio, segment...)
+		log.Info("runner: kokoro chunk done", "chunk", idx+1, "of", total, "bytes", len(segment))
+	}
+	return audio, nil
+}
+
 // runImportTask executes one PDF/EPUB import task.
 // Preferred path: when task.ChaptersKey is set, it reads pre-parsed chapters
 // JSON from MinIO (written by the backend at upload time) and ingests them.