Compare commits
3 Commits

| Author | SHA1 | Date |
|---|---|---|
| | e0dec05885 | |
| | 8662aed565 | |
| | cdfa1ac5b2 | |
backend/internal/backend/handlers_split.go (new file, 141 lines)
@@ -0,0 +1,141 @@
package backend

import (
	"encoding/json"
	"fmt"
	"net/http"
	"strings"

	"github.com/libnovel/backend/internal/bookstore"
	"github.com/libnovel/backend/internal/domain"
)

// handleAdminSplitChapters handles POST /api/admin/books/{slug}/split-chapters.
//
// Request body (JSON):
//
//	{ "text": "<full text with --- dividers and optional ## Title lines>" }
//
// The text is split on lines containing only "---". Each segment may start with
// a "## Title" line which becomes the chapter title; remaining lines are the
// chapter content. Sequential chapter numbers 1..N are assigned.
//
// Chapters 1..N are written via WriteChapter (an upsert by number), so
// pre-existing chapters beyond N are not deleted — use the dedup endpoint
// afterwards if needed.
func (s *Server) handleAdminSplitChapters(w http.ResponseWriter, r *http.Request) {
	if s.deps.BookWriter == nil {
		jsonError(w, http.StatusServiceUnavailable, "book writer not configured")
		return
	}

	slug := r.PathValue("slug")
	if slug == "" {
		jsonError(w, http.StatusBadRequest, "slug is required")
		return
	}

	var req struct {
		Text string `json:"text"`
	}
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		jsonError(w, http.StatusBadRequest, "parse body: "+err.Error())
		return
	}
	if strings.TrimSpace(req.Text) == "" {
		jsonError(w, http.StatusBadRequest, "text is required")
		return
	}

	chapters := splitChapterText(req.Text)
	if len(chapters) == 0 {
		jsonError(w, http.StatusUnprocessableEntity, "no chapters produced from text")
		return
	}

	for _, ch := range chapters {
		var mdContent string
		if ch.Title != "" && ch.Title != fmt.Sprintf("Chapter %d", ch.Number) {
			mdContent = fmt.Sprintf("# %s\n\n%s", ch.Title, ch.Content)
		} else {
			mdContent = fmt.Sprintf("# Chapter %d\n\n%s", ch.Number, ch.Content)
		}
		domainCh := domain.Chapter{
			Ref:  domain.ChapterRef{Number: ch.Number, Title: ch.Title},
			Text: mdContent,
		}
		if err := s.deps.BookWriter.WriteChapter(r.Context(), slug, domainCh); err != nil {
			jsonError(w, http.StatusInternalServerError, fmt.Sprintf("write chapter %d: %s", ch.Number, err.Error()))
			return
		}
	}

	writeJSON(w, 0, map[string]any{
		"chapters": len(chapters),
		"slug":     slug,
	})
}

// splitChapterText splits text on "---" divider lines into bookstore.Chapter
// slices. Each segment may optionally start with a "## Title" header line.
func splitChapterText(text string) []bookstore.Chapter {
	lines := strings.Split(text, "\n")

	// Collect raw segments split on "---" dividers.
	var segments [][]string
	cur := []string{}
	for _, line := range lines {
		if strings.TrimSpace(line) == "---" {
			segments = append(segments, cur)
			cur = []string{}
		} else {
			cur = append(cur, line)
		}
	}
	segments = append(segments, cur) // last segment

	var chapters []bookstore.Chapter
	chNum := 0
	for _, seg := range segments {
		// Trim leading/trailing blank lines from the segment.
		start, end := 0, len(seg)
		for start < end && strings.TrimSpace(seg[start]) == "" {
			start++
		}
		for end > start && strings.TrimSpace(seg[end-1]) == "" {
			end--
		}
		seg = seg[start:end]
		if len(seg) == 0 {
			continue
		}

		// Check for a "## Title" header on the first line.
		title := ""
		contentStart := 0
		if strings.HasPrefix(strings.TrimSpace(seg[0]), "## ") {
			title = strings.TrimSpace(strings.TrimPrefix(strings.TrimSpace(seg[0]), "## "))
			contentStart = 1
			// Skip blank lines after the title.
			for contentStart < len(seg) && strings.TrimSpace(seg[contentStart]) == "" {
				contentStart++
			}
		}

		content := strings.TrimSpace(strings.Join(seg[contentStart:], "\n"))
		if content == "" {
			continue
		}

		chNum++
		if title == "" {
			title = fmt.Sprintf("Chapter %d", chNum)
		}
		chapters = append(chapters, bookstore.Chapter{
			Number:  chNum,
			Title:   title,
			Content: content,
		})
	}
	return chapters
}
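For reference, a minimal client-side sketch of calling this endpoint (host, slug, and the absence of auth are placeholders — the deployment presumably fronts this route with admin authentication):

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

func main() {
	// Two segments divided by a "---" line; the first carries an explicit
	// "## Title" header, the second falls back to "Chapter 2".
	payload := `{"text": "## Prologue\nIt began at night.\n---\nSecond chapter text."}`
	resp, err := http.Post(
		"http://localhost:8080/api/admin/books/my-book/split-chapters", // placeholder host and slug
		"application/json",
		strings.NewReader(payload),
	)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status, string(body)) // expect: {"chapters":2,"slug":"my-book"}
}
```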
@@ -247,6 +247,9 @@ func (s *Server) ListenAndServe(ctx context.Context) error {
	// Admin data repair endpoints
	mux.HandleFunc("POST /api/admin/dedup-chapters/{slug}", s.handleDedupChapters)

+	// Admin chapter split (imported books)
+	mux.HandleFunc("POST /api/admin/books/{slug}/split-chapters", s.handleAdminSplitChapters)
+
	// Import (PDF/EPUB)
	mux.HandleFunc("POST /api/admin/import", s.handleAdminImport)
	mux.HandleFunc("GET /api/admin/import", s.handleAdminImportList)
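These route strings rely on Go 1.22's method- and wildcard-aware http.ServeMux; the `{slug}` segment is recovered in the handler with r.PathValue, exactly as handleAdminSplitChapters does above. A self-contained sketch of the mechanism:

```go
package main

import (
	"fmt"
	"net/http"
)

func main() {
	mux := http.NewServeMux()
	// Method prefix and {slug} wildcard, as in the route registered above.
	mux.HandleFunc("POST /api/admin/books/{slug}/split-chapters",
		func(w http.ResponseWriter, r *http.Request) {
			fmt.Fprintf(w, "slug=%s\n", r.PathValue("slug"))
		})
	// Placeholder listen address for the sketch.
	if err := http.ListenAndServe("127.0.0.1:8080", mux); err != nil {
		panic(err)
	}
}
```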
@@ -19,3 +19,53 @@ func stripMarkdown(src string) string {
	src = regexp.MustCompile(`\n{3,}`).ReplaceAllString(src, "\n\n")
	return strings.TrimSpace(src)
}

+// chunkText splits text into chunks of at most maxChars characters, breaking
+// at sentence boundaries (". ", "! ", "? ", "\n") so that the TTS service
+// receives natural prose fragments rather than mid-sentence cuts.
+//
+// If no sentence boundary falls within the window, the text is hard-broken at
+// maxChars — nothing is ever silently dropped.
+func chunkText(text string, maxChars int) []string {
+	if len(text) <= maxChars {
+		return []string{text}
+	}
+
+	// Sentence-boundary delimiters — we split AFTER these sequences.
+	// Order matters: longer sequences first.
+	delimiters := []string{".\n", "!\n", "?\n", ". ", "! ", "? ", "\n\n", "\n"}
+
+	var chunks []string
+	remaining := text
+
+	for len(remaining) > 0 {
+		if len(remaining) <= maxChars {
+			chunks = append(chunks, strings.TrimSpace(remaining))
+			break
+		}
+
+		// Find the last sentence boundary within the maxChars window.
+		window := remaining[:maxChars]
+		cutAt := -1
+		for _, delim := range delimiters {
+			idx := strings.LastIndex(window, delim)
+			if idx > 0 && idx+len(delim) > cutAt {
+				cutAt = idx + len(delim)
+			}
+		}
+
+		if cutAt <= 0 {
+			// No boundary found — hard-break at maxChars to avoid an infinite loop.
+			cutAt = maxChars
+		}
+
+		chunk := strings.TrimSpace(remaining[:cutAt])
+		if chunk != "" {
+			chunks = append(chunks, chunk)
+		}
+		remaining = strings.TrimSpace(remaining[cutAt:])
+	}
+
+	return chunks
+}
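A quick test sketch of the boundary behavior (hypothetical test file; the package clause must match wherever chunkText actually lives):

```go
package backend // hypothetical — match the package that defines chunkText

import (
	"reflect"
	"testing"
)

func TestChunkTextBreaksAtSentenceBoundaries(t *testing.T) {
	// Each cut lands just after a ". ", "! ", or "?\n" boundary inside the
	// 20-character window, then trailing whitespace is trimmed.
	got := chunkText("First sentence. Second sentence! Third?\nFourth paragraph.", 20)
	want := []string{"First sentence.", "Second sentence!", "Third?", "Fourth paragraph."}
	if !reflect.DeepEqual(got, want) {
		t.Fatalf("got %q, want %q", got, want)
	}

	// A window with no delimiter at all is hard-broken at maxChars.
	got = chunkText("abcdefghijklmnopqrstuvwxyz", 10)
	want = []string{"abcdefghij", "klmnopqrst", "uvwxyz"}
	if !reflect.DeepEqual(got, want) {
		t.Fatalf("got %q, want %q", got, want)
	}
}
```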
@@ -656,7 +656,7 @@ func (r *Runner) runAudioTask(ctx context.Context, task domain.AudioTask) {
		return
	}
	var genErr error
-	audioData, genErr = r.deps.Kokoro.GenerateAudio(ctx, text, task.Voice)
+	audioData, genErr = kokoroGenerateChunked(ctx, r.deps.Kokoro, text, task.Voice, log)
	if genErr != nil {
		fail(fmt.Sprintf("kokoro generate: %v", genErr))
		return

@@ -685,6 +685,31 @@ func (r *Runner) runAudioTask(ctx context.Context, task domain.AudioTask) {
	log.Info("runner: audio task finished", "key", key)
}

+// kokoroGenerateChunked splits text into ~1000-character sentence-boundary
+// chunks, calls Kokoro.GenerateAudio for each, and concatenates the raw MP3
+// bytes. This avoids EOF / timeout failures that occur when the Kokoro
+// FastAPI server receives very large inputs (e.g. a full imported PDF chapter).
+//
+// Concatenating raw MP3 frames is valid — MP3 is a frame-based format and
+// standard players handle multi-segment files correctly.
+func kokoroGenerateChunked(ctx context.Context, k kokoro.Client, text, voice string, log *slog.Logger) ([]byte, error) {
+	const chunkSize = 1000
+
+	chunks := chunkText(text, chunkSize)
+	log.Info("runner: kokoro chunked generation", "chunks", len(chunks), "total_chars", len(text))
+
+	var combined []byte
+	for i, chunk := range chunks {
+		data, err := k.GenerateAudio(ctx, chunk, voice)
+		if err != nil {
+			return nil, fmt.Errorf("chunk %d/%d: %w", i+1, len(chunks), err)
+		}
+		combined = append(combined, data...)
+		log.Info("runner: kokoro chunk done", "chunk", i+1, "of", len(chunks), "bytes", len(data))
+	}
+	return combined, nil
+}

// runImportTask executes one PDF/EPUB import task.
// Preferred path: when task.ChaptersKey is set, it reads pre-parsed chapters
// JSON from MinIO (written by the backend at upload time) and ingests them.
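The chunk-and-append flow can be sanity-checked without the real service. A standalone sketch with a stubbed client (the interface below is a hypothetical stand-in; the real kokoro.Client signature is assumed from the call above):

```go
package main

import (
	"context"
	"fmt"
)

// audioClient is a hypothetical stand-in for kokoro.Client.
type audioClient interface {
	GenerateAudio(ctx context.Context, text, voice string) ([]byte, error)
}

// stub returns fake per-chunk "MP3" payloads so the loop can run offline.
type stub struct{}

func (stub) GenerateAudio(_ context.Context, text, _ string) ([]byte, error) {
	return []byte(fmt.Sprintf("<%d-char segment>", len(text))), nil
}

func main() {
	var c audioClient = stub{}
	chunks := []string{"First chunk of prose.", "Second chunk of prose."}
	var combined []byte
	for i, ch := range chunks {
		data, err := c.GenerateAudio(context.Background(), ch, "voice-id") // placeholder voice
		if err != nil {
			fmt.Printf("chunk %d/%d failed: %v\n", i+1, len(chunks), err)
			return
		}
		combined = append(combined, data...) // raw MP3 streams append cleanly
	}
	fmt.Println(string(combined))
}
```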
@@ -7,7 +7,6 @@ import (
	"fmt"
	"io"
	"os"
	"regexp"
	"sort"
	"strconv"
	"strings"

@@ -16,16 +15,10 @@ import (
	"github.com/libnovel/backend/internal/domain"
	minio "github.com/minio/minio-go/v7"
	"github.com/pdfcpu/pdfcpu/pkg/api"
	pdfcpu "github.com/pdfcpu/pdfcpu/pkg/pdfcpu"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
	"golang.org/x/net/html"
)

-// chapterHeadingRE matches common chapter heading patterns:
-// "Chapter 1", "Chapter 1:", "Chapter 1 -", "CHAPTER ONE", "1.", "Part 1", etc.
-var chapterHeadingRE = regexp.MustCompile(
-	`(?i)^(?:chapter|ch\.?|part|episode|book)\s+(\d+|[ivxlcdm]+)\b|^\d{1,4}[\.\)]\s+\S`)

type importer struct {
	mc *minioClient
}
@@ -148,17 +141,16 @@ var pdfSkipBookmarks = map[string]bool{
	"appendix": true, "color insert": true, "color illustrations": true,
}

-// parsePDF extracts chapters from PDF bytes.
+// parsePDF extracts text from PDF bytes and returns it as a single chapter.
//
+// The full readable text is returned as one chapter so the admin can manually
+// split it into chapters via the UI using --- markers.
+//
// Strategy:
//  1. Decrypt owner-protected PDFs (empty user password).
-//  2. Read the PDF outline (bookmarks) — these give chapter titles and page ranges.
-//  3. Extract raw content streams for every page using pdfcpu ExtractContent.
-//  4. For each story bookmark, concatenate the extracted text of its pages.
-//
-// Falls back to paragraph-splitting when no bookmarks are found.
-// This is fast (~100ms for a 250-page PDF) because it avoids font-glyph
-// resolution which causes older PDF libraries to hang on publisher PDFs.
+//  2. Extract raw content streams for every page using pdfcpu ExtractContent.
+//  3. Concatenate text from all pages in order, skipping front matter
+//     (cover, title page, copyright — typically the first 10 pages).
func parsePDF(data []byte) ([]bookstore.Chapter, error) {
	// Decrypt owner-protected PDFs (empty user password).
	decrypted, err := decryptPDF(data)

@@ -186,103 +178,147 @@ func parsePDF(data []byte) ([]bookstore.Chapter, error) {
		return nil, fmt.Errorf("PDF has no content pages")
	}

-	// Sort entries by filename so index == page number - 1.
-	sort.Slice(entries, func(i, j int) bool { return entries[i].Name() < entries[j].Name() })
-
-	// Build page-index → extracted text map.
+	// Parse page number from filename and build ordered text map.
	pageTexts := make(map[int]string, len(entries))
-	for idx, e := range entries {
+	maxPage := 0
+	for _, e := range entries {
+		pageNum := pageNumFromFilename(e.Name())
+		if pageNum <= 0 {
+			continue
+		}
		raw, readErr := os.ReadFile(tmpDir + "/" + e.Name())
		if readErr != nil {
			continue
		}
-		pageTexts[idx+1] = extractTextFromContentStream(raw)
+		pageTexts[pageNum] = fixWin1252(extractTextFromContentStream(raw))
+		if pageNum > maxPage {
+			maxPage = pageNum
+		}
	}

-	// Try to use bookmarks (outline) for chapter structure.
+	// Determine front-matter cutoff using bookmarks if available,
+	// otherwise skip the first 10 pages (cover/title/copyright).
+	bodyStart := 1
	bookmarks, bmErr := api.Bookmarks(bytes.NewReader(data), conf)
-	if bmErr == nil && len(bookmarks) > 0 {
-		chapters := chaptersFromBookmarks(bookmarks, pageTexts)
-		if len(chapters) > 0 {
-			return chapters, nil
-		}
-	}
-
-	// Fallback: concatenate all page texts and split by heading patterns.
-	var sb strings.Builder
-	for p := 1; p <= len(entries); p++ {
-		sb.WriteString(pageTexts[p])
-		sb.WriteByte('\n')
-	}
-	chapters := extractChaptersFromText(sb.String())
-	if len(chapters) == 0 {
-		return nil, fmt.Errorf("could not extract any chapters from PDF")
-	}
-	return chapters, nil
-}
-
-// chaptersFromBookmarks builds a chapter list from PDF bookmarks + per-page text.
-// It flattens the bookmark tree, skips front/back matter entries, and assigns
-// page ranges so each chapter spans from its own start page to the next
-// bookmark's start page minus one.
-func chaptersFromBookmarks(bookmarks []pdfcpu.Bookmark, pageTexts map[int]string) []bookstore.Chapter {
-	// Flatten bookmark tree.
-	var flat []pdfcpu.Bookmark
-	var flatten func([]pdfcpu.Bookmark)
-	flatten = func(bms []pdfcpu.Bookmark) {
-		for _, bm := range bms {
-			flat = append(flat, bm)
-			flatten(bm.Kids)
-		}
-	}
-	flatten(bookmarks)
-
-	// Sort by page number.
-	sort.Slice(flat, func(i, j int) bool { return flat[i].PageFrom < flat[j].PageFrom })
-
-	// Assign PageThru for entries where it's 0 (last bookmark or missing).
-	maxPage := 0
-	for p := range pageTexts {
-		if p > maxPage {
-			maxPage = p
-		}
-	}
-	for i := range flat {
-		if flat[i].PageThru == 0 {
-			if i+1 < len(flat) {
-				flat[i].PageThru = flat[i+1].PageFrom - 1
-			} else {
-				flat[i].PageThru = maxPage
-			}
-		}
-	}
-
-	var chapters []bookstore.Chapter
-	chNum := 0
-	for _, bm := range flat {
-		if pdfSkipBookmarks[strings.ToLower(strings.TrimSpace(bm.Title))] {
-			continue
-		}
-		// Gather text for all pages in this bookmark's range.
-		var sb strings.Builder
-		for p := bm.PageFrom; p <= bm.PageThru; p++ {
-			if t, ok := pageTexts[p]; ok {
-				sb.WriteString(t)
-				sb.WriteByte('\n')
-			}
-		}
-		text := strings.TrimSpace(sb.String())
-		if len(text) < 50 {
-			continue // skip nearly-empty sections
-		}
-		chNum++
-		chapters = append(chapters, bookstore.Chapter{
-			Number:  chNum,
-			Title:   bm.Title,
-			Content: text,
-		})
-	}
-	return chapters
-}
+	if bmErr == nil {
+		for _, bm := range bookmarks {
+			title := strings.ToLower(strings.TrimSpace(bm.Title))
+			if !pdfSkipBookmarks[title] && bm.PageFrom > 0 {
+				// First non-front-matter bookmark — body starts here.
+				bodyStart = bm.PageFrom
+				break
+			}
+		}
+	} else if maxPage > 10 {
+		bodyStart = 11
+	}
+
+	// Concatenate all body pages.
+	var sb strings.Builder
+	for p := bodyStart; p <= maxPage; p++ {
+		t := strings.TrimSpace(pageTexts[p])
+		if t == "" {
+			continue
+		}
+		sb.WriteString(t)
+		sb.WriteString("\n\n")
+	}
+
+	text := strings.TrimSpace(sb.String())
+	if text == "" {
+		return nil, fmt.Errorf("could not extract any text from PDF")
+	}
+
+	return []bookstore.Chapter{{
+		Number:  1,
+		Title:   "Full Text",
+		Content: text,
+	}}, nil
+}
+
+// pageNumFromFilename extracts the page number from a pdfcpu content-stream
+// filename like "out_Content_page_42.txt". Returns 0 if not parseable.
+func pageNumFromFilename(name string) int {
+	// Strip directory prefix and extension.
+	base := name
+	if idx := strings.LastIndex(base, "/"); idx >= 0 {
+		base = base[idx+1:]
+	}
+	if idx := strings.LastIndex(base, "."); idx >= 0 {
+		base = base[:idx]
+	}
+	// Find the last "_" and parse the number after it.
+	if idx := strings.LastIndex(base, "_"); idx >= 0 {
+		n, err := strconv.Atoi(base[idx+1:])
+		if err == nil && n > 0 {
+			return n
+		}
+	}
+	return 0
+}
+
+// win1252ToUnicode maps the Windows-1252 control range 0x80–0x9F to the
+// Unicode characters they actually represent in that encoding.
+// Standard Latin-1 maps these bytes to control characters; Win-1252 maps
+// them to typographic symbols that appear in publisher PDFs.
+var win1252ToUnicode = map[byte]rune{
+	0x80: '\u20AC', // €
+	0x82: '\u201A', // ‚
+	0x83: '\u0192', // ƒ
+	0x84: '\u201E', // „
+	0x85: '\u2026', // …
+	0x86: '\u2020', // †
+	0x87: '\u2021', // ‡
+	0x88: '\u02C6', // ˆ
+	0x89: '\u2030', // ‰
+	0x8A: '\u0160', // Š
+	0x8B: '\u2039', // ‹
+	0x8C: '\u0152', // Œ
+	0x8E: '\u017D', // Ž
+	0x91: '\u2018', // ‘ (left single quotation mark)
+	0x92: '\u2019', // ’ (right single quotation mark / apostrophe)
+	0x93: '\u201C', // “ (left double quotation mark)
+	0x94: '\u201D', // ” (right double quotation mark)
+	0x95: '\u2022', // • (bullet)
+	0x96: '\u2013', // – (en dash)
+	0x97: '\u2014', // — (em dash)
+	0x98: '\u02DC', // ˜
+	0x99: '\u2122', // ™
+	0x9A: '\u0161', // š
+	0x9B: '\u203A', // ›
+	0x9C: '\u0153', // œ
+	0x9E: '\u017E', // ž
+	0x9F: '\u0178', // Ÿ
+}
+
+// fixWin1252 replaces Windows-1252-specific bytes (0x80–0x9F) in a string
+// that was decoded as raw Latin-1 bytes with their proper Unicode equivalents.
+func fixWin1252(s string) string {
+	// Fast path: if no bytes fall in the 0x80–0x9F range, return unchanged.
+	needsFix := false
+	for i := 0; i < len(s); i++ {
+		b := s[i]
+		if b >= 0x80 && b <= 0x9F {
+			needsFix = true
+			break
+		}
+	}
+	if !needsFix {
+		return s
+	}
+	var sb strings.Builder
+	sb.Grow(len(s))
+	for i := 0; i < len(s); i++ {
+		b := s[i]
+		if b >= 0x80 && b <= 0x9F {
+			if r, ok := win1252ToUnicode[b]; ok {
+				sb.WriteRune(r)
+				continue
+			}
+		}
+		sb.WriteByte(b)
+	}
+	return sb.String()
+}
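A standalone demonstration of the repair (the map is abbreviated to two entries here; the full table above covers the whole 0x80–0x9F range):

```go
package main

import (
	"fmt"
	"strings"
)

// Abbreviated mapping for the demo — see win1252ToUnicode above.
var demoMap = map[byte]rune{0x92: '\u2019', 0x97: '\u2014'}

func main() {
	// Win-1252 bytes for ’ and — left behind by a byte-for-byte decode.
	in := "It\x92s fixed \x97 really"
	var sb strings.Builder
	sb.Grow(len(in))
	for i := 0; i < len(in); i++ {
		if r, ok := demoMap[in[i]]; ok {
			sb.WriteRune(r)
			continue
		}
		sb.WriteByte(in[i])
	}
	fmt.Println(sb.String()) // It’s fixed — really
}
```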

// extractTextFromContentStream parses a raw PDF content stream and extracts

@@ -476,6 +512,7 @@ func parseEPUB(data []byte) ([]bookstore.Chapter, error) {
	}

	var chapters []bookstore.Chapter
+	chNum := 0
	for i, href := range spineFiles {
		fullPath := opfDir + href
		content, err := epubFileContent(zr, fullPath)

@@ -486,12 +523,14 @@ func parseEPUB(data []byte) ([]bookstore.Chapter, error) {
		if strings.TrimSpace(text) == "" {
			continue
		}
+		chNum++
		title := titleMap[href]
		if title == "" {
-			title = fmt.Sprintf("Chapter %d", i+1)
+			title = fmt.Sprintf("Chapter %d", chNum)
		}
+		_ = i // spine index unused for numbering
		chapters = append(chapters, bookstore.Chapter{
-			Number:  i + 1,
+			Number:  chNum,
			Title:   title,
			Content: text,
		})
@@ -788,80 +827,6 @@ func htmlToText(data []byte) string {
	return strings.TrimSpace(strings.Join(out, "\n"))
}

-// ── Chapter segmentation (shared by PDF and plain-text paths) ─────────────────
-
-// extractChaptersFromText splits a block of plain text into chapters by
-// detecting heading lines that match chapterHeadingRE.
-// Falls back to paragraph-splitting when no headings are found.
-func extractChaptersFromText(text string) []bookstore.Chapter {
-	lines := strings.Split(text, "\n")
-
-	type segment struct {
-		title  string
-		number int
-		lines  []string
-	}
-
-	var segments []segment
-	var cur *segment
-	chNum := 0
-
-	for _, line := range lines {
-		line = strings.TrimSpace(line)
-		if chapterHeadingRE.MatchString(line) {
-			if cur != nil {
-				segments = append(segments, *cur)
-			}
-			chNum++
-			// Try to parse the explicit chapter number from the heading.
-			if m := regexp.MustCompile(`\d+`).FindString(line); m != "" {
-				if n, err := strconv.Atoi(m); err == nil && n > 0 && n < 100000 {
-					chNum = n
-				}
-			}
-			cur = &segment{title: line, number: chNum}
-		} else if cur != nil && line != "" {
-			cur.lines = append(cur.lines, line)
-		}
-	}
-	if cur != nil {
-		segments = append(segments, *cur)
-	}
-
-	// Require segments to have meaningful content (>= 100 chars).
-	var chapters []bookstore.Chapter
-	for _, seg := range segments {
-		content := strings.Join(seg.lines, "\n")
-		if len(strings.TrimSpace(content)) < 50 {
-			continue
-		}
-		chapters = append(chapters, bookstore.Chapter{
-			Number:  seg.number,
-			Title:   seg.title,
-			Content: content,
-		})
-	}
-
-	// Fallback: no headings found — split by double newlines (paragraph blocks).
-	if len(chapters) == 0 {
-		paragraphs := strings.Split(text, "\n\n")
-		n := 0
-		for _, para := range paragraphs {
-			para = strings.TrimSpace(para)
-			if len(para) > 100 {
-				n++
-				chapters = append(chapters, bookstore.Chapter{
-					Number:  n,
-					Title:   fmt.Sprintf("Chapter %d", n),
-					Content: para,
-				})
-			}
-		}
-	}
-
-	return chapters
-}
-
// ── Chapter ingestion ─────────────────────────────────────────────────────────

// IngestChapters stores extracted chapters for a book.
@@ -100,6 +100,58 @@
	const genres = $derived(parseGenres(data.book?.genres ?? []));
	const chapterList = $derived(data.chapters ?? []);

+	// ── Admin: split chapters (imported PDF/EPUB books) ──────────────────────
+	const isFullTextBook = $derived(
+		chapterList.length === 1 && chapterList[0].title === 'Full Text'
+	);
+	let splitText = $state('');
+	let splitSaving = $state(false);
+	let splitResult = $state<'saved' | 'error' | ''>('');
+	let splitError = $state('');
+	let splitOpen = $state(false);
+
+	$effect(() => {
+		// Pre-fill the textarea with chapter 1 content when the panel is opened.
+		if (splitOpen && !splitText && data.book?.slug && isFullTextBook) {
+			fetch(`/api/chapter-markdown/${encodeURIComponent(data.book.slug)}/1`)
+				.then((r) => (r.ok ? r.text() : ''))
+				.then((t) => {
+					// Strip the leading "# Full Text\n\n" header if present.
+					splitText = t.replace(/^# Full Text\n\n/, '').trim();
+				})
+				.catch(() => {});
+		}
+	});
+
+	async function splitChapters() {
+		const slug = data.book?.slug;
+		if (splitSaving || !slug) return;
+		splitSaving = true;
+		splitResult = '';
+		splitError = '';
+		try {
+			const res = await fetch(`/api/admin/books/${encodeURIComponent(slug)}/split-chapters`, {
+				method: 'POST',
+				headers: { 'Content-Type': 'application/json' },
+				body: JSON.stringify({ text: splitText })
+			});
+			if (res.ok) {
+				splitResult = 'saved';
+				splitOpen = false;
+				await invalidateAll();
+			} else {
+				const d = await res.json().catch(() => ({}));
+				splitError = (d as any).error ?? 'Unknown error';
+				splitResult = 'error';
+			}
+		} catch (e: any) {
+			splitError = e?.message ?? '';
+			splitResult = 'error';
+		} finally {
+			splitSaving = false;
+		}
+	}
+
	// ── Admin: rescrape ───────────────────────────────────────────────────────
	let scraping = $state(false);
	let scrapeResult = $state<'queued' | 'busy' | 'error' | ''>('');

@@ -979,7 +1031,7 @@
	</a>

	<!-- Admin panel (collapsed by default, admin only) -->
-	{#if data.isAdmin && book.source_url}
+	{#if data.isAdmin}
		<div>
			<button
				onclick={() => (adminOpen = !adminOpen)}

@@ -997,6 +1049,62 @@
	{#if adminOpen}
		<div class="px-4 py-3 border-t border-(--color-border) flex flex-col gap-5">

+			<!-- Chapter split tool (only for imported books with a single "Full Text" chapter) -->
+			{#if isFullTextBook}
+				<div class="flex flex-col gap-2">
+					<div class="flex items-center justify-between">
+						<p class="text-xs font-medium text-(--color-muted) uppercase tracking-wide">Split Chapters</p>
+						<button
+							onclick={() => { splitOpen = !splitOpen; splitResult = ''; splitError = ''; }}
+							class="text-xs text-(--color-muted) hover:text-(--color-text) transition-colors"
+						>
+							{splitOpen ? 'Hide' : 'Edit'}
+						</button>
+					</div>
+					{#if !splitOpen}
+						<p class="text-xs text-(--color-muted)">
+							This book has a single "Full Text" chapter. Use this tool to split it into chapters.
+						</p>
+					{/if}
+					{#if splitOpen}
+						<p class="text-xs text-(--color-muted)">
+							Insert <code class="bg-(--color-surface-3) px-1 rounded">---</code> on its own line to divide chapters.
+							Optionally start a segment with <code class="bg-(--color-surface-3) px-1 rounded">## Chapter Title</code>.
+						</p>
+						<textarea
+							bind:value={splitText}
+							rows="16"
+							class="w-full px-2 py-1.5 rounded bg-(--color-surface-3) border border-(--color-border) text-(--color-text) text-xs font-mono focus:outline-none focus:border-(--color-brand) resize-y"
+							placeholder="Paste or edit the full text here. Use --- to split chapters."
+						></textarea>
+						<div class="flex items-center gap-3 flex-wrap">
+							<button
+								onclick={splitChapters}
+								disabled={splitSaving || !splitText.trim()}
+								class="flex items-center gap-1.5 px-3 py-1.5 rounded text-xs font-medium transition-colors
+								{splitSaving || !splitText.trim() ? 'bg-(--color-surface-3) text-(--color-muted) cursor-not-allowed' : 'bg-(--color-brand)/20 text-(--color-brand-dim) hover:bg-(--color-brand)/40 border border-(--color-brand)/30'}"
+							>
+								{#if splitSaving}
+									<svg class="w-3 h-3 animate-spin" fill="none" viewBox="0 0 24 24"><circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"/><path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z"/></svg>
+									Saving…
+								{:else}
+									Save chapters
+								{/if}
+							</button>
+							{#if splitResult === 'saved'}
+								<span class="text-xs text-green-400">Saved.</span>
+							{:else if splitResult === 'error'}
+								<span class="text-xs text-(--color-danger)">{splitError || 'Error.'}</span>
+							{/if}
+						</div>
+					{/if}
+				</div>
+				<hr class="border-(--color-border)" />
+			{/if}
+
+			<!-- Rescrape / range-scrape (only for scraped books with a source URL) -->
+			{#if book.source_url}
			<!-- Rescrape -->
			<div class="flex items-center gap-3 flex-wrap">
				<button

@@ -1065,6 +1173,7 @@
					</span>
				{/if}
			</div>
+			{/if}

	<hr class="border-(--color-border)" />