Compare commits

...

1 Commits

Author SHA1 Message Date
root
8662aed565 feat: PDF single-chapter import, EPUB numbering fix, admin chapter split tool
All checks were successful
Release / Check ui (push) Successful in 2m10s
Release / Test backend (push) Successful in 53s
Release / Docker (push) Successful in 6m7s
Release / Gitea Release (push) Successful in 23s
- parsePDF: return all text as single 'Full Text' chapter (admin splits manually)
- parseEPUB: fix chapter numbering to use sequential counter not spine index
- Remove dead code: chaptersFromBookmarks, cleanChapterText, extractChaptersFromText, chapterHeadingRE; drop pdfcpu alias and regexp imports
- Backend: POST /api/admin/books/:slug/split-chapters endpoint — splits text on '---' dividers, optional '## Title' headers, writes chapters via WriteChapter
- UI: admin panel now shows for all admin users regardless of source_url; chapter split tool shown when book has single 'Full Text' chapter, pre-fills from MinIO content
2026-04-09 23:59:24 +05:00
4 changed files with 298 additions and 241 deletions

View File

@@ -0,0 +1,141 @@
package backend
import (
"encoding/json"
"fmt"
"net/http"
"strings"
"github.com/libnovel/backend/internal/bookstore"
"github.com/libnovel/backend/internal/domain"
)
// handleAdminSplitChapters serves POST /api/admin/books/{slug}/split-chapters.
//
// The JSON request body has the shape
//
//	{ "text": "<full text with --- dividers and optional ## Title lines>" }
//
// The text is handed to splitChapterText, which cuts it on lines consisting
// solely of "---" and numbers the resulting chapters 1..N. A segment may open
// with a "## Title" line that supplies the chapter title.
//
// Every produced chapter is stored through BookWriter.WriteChapter. The
// handler never deletes anything itself: WriteChapter is expected to upsert by
// chapter number, so chapters 1..N are overwritten while any previously stored
// chapter numbered above N is left in place (run the dedup endpoint afterwards
// if those need to go).
//
// Responses:
//   - 503 when no BookWriter is configured
//   - 400 when the slug is empty, the body is not valid JSON, or text is blank
//   - 422 when the text yields no chapters
//   - 500 when a chapter write fails; chapters written before the failure are
//     not rolled back
//   - otherwise a JSON object with the chapter count and the slug
func (s *Server) handleAdminSplitChapters(w http.ResponseWriter, r *http.Request) {
	if s.deps.BookWriter == nil {
		jsonError(w, http.StatusServiceUnavailable, "book writer not configured")
		return
	}

	slug := r.PathValue("slug")
	if slug == "" {
		jsonError(w, http.StatusBadRequest, "slug is required")
		return
	}

	var payload struct {
		Text string `json:"text"`
	}
	if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
		jsonError(w, http.StatusBadRequest, "parse body: "+err.Error())
		return
	}
	if strings.TrimSpace(payload.Text) == "" {
		jsonError(w, http.StatusBadRequest, "text is required")
		return
	}

	parts := splitChapterText(payload.Text)
	if len(parts) == 0 {
		jsonError(w, http.StatusUnprocessableEntity, "no chapters produced from text")
		return
	}

	ctx := r.Context()
	for i := range parts {
		part := &parts[i]

		// The stored markdown always opens with a level-1 heading: the
		// chapter's own title, or "Chapter N" when it has none.
		heading := part.Title
		if heading == "" {
			heading = fmt.Sprintf("Chapter %d", part.Number)
		}

		chapter := domain.Chapter{
			Ref:  domain.ChapterRef{Number: part.Number, Title: part.Title},
			Text: fmt.Sprintf("# %s\n\n%s", heading, part.Content),
		}
		err := s.deps.BookWriter.WriteChapter(ctx, slug, chapter)
		if err != nil {
			jsonError(w, http.StatusInternalServerError, fmt.Sprintf("write chapter %d: %s", part.Number, err.Error()))
			return
		}
	}

	// NOTE(review): status 0 is forwarded to writeJSON as-is; presumably it
	// means "use the default 200" there — confirm against writeJSON.
	writeJSON(w, 0, map[string]any{
		"chapters": len(parts),
		"slug":     slug,
	})
}
// splitChapterText cuts text into chapters at divider lines, i.e. lines whose
// trimmed content is exactly "---".
//
// For each segment between dividers:
//   - leading blank lines are ignored;
//   - if the first non-blank line starts with "## " (after trimming), the rest
//     of that line becomes the chapter title and is removed from the content;
//   - the remaining lines, joined with "\n" and trimmed of surrounding
//     whitespace, form the content;
//   - segments whose content is empty (including title-only segments) are
//     dropped and do not consume a chapter number.
//
// Chapters are numbered sequentially from 1. A chapter without an explicit
// title is titled "Chapter N". The result is nil when no segment has content.
func splitChapterText(text string) []bookstore.Chapter {
	var (
		chapters []bookstore.Chapter
		pending  []string // lines of the segment currently being collected
	)

	// emit converts the collected lines into a chapter, if they carry any
	// content, and resets the accumulator for the next segment.
	emit := func() {
		seg := pending
		pending = nil

		// Find the first non-blank line; without one the segment is empty.
		first := 0
		for first < len(seg) && strings.TrimSpace(seg[first]) == "" {
			first++
		}
		if first == len(seg) {
			return
		}
		seg = seg[first:]

		var title string
		if rest, ok := strings.CutPrefix(strings.TrimSpace(seg[0]), "## "); ok {
			title = strings.TrimSpace(rest)
			seg = seg[1:]
		}

		// TrimSpace on the joined text also discards blank lines that follow
		// the title and any that trail the segment.
		body := strings.TrimSpace(strings.Join(seg, "\n"))
		if body == "" {
			return
		}

		number := len(chapters) + 1
		if title == "" {
			title = fmt.Sprintf("Chapter %d", number)
		}
		chapters = append(chapters, bookstore.Chapter{
			Number:  number,
			Title:   title,
			Content: body,
		})
	}

	for _, line := range strings.Split(text, "\n") {
		if strings.TrimSpace(line) == "---" {
			emit()
			continue
		}
		pending = append(pending, line)
	}
	emit() // whatever follows the last divider

	return chapters
}

View File

@@ -247,6 +247,9 @@ func (s *Server) ListenAndServe(ctx context.Context) error {
// Admin data repair endpoints
mux.HandleFunc("POST /api/admin/dedup-chapters/{slug}", s.handleDedupChapters)
// Admin chapter split (imported books)
mux.HandleFunc("POST /api/admin/books/{slug}/split-chapters", s.handleAdminSplitChapters)
// Import (PDF/EPUB)
mux.HandleFunc("POST /api/admin/import", s.handleAdminImport)
mux.HandleFunc("GET /api/admin/import", s.handleAdminImportList)

View File

@@ -7,7 +7,6 @@ import (
"fmt"
"io"
"os"
"regexp"
"sort"
"strconv"
"strings"
@@ -16,16 +15,10 @@ import (
"github.com/libnovel/backend/internal/domain"
minio "github.com/minio/minio-go/v7"
"github.com/pdfcpu/pdfcpu/pkg/api"
pdfcpu "github.com/pdfcpu/pdfcpu/pkg/pdfcpu"
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
"golang.org/x/net/html"
)
// chapterHeadingRE matches common chapter heading patterns:
// "Chapter 1", "Chapter 1:", "Chapter 1 -", "CHAPTER ONE", "1.", "Part 1", etc.
var chapterHeadingRE = regexp.MustCompile(
`(?i)^(?:chapter|ch\.?|part|episode|book)\s+(\d+|[ivxlcdm]+)\b|^\d{1,4}[\.\)]\s+\S`)
type importer struct {
mc *minioClient
}
@@ -148,17 +141,16 @@ var pdfSkipBookmarks = map[string]bool{
"appendix": true, "color insert": true, "color illustrations": true,
}
// parsePDF extracts chapters from PDF bytes.
// parsePDF extracts text from PDF bytes and returns it as a single chapter.
//
// The full readable text is returned as one chapter so the admin can manually
// split it into chapters via the UI using --- markers.
//
// Strategy:
// 1. Decrypt owner-protected PDFs (empty user password).
// 2. Read the PDF outline (bookmarks) — these give chapter titles and page ranges.
// 3. Extract raw content streams for every page using pdfcpu ExtractContent.
// 4. For each story bookmark, concatenate the extracted text of its pages.
//
// Falls back to paragraph-splitting when no bookmarks are found.
// This is fast (~100ms for a 250-page PDF) because it avoids font-glyph
// resolution which causes older PDF libraries to hang on publisher PDFs.
// 2. Extract raw content streams for every page using pdfcpu ExtractContent.
// 3. Concatenate text from all pages in order, skipping front matter
// (cover, title page, copyright — typically the first 10 pages).
func parsePDF(data []byte) ([]bookstore.Chapter, error) {
// Decrypt owner-protected PDFs (empty user password).
decrypted, err := decryptPDF(data)
@@ -186,9 +178,9 @@ func parsePDF(data []byte) ([]bookstore.Chapter, error) {
return nil, fmt.Errorf("PDF has no content pages")
}
// pdfcpu names files "out_Content_page_N.txt" — parse the page number
// from the filename so the map is correct regardless of lexicographic order.
// Parse page number from filename and build ordered text map.
pageTexts := make(map[int]string, len(entries))
maxPage := 0
for _, e := range entries {
pageNum := pageNumFromFilename(e.Name())
if pageNum <= 0 {
@@ -199,166 +191,49 @@ func parsePDF(data []byte) ([]bookstore.Chapter, error) {
continue
}
pageTexts[pageNum] = fixWin1252(extractTextFromContentStream(raw))
if pageNum > maxPage {
maxPage = pageNum
}
}
// Try to use bookmarks (outline) for chapter structure.
// Determine front-matter cutoff using bookmarks if available,
// otherwise skip the first 10 pages (cover/title/copyright).
bodyStart := 1
bookmarks, bmErr := api.Bookmarks(bytes.NewReader(data), conf)
if bmErr == nil && len(bookmarks) > 0 {
chapters := chaptersFromBookmarks(bookmarks, pageTexts)
if len(chapters) > 0 {
return chapters, nil
if bmErr == nil {
for _, bm := range bookmarks {
title := strings.ToLower(strings.TrimSpace(bm.Title))
if !pdfSkipBookmarks[title] && bm.PageFrom > 0 {
// First non-front-matter bookmark — body starts here.
bodyStart = bm.PageFrom
break
}
}
} else if maxPage > 10 {
bodyStart = 11
}
// Fallback: concatenate all page texts in page order and split by heading patterns.
// Concatenate all body pages.
var sb strings.Builder
maxPage := 0
for p := range pageTexts {
if p > maxPage {
maxPage = p
}
}
for p := 1; p <= maxPage; p++ {
sb.WriteString(pageTexts[p])
sb.WriteByte('\n')
}
chapters := extractChaptersFromText(sb.String())
if len(chapters) == 0 {
return nil, fmt.Errorf("could not extract any chapters from PDF")
}
return chapters, nil
}
// chaptersFromBookmarks builds a chapter list from PDF bookmarks + per-page text.
// It flattens the bookmark tree, skips front/back matter entries, and assigns
// page ranges so each chapter spans from its own start page to the next
// bookmark's start page minus one.
func chaptersFromBookmarks(bookmarks []pdfcpu.Bookmark, pageTexts map[int]string) []bookstore.Chapter {
// Flatten bookmark tree.
var flat []pdfcpu.Bookmark
var flatten func([]pdfcpu.Bookmark)
flatten = func(bms []pdfcpu.Bookmark) {
for _, bm := range bms {
flat = append(flat, bm)
flatten(bm.Kids)
}
}
flatten(bookmarks)
// Sort by page number.
sort.Slice(flat, func(i, j int) bool { return flat[i].PageFrom < flat[j].PageFrom })
// Assign PageThru for entries where it's 0 (last bookmark or missing).
maxPage := 0
for p := range pageTexts {
if p > maxPage {
maxPage = p
}
}
for i := range flat {
if flat[i].PageThru == 0 {
if i+1 < len(flat) {
flat[i].PageThru = flat[i+1].PageFrom - 1
} else {
flat[i].PageThru = maxPage
}
}
}
var chapters []bookstore.Chapter
chNum := 0
for _, bm := range flat {
if pdfSkipBookmarks[strings.ToLower(strings.TrimSpace(bm.Title))] {
for p := bodyStart; p <= maxPage; p++ {
t := strings.TrimSpace(pageTexts[p])
if t == "" {
continue
}
// Gather text for all pages in this bookmark's range.
// The first page of each chapter is typically a decorative title page
// (chapter number, subtitle art, series title) — skip it and start
// from PageFrom+1 so the content begins with actual story text.
bodyStart := bm.PageFrom + 1
if bodyStart > bm.PageThru {
bodyStart = bm.PageFrom // single-page section, use it
}
var sb strings.Builder
for p := bodyStart; p <= bm.PageThru; p++ {
if t, ok := pageTexts[p]; ok {
sb.WriteString(t)
sb.WriteByte('\n')
}
}
text := cleanChapterText(strings.TrimSpace(sb.String()))
if len(text) < 50 {
continue // skip nearly-empty sections
}
chNum++
chapters = append(chapters, bookstore.Chapter{
Number: chNum,
Title: bm.Title,
Content: text,
})
}
return chapters
}
// cleanChapterText removes decorative header fragments that sometimes appear
// at the start of the first body page when the chapter subtitle is printed
// at the top of that page (e.g. "for New Journeys!I stood atop the roof...").
//
// It strips any prefix text up to and including the last '!' or '?' that is
// immediately followed by a capital letter on the same line (a run-on from the
// title art), and removes short leading lines (< 40 chars) that look like
// title/header text rather than story content.
func cleanChapterText(text string) string {
lines := strings.Split(text, "\n")
// Find first line that is substantive story content.
// Strategy: skip short lines at the top. The first line >= 40 chars
// OR starting with an opening quote is the start of the story.
start := 0
for i, raw := range lines {
line := strings.TrimSpace(raw)
if line == "" {
start = i + 1
continue
}
// Long enough to be a real sentence fragment from a body page.
if len(line) >= 40 || strings.HasPrefix(line, "\u201C") || strings.HasPrefix(line, "\"") {
start = i
break
}
// Short line — if it ends with '!' or '?' and the NEXT non-empty
// token on the SAME line (run-on) starts a sentence, strip it.
// This catches "for New Journeys!I stood atop..." on one line.
start = i + 1 // tentatively skip this short line
sb.WriteString(t)
sb.WriteString("\n\n")
}
result := strings.TrimSpace(strings.Join(lines[start:], "\n"))
// Strip any run-on title fragment at the very start of the first line.
// Pattern: something ending with '!' or '?' immediately before a capital letter.
// e.g. "for New Journeys!I stood..." → "I stood..."
if len(result) > 0 {
// Find last '!' or '?' in the first 80 bytes that is followed by [A-Z"].
firstLine := result
if nl := strings.Index(firstLine, "\n"); nl >= 0 {
firstLine = firstLine[:nl]
}
for i, c := range firstLine {
if (c == '!' || c == '?') && i+1 < len(firstLine) {
next := rune(firstLine[i+1])
if (next >= 'A' && next <= 'Z') || next == '\u201C' || next == '"' {
// Strip up to and including this '!'/'?'
result = strings.TrimSpace(result[i+1:])
break
}
}
}
text := strings.TrimSpace(sb.String())
if text == "" {
return nil, fmt.Errorf("could not extract any text from PDF")
}
if result == "" {
return text
}
return result
return []bookstore.Chapter{{
Number: 1,
Title: "Full Text",
Content: text,
}}, nil
}
// pageNumFromFilename extracts the page number from a pdfcpu content-stream
@@ -637,6 +512,7 @@ func parseEPUB(data []byte) ([]bookstore.Chapter, error) {
}
var chapters []bookstore.Chapter
chNum := 0
for i, href := range spineFiles {
fullPath := opfDir + href
content, err := epubFileContent(zr, fullPath)
@@ -647,12 +523,14 @@ func parseEPUB(data []byte) ([]bookstore.Chapter, error) {
if strings.TrimSpace(text) == "" {
continue
}
chNum++
title := titleMap[href]
if title == "" {
title = fmt.Sprintf("Chapter %d", i+1)
title = fmt.Sprintf("Chapter %d", chNum)
}
_ = i // spine index unused for numbering
chapters = append(chapters, bookstore.Chapter{
Number: i + 1,
Number: chNum,
Title: title,
Content: text,
})
@@ -949,80 +827,6 @@ func htmlToText(data []byte) string {
return strings.TrimSpace(strings.Join(out, "\n"))
}
// ── Chapter segmentation (shared by PDF and plain-text paths) ─────────────────
// extractChaptersFromText splits a block of plain text into chapters by
// detecting heading lines that match chapterHeadingRE.
// Falls back to paragraph-splitting when no headings are found.
func extractChaptersFromText(text string) []bookstore.Chapter {
lines := strings.Split(text, "\n")
type segment struct {
title string
number int
lines []string
}
var segments []segment
var cur *segment
chNum := 0
for _, line := range lines {
line = strings.TrimSpace(line)
if chapterHeadingRE.MatchString(line) {
if cur != nil {
segments = append(segments, *cur)
}
chNum++
// Try to parse the explicit chapter number from the heading.
if m := regexp.MustCompile(`\d+`).FindString(line); m != "" {
if n, err := strconv.Atoi(m); err == nil && n > 0 && n < 100000 {
chNum = n
}
}
cur = &segment{title: line, number: chNum}
} else if cur != nil && line != "" {
cur.lines = append(cur.lines, line)
}
}
if cur != nil {
segments = append(segments, *cur)
}
// Require segments to have meaningful content (>= 100 chars).
var chapters []bookstore.Chapter
for _, seg := range segments {
content := strings.Join(seg.lines, "\n")
if len(strings.TrimSpace(content)) < 50 {
continue
}
chapters = append(chapters, bookstore.Chapter{
Number: seg.number,
Title: seg.title,
Content: content,
})
}
// Fallback: no headings found — split by double newlines (paragraph blocks).
if len(chapters) == 0 {
paragraphs := strings.Split(text, "\n\n")
n := 0
for _, para := range paragraphs {
para = strings.TrimSpace(para)
if len(para) > 100 {
n++
chapters = append(chapters, bookstore.Chapter{
Number: n,
Title: fmt.Sprintf("Chapter %d", n),
Content: para,
})
}
}
}
return chapters
}
// ── Chapter ingestion ─────────────────────────────────────────────────────────
// IngestChapters stores extracted chapters for a book.

View File

@@ -100,6 +100,58 @@
const genres = $derived(parseGenres(data.book?.genres ?? []));
const chapterList = $derived(data.chapters ?? []);
// ── Admin: split chapters (imported PDF/EPUB books) ──────────────────────
const isFullTextBook = $derived(
chapterList.length === 1 && chapterList[0].title === 'Full Text'
);
let splitText = $state('');
let splitSaving = $state(false);
let splitResult = $state<'saved' | 'error' | ''>('');
let splitError = $state('');
let splitOpen = $state(false);
// Pre-fill effect for the split-chapters textarea.
// Reads splitOpen, splitText, data.book?.slug and isFullTextBook; when the
// panel is open, the textarea is still empty and the book consists of the
// single "Full Text" chapter, it loads chapter 1's markdown and stores it in
// splitText. Because the guard requires splitText to be empty, the fetch is
// not issued again once text is present.
$effect(() => {
// Pre-fill the textarea with chapter 1 content when the panel is opened.
if (splitOpen && !splitText && data.book?.slug && isFullTextBook) {
fetch(`/api/chapter-markdown/${encodeURIComponent(data.book.slug)}/1`)
// A non-OK response is treated as empty text rather than an error.
.then((r) => r.ok ? r.text() : '')
.then((t) => {
// Strip leading "# Full Text\n\n" header if present.
splitText = t.replace(/^# Full Text\n\n/, '').trim();
})
// Network failures are deliberately ignored: the textarea just stays empty
// and the admin can paste the text manually.
.catch(() => {});
}
});
/**
 * Sends the textarea content to the admin split-chapters endpoint for the
 * current book.
 *
 * Does nothing when a save is already in flight or the book has no slug.
 * On a successful response the panel is closed and all page data is
 * invalidated so the chapter list reloads. On a non-OK response the `error`
 * field of the JSON body (or 'Unknown error') is shown; on a thrown error its
 * message is shown. `splitSaving` is always cleared at the end.
 */
async function splitChapters() {
	const slug = data.book?.slug;
	if (splitSaving || !slug) return;

	splitSaving = true;
	splitResult = '';
	splitError = '';

	try {
		const endpoint = `/api/admin/books/${encodeURIComponent(slug)}/split-chapters`;
		const res = await fetch(endpoint, {
			method: 'POST',
			headers: { 'Content-Type': 'application/json' },
			body: JSON.stringify({ text: splitText })
		});

		if (!res.ok) {
			// The body may not be JSON; fall back to an empty object.
			const payload = await res.json().catch(() => ({}));
			splitError = (payload as any).error ?? 'Unknown error';
			splitResult = 'error';
			return;
		}

		splitResult = 'saved';
		splitOpen = false;
		await invalidateAll();
	} catch (e: any) {
		splitError = e?.message ?? '';
		splitResult = 'error';
	} finally {
		splitSaving = false;
	}
}
// ── Admin: rescrape ───────────────────────────────────────────────────────
let scraping = $state(false);
let scrapeResult = $state<'queued' | 'busy' | 'error' | ''>('');
@@ -979,7 +1031,7 @@
</a>
<!-- Admin panel (collapsed by default, admin only) -->
{#if data.isAdmin && book.source_url}
{#if data.isAdmin}
<div>
<button
onclick={() => (adminOpen = !adminOpen)}
@@ -997,6 +1049,62 @@
{#if adminOpen}
<div class="px-4 py-3 border-t border-(--color-border) flex flex-col gap-5">
<!-- Chapter split tool (only for imported books with single "Full Text" chapter) -->
{#if isFullTextBook}
<div class="flex flex-col gap-2">
<div class="flex items-center justify-between">
<p class="text-xs font-medium text-(--color-muted) uppercase tracking-wide">Split Chapters</p>
<button
onclick={() => { splitOpen = !splitOpen; splitResult = ''; splitError = ''; }}
class="text-xs text-(--color-muted) hover:text-(--color-text) transition-colors"
>
{splitOpen ? 'Hide' : 'Edit'}
</button>
</div>
{#if !splitOpen}
<p class="text-xs text-(--color-muted)">
This book has a single "Full Text" chapter. Use this tool to split it into chapters.
</p>
{/if}
{#if splitOpen}
<p class="text-xs text-(--color-muted)">
Insert <code class="bg-(--color-surface-3) px-1 rounded">---</code> on its own line to divide chapters.
Optionally start a segment with <code class="bg-(--color-surface-3) px-1 rounded">## Chapter Title</code>.
</p>
<textarea
bind:value={splitText}
rows="16"
class="w-full px-2 py-1.5 rounded bg-(--color-surface-3) border border-(--color-border) text-(--color-text) text-xs font-mono focus:outline-none focus:border-(--color-brand) resize-y"
placeholder="Paste or edit the full text here. Use --- to split chapters."
></textarea>
<div class="flex items-center gap-3 flex-wrap">
<button
onclick={splitChapters}
disabled={splitSaving || !splitText.trim()}
class="flex items-center gap-1.5 px-3 py-1.5 rounded text-xs font-medium transition-colors
{splitSaving || !splitText.trim() ? 'bg-(--color-surface-3) text-(--color-muted) cursor-not-allowed' : 'bg-(--color-brand)/20 text-(--color-brand-dim) hover:bg-(--color-brand)/40 border border-(--color-brand)/30'}"
>
{#if splitSaving}
<svg class="w-3 h-3 animate-spin" fill="none" viewBox="0 0 24 24"><circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"/><path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z"/></svg>
Saving…
{:else}
Save chapters
{/if}
</button>
{#if splitResult === 'saved'}
<span class="text-xs text-green-400">Saved.</span>
{:else if splitResult === 'error'}
<span class="text-xs text-(--color-danger)">{splitError || 'Error.'}</span>
{/if}
</div>
{/if}
</div>
<hr class="border-(--color-border)" />
{/if}
<!-- Rescrape / range-scrape (only for scraped books with a source URL) -->
{#if book.source_url}
<!-- Rescrape -->
<div class="flex items-center gap-3 flex-wrap">
<button
@@ -1065,6 +1173,7 @@
</span>
{/if}
</div>
{/if}
<hr class="border-(--color-border)" />