- parsePDF: return all text as single 'Full Text' chapter (admin splits manually) - parseEPUB: fix chapter numbering to use sequential counter not spine index - Remove dead code: chaptersFromBookmarks, cleanChapterText, extractChaptersFromText, chapterHeadingRE; drop pdfcpu alias and regexp imports - Backend: POST /api/admin/books/:slug/split-chapters endpoint — splits text on '---' dividers, optional '## Title' headers, writes chapters via WriteChapter - UI: admin panel now shows for all admin users regardless of source_url; chapter split tool shown when book has single 'Full Text' chapter, pre-fills from MinIO content
858 lines
22 KiB
Go
package storage
|
||
|
||
import (
	"archive/zip"
	"bytes"
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"unicode/utf8"

	"github.com/libnovel/backend/internal/bookstore"
	"github.com/libnovel/backend/internal/domain"
	minio "github.com/minio/minio-go/v7"
	"github.com/pdfcpu/pdfcpu/pkg/api"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
	"golang.org/x/net/html"
)
|
||
|
||
// importer reads uploaded book files from MinIO and parses them into
// chapters. It implements bookstore.BookImporter.
type importer struct {
	mc *minioClient // MinIO connection shared with the parent Store
}
|
||
|
||
// NewBookImporter creates a BookImporter that reads files from MinIO.
// The importer shares the Store's MinIO client and performs no I/O until
// Import is called.
func NewBookImporter(s *Store) bookstore.BookImporter {
	return &importer{mc: s.mc}
}
|
||
|
||
func (i *importer) Import(ctx context.Context, objectKey, fileType string) ([]bookstore.Chapter, error) {
|
||
if fileType != "pdf" && fileType != "epub" {
|
||
return nil, fmt.Errorf("unsupported file type: %s", fileType)
|
||
}
|
||
|
||
obj, err := i.mc.client.GetObject(ctx, "imports", objectKey, minio.GetObjectOptions{})
|
||
if err != nil {
|
||
return nil, fmt.Errorf("get object from minio: %w", err)
|
||
}
|
||
defer obj.Close()
|
||
|
||
data, err := io.ReadAll(obj)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("read object: %w", err)
|
||
}
|
||
|
||
if fileType == "pdf" {
|
||
return parsePDF(data)
|
||
}
|
||
return parseEPUB(data)
|
||
}
|
||
|
||
// AnalyzeFile parses the given PDF or EPUB data and returns the detected
|
||
// chapter count and up to 3 preview lines (first non-empty line of each of
|
||
// the first 3 chapters). It is used by the analyze-only endpoint so users
|
||
// can preview chapter count before committing the import.
|
||
// Note: uses parsePDF which is backed by pdfcpu ExtractContent — fast, no hang risk.
|
||
func AnalyzeFile(data []byte, fileType string) (chapterCount int, firstLines []string, err error) {
|
||
var chapters []bookstore.Chapter
|
||
switch fileType {
|
||
case "pdf":
|
||
chapters, err = parsePDF(data)
|
||
case "epub":
|
||
chapters, err = parseEPUB(data)
|
||
default:
|
||
return 0, nil, fmt.Errorf("unsupported file type: %s", fileType)
|
||
}
|
||
if err != nil {
|
||
return 0, nil, err
|
||
}
|
||
chapterCount = len(chapters)
|
||
for i, ch := range chapters {
|
||
if i >= 3 {
|
||
break
|
||
}
|
||
line := strings.TrimSpace(ch.Content)
|
||
if nl := strings.Index(line, "\n"); nl > 0 {
|
||
line = line[:nl]
|
||
}
|
||
if len(line) > 120 {
|
||
line = line[:120] + "…"
|
||
}
|
||
firstLines = append(firstLines, line)
|
||
}
|
||
return chapterCount, firstLines, nil
|
||
}
|
||
|
||
|
||
|
||
// decryptPDF strips encryption from a PDF using an empty user password.
|
||
// Returns the decrypted bytes, or an error if decryption is not possible.
|
||
// This handles the common case of "owner-only" encrypted PDFs (copy/print
|
||
// restrictions) which use an empty user password and open normally in readers.
|
||
func decryptPDF(data []byte) ([]byte, error) {
|
||
conf := model.NewDefaultConfiguration()
|
||
conf.UserPW = ""
|
||
conf.OwnerPW = ""
|
||
|
||
var out bytes.Buffer
|
||
err := api.Decrypt(bytes.NewReader(data), &out, conf)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return out.Bytes(), nil
|
||
}
|
||
|
||
// ParseImportFile parses a PDF or EPUB and returns chapters.
// Unlike AnalyzeFile it respects ctx cancellation so callers can apply a timeout.
// For PDFs it first attempts to strip encryption with an empty password.
//
// On cancellation the parsing goroutine is NOT killed — it keeps running until
// the parser returns, then completes its send into the buffered channel and
// exits, so no goroutine is leaked; its result is simply discarded.
func ParseImportFile(ctx context.Context, data []byte, fileType string) ([]bookstore.Chapter, error) {
	// result bundles the parser's return values for the channel hand-off.
	type result struct {
		chapters []bookstore.Chapter
		err      error
	}
	// Buffered (size 1) so the goroutine can always complete its send and
	// exit even after the caller has given up on ctx.Done().
	ch := make(chan result, 1)
	go func() {
		var chapters []bookstore.Chapter
		var err error
		switch fileType {
		case "pdf":
			chapters, err = parsePDF(data)
		case "epub":
			chapters, err = parseEPUB(data)
		default:
			err = fmt.Errorf("unsupported file type: %s", fileType)
		}
		ch <- result{chapters, err}
	}()
	select {
	case <-ctx.Done():
		return nil, fmt.Errorf("parse timed out: %w", ctx.Err())
	case r := <-ch:
		return r.chapters, r.err
	}
}
|
||
|
||
// pdfSkipBookmarks lists bookmark titles that are front/back matter, not story chapters.
// These are skipped when building the chapter list.
// Keys are lowercase; look up with strings.ToLower(strings.TrimSpace(title)).
var pdfSkipBookmarks = map[string]bool{
	"cover": true, "insert": true, "title page": true, "copyright": true,
	"appendix": true, "color insert": true, "color illustrations": true,
}
|
||
|
||
// parsePDF extracts text from PDF bytes and returns it as a single chapter.
|
||
//
|
||
// The full readable text is returned as one chapter so the admin can manually
|
||
// split it into chapters via the UI using --- markers.
|
||
//
|
||
// Strategy:
|
||
// 1. Decrypt owner-protected PDFs (empty user password).
|
||
// 2. Extract raw content streams for every page using pdfcpu ExtractContent.
|
||
// 3. Concatenate text from all pages in order, skipping front matter
|
||
// (cover, title page, copyright — typically the first 10 pages).
|
||
func parsePDF(data []byte) ([]bookstore.Chapter, error) {
|
||
// Decrypt owner-protected PDFs (empty user password).
|
||
decrypted, err := decryptPDF(data)
|
||
if err == nil {
|
||
data = decrypted
|
||
}
|
||
|
||
conf := model.NewDefaultConfiguration()
|
||
conf.UserPW = ""
|
||
conf.OwnerPW = ""
|
||
|
||
// Extract all page content streams to a temp directory.
|
||
tmpDir, err := os.MkdirTemp("", "pdf-extract-*")
|
||
if err != nil {
|
||
return nil, fmt.Errorf("create temp dir: %w", err)
|
||
}
|
||
defer os.RemoveAll(tmpDir)
|
||
|
||
if err := api.ExtractContent(bytes.NewReader(data), tmpDir, "out", nil, conf); err != nil {
|
||
return nil, fmt.Errorf("extract PDF content: %w", err)
|
||
}
|
||
|
||
entries, err := os.ReadDir(tmpDir)
|
||
if err != nil || len(entries) == 0 {
|
||
return nil, fmt.Errorf("PDF has no content pages")
|
||
}
|
||
|
||
// Parse page number from filename and build ordered text map.
|
||
pageTexts := make(map[int]string, len(entries))
|
||
maxPage := 0
|
||
for _, e := range entries {
|
||
pageNum := pageNumFromFilename(e.Name())
|
||
if pageNum <= 0 {
|
||
continue
|
||
}
|
||
raw, readErr := os.ReadFile(tmpDir + "/" + e.Name())
|
||
if readErr != nil {
|
||
continue
|
||
}
|
||
pageTexts[pageNum] = fixWin1252(extractTextFromContentStream(raw))
|
||
if pageNum > maxPage {
|
||
maxPage = pageNum
|
||
}
|
||
}
|
||
|
||
// Determine front-matter cutoff using bookmarks if available,
|
||
// otherwise skip the first 10 pages (cover/title/copyright).
|
||
bodyStart := 1
|
||
bookmarks, bmErr := api.Bookmarks(bytes.NewReader(data), conf)
|
||
if bmErr == nil {
|
||
for _, bm := range bookmarks {
|
||
title := strings.ToLower(strings.TrimSpace(bm.Title))
|
||
if !pdfSkipBookmarks[title] && bm.PageFrom > 0 {
|
||
// First non-front-matter bookmark — body starts here.
|
||
bodyStart = bm.PageFrom
|
||
break
|
||
}
|
||
}
|
||
} else if maxPage > 10 {
|
||
bodyStart = 11
|
||
}
|
||
|
||
// Concatenate all body pages.
|
||
var sb strings.Builder
|
||
for p := bodyStart; p <= maxPage; p++ {
|
||
t := strings.TrimSpace(pageTexts[p])
|
||
if t == "" {
|
||
continue
|
||
}
|
||
sb.WriteString(t)
|
||
sb.WriteString("\n\n")
|
||
}
|
||
|
||
text := strings.TrimSpace(sb.String())
|
||
if text == "" {
|
||
return nil, fmt.Errorf("could not extract any text from PDF")
|
||
}
|
||
|
||
return []bookstore.Chapter{{
|
||
Number: 1,
|
||
Title: "Full Text",
|
||
Content: text,
|
||
}}, nil
|
||
}
|
||
|
||
// pageNumFromFilename extracts the page number from a pdfcpu content-stream
// filename like "out_Content_page_42.txt". Returns 0 if not parseable.
func pageNumFromFilename(name string) int {
	base := name
	// Drop any directory prefix.
	if slash := strings.LastIndex(base, "/"); slash >= 0 {
		base = base[slash+1:]
	}
	// Drop the file extension.
	if dot := strings.LastIndex(base, "."); dot >= 0 {
		base = base[:dot]
	}
	// The page number follows the final underscore.
	under := strings.LastIndex(base, "_")
	if under < 0 {
		return 0
	}
	num, convErr := strconv.Atoi(base[under+1:])
	if convErr != nil || num <= 0 {
		return 0
	}
	return num
}
|
||
|
||
// win1252ToUnicode maps the Windows-1252 control range 0x80–0x9F to the
// Unicode characters they actually represent in that encoding.
// Standard Latin-1 maps these bytes to control characters; Win-1252 maps
// them to typographic symbols that appear in publisher PDFs.
// Bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D are unassigned in Windows-1252 and
// are therefore intentionally absent from this table.
var win1252ToUnicode = map[byte]rune{
	0x80: '\u20AC', // €
	0x82: '\u201A', // ‚
	0x83: '\u0192', // ƒ
	0x84: '\u201E', // „
	0x85: '\u2026', // …
	0x86: '\u2020', // †
	0x87: '\u2021', // ‡
	0x88: '\u02C6', // ˆ
	0x89: '\u2030', // ‰
	0x8A: '\u0160', // Š
	0x8B: '\u2039', // ‹
	0x8C: '\u0152', // Œ
	0x8E: '\u017D', // Ž
	0x91: '\u2018', // left single quotation mark
	0x92: '\u2019', // right single quotation mark / apostrophe
	0x93: '\u201C', // left double quotation mark
	0x94: '\u201D', // right double quotation mark
	0x95: '\u2022', // • (bullet)
	0x96: '\u2013', // – (en dash)
	0x97: '\u2014', // — (em dash)
	0x98: '\u02DC', // ˜
	0x99: '\u2122', // ™
	0x9A: '\u0161', // š
	0x9B: '\u203A', // ›
	0x9C: '\u0153', // œ
	0x9E: '\u017E', // ž
	0x9F: '\u0178', // Ÿ
}
|
||
|
||
// fixWin1252 replaces Windows-1252 specific bytes (0x80–0x9F) in a string
|
||
// that was decoded as raw Latin-1 bytes with their proper Unicode equivalents.
|
||
func fixWin1252(s string) string {
|
||
// Fast path: if no bytes in 0x80–0x9F range, return unchanged.
|
||
needsFix := false
|
||
for i := 0; i < len(s); i++ {
|
||
b := s[i]
|
||
if b >= 0x80 && b <= 0x9F {
|
||
needsFix = true
|
||
break
|
||
}
|
||
}
|
||
if !needsFix {
|
||
return s
|
||
}
|
||
var sb strings.Builder
|
||
sb.Grow(len(s))
|
||
for i := 0; i < len(s); i++ {
|
||
b := s[i]
|
||
if b >= 0x80 && b <= 0x9F {
|
||
if r, ok := win1252ToUnicode[b]; ok {
|
||
sb.WriteRune(r)
|
||
continue
|
||
}
|
||
}
|
||
sb.WriteByte(b)
|
||
}
|
||
return sb.String()
|
||
}
|
||
|
||
// extractTextFromContentStream parses a raw PDF content stream and extracts
|
||
// readable text from Tj and TJ operators.
|
||
//
|
||
// TJ arrays may contain a mix of literal strings (parenthesised) and hex glyph
|
||
// arrays. Only the literal strings are decoded — hex arrays require per-font
|
||
// ToUnicode CMaps and are skipped. Kerning adjustment numbers inside TJ arrays
|
||
// are also ignored (they're just spacing hints).
|
||
//
|
||
// Line breaks are inserted on ET / Td / TD / T* operators.
|
||
func extractTextFromContentStream(stream []byte) string {
|
||
s := string(stream)
|
||
var sb strings.Builder
|
||
i := 0
|
||
n := len(s)
|
||
for i < n {
|
||
// TJ array: [ ... ]TJ — collect all literal strings, skip hex & numbers.
|
||
if s[i] == '[' {
|
||
j := i + 1
|
||
for j < n && s[j] != ']' {
|
||
if s[j] == '(' {
|
||
// Literal string inside TJ array.
|
||
k := j + 1
|
||
depth := 1
|
||
for k < n && depth > 0 {
|
||
if s[k] == '\\' {
|
||
k += 2
|
||
continue
|
||
}
|
||
if s[k] == '(' {
|
||
depth++
|
||
} else if s[k] == ')' {
|
||
depth--
|
||
}
|
||
k++
|
||
}
|
||
lit := pdfUnescapeString(s[j+1 : k-1])
|
||
if hasPrintableASCII(lit) {
|
||
sb.WriteString(lit)
|
||
}
|
||
j = k
|
||
continue
|
||
}
|
||
j++
|
||
}
|
||
// Check if this is a TJ operator (skip whitespace after ']').
|
||
end := j + 1
|
||
for end < n && (s[end] == ' ' || s[end] == '\t' || s[end] == '\r' || s[end] == '\n') {
|
||
end++
|
||
}
|
||
if end+2 <= n && s[end:end+2] == "TJ" && (end+2 == n || !isAlphaNum(s[end+2])) {
|
||
i = end + 2
|
||
continue
|
||
}
|
||
i = j + 1
|
||
continue
|
||
}
|
||
// Single string: (string) Tj
|
||
if s[i] == '(' {
|
||
j := i + 1
|
||
depth := 1
|
||
for j < n && depth > 0 {
|
||
if s[j] == '\\' {
|
||
j += 2
|
||
continue
|
||
}
|
||
if s[j] == '(' {
|
||
depth++
|
||
} else if s[j] == ')' {
|
||
depth--
|
||
}
|
||
j++
|
||
}
|
||
lit := pdfUnescapeString(s[i+1 : j-1])
|
||
if hasPrintableASCII(lit) {
|
||
// Check for Tj operator.
|
||
end := j
|
||
for end < n && (s[end] == ' ' || s[end] == '\t') {
|
||
end++
|
||
}
|
||
if end+2 <= n && s[end:end+2] == "Tj" && (end+2 == n || !isAlphaNum(s[end+2])) {
|
||
sb.WriteString(lit)
|
||
i = end + 2
|
||
continue
|
||
}
|
||
}
|
||
i = j
|
||
continue
|
||
}
|
||
// Detect end of text object (ET) — add a newline.
|
||
if i+2 <= n && s[i:i+2] == "ET" && (i+2 == n || !isAlphaNum(s[i+2])) {
|
||
sb.WriteByte('\n')
|
||
i += 2
|
||
continue
|
||
}
|
||
// Detect Td / TD / T* — newline within text block.
|
||
if i+2 <= n && (s[i:i+2] == "Td" || s[i:i+2] == "TD" || s[i:i+2] == "T*") &&
|
||
(i+2 == n || !isAlphaNum(s[i+2])) {
|
||
sb.WriteByte('\n')
|
||
i += 2
|
||
continue
|
||
}
|
||
i++
|
||
}
|
||
return sb.String()
|
||
}
|
||
|
||
// isAlphaNum reports whether b is an ASCII letter, digit, or underscore —
// i.e. a byte that could continue a PDF operator name.
func isAlphaNum(b byte) bool {
	switch {
	case 'a' <= b && b <= 'z':
		return true
	case 'A' <= b && b <= 'Z':
		return true
	case '0' <= b && b <= '9':
		return true
	case b == '_':
		return true
	}
	return false
}
|
||
|
||
// hasPrintableASCII reports whether s contains at least one printable ASCII
// rune (0x20–0x7E). Used to drop glyph-encoded strings that decode to
// control bytes only.
func hasPrintableASCII(s string) bool {
	for _, r := range s {
		if r < 0x20 || r >= 0x7F {
			continue
		}
		return true
	}
	return false
}
|
||
|
||
// pdfUnescapeString handles PDF literal-string escape sequences
// (PDF 32000-1, §7.3.4.2):
//
//   - \n \r \t \b \f — control characters
//   - \( \) \\       — escaped delimiters
//   - \ddd           — octal character code, at MOST three digits;
//     high-order overflow keeps the low 8 bits
//   - \<EOL>         — line continuation: the backslash and the end-of-line
//     marker produce no output at all
//
// Any other \x sequence yields x unchanged.
func pdfUnescapeString(s string) string {
	if !strings.ContainsRune(s, '\\') {
		return s
	}
	var sb strings.Builder
	for i := 0; i < len(s); {
		if s[i] != '\\' || i+1 >= len(s) {
			// Ordinary byte, or a trailing lone backslash.
			sb.WriteByte(s[i])
			i++
			continue
		}
		switch c := s[i+1]; c {
		case 'n':
			sb.WriteByte('\n')
		case 'r':
			sb.WriteByte('\r')
		case 't':
			sb.WriteByte('\t')
		case 'b':
			sb.WriteByte('\b')
		case 'f':
			sb.WriteByte('\f')
		case '(', ')', '\\':
			sb.WriteByte(c)
		case '\n':
			// Backslash–LF line continuation: emit nothing.
		case '\r':
			// Backslash–CR (or CRLF) line continuation: emit nothing.
			if i+2 < len(s) && s[i+2] == '\n' {
				i++ // also consume the LF of a CRLF pair
			}
		default:
			if c >= '0' && c <= '7' {
				// Octal escape \d, \dd or \ddd — maximum THREE digits per
				// spec; a fourth digit is ordinary text. (The previous code
				// consumed up to four digits, mis-decoding e.g. "\1012".)
				end := i + 2
				for end < i+4 && end < len(s) && s[end] >= '0' && s[end] <= '7' {
					end++
				}
				val, _ := strconv.ParseInt(s[i+1:end], 8, 16)
				sb.WriteByte(byte(val)) // byte() keeps the low 8 bits
				i = end
				continue
			}
			// Unknown escape: the backslash is dropped, the char kept.
			sb.WriteByte(c)
		}
		i += 2
	}
	return sb.String()
}
|
||
|
||
// ── EPUB parsing ──────────────────────────────────────────────────────────────
|
||
|
||
func parseEPUB(data []byte) ([]bookstore.Chapter, error) {
|
||
zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
|
||
if err != nil {
|
||
return nil, fmt.Errorf("open EPUB zip: %w", err)
|
||
}
|
||
|
||
// 1. Read META-INF/container.xml → find rootfile (content.opf path).
|
||
opfPath, err := epubRootfilePath(zr)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("epub container: %w", err)
|
||
}
|
||
|
||
// 2. Parse content.opf → spine order of chapter files.
|
||
spineFiles, titleMap, err := epubSpine(zr, opfPath)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("epub spine: %w", err)
|
||
}
|
||
|
||
if len(spineFiles) == 0 {
|
||
return nil, fmt.Errorf("EPUB spine is empty")
|
||
}
|
||
|
||
// Base directory of the OPF file for resolving relative hrefs.
|
||
opfDir := ""
|
||
if idx := strings.LastIndex(opfPath, "/"); idx >= 0 {
|
||
opfDir = opfPath[:idx+1]
|
||
}
|
||
|
||
var chapters []bookstore.Chapter
|
||
chNum := 0
|
||
for i, href := range spineFiles {
|
||
fullPath := opfDir + href
|
||
content, err := epubFileContent(zr, fullPath)
|
||
if err != nil {
|
||
continue
|
||
}
|
||
text := htmlToText(content)
|
||
if strings.TrimSpace(text) == "" {
|
||
continue
|
||
}
|
||
chNum++
|
||
title := titleMap[href]
|
||
if title == "" {
|
||
title = fmt.Sprintf("Chapter %d", chNum)
|
||
}
|
||
_ = i // spine index unused for numbering
|
||
chapters = append(chapters, bookstore.Chapter{
|
||
Number: chNum,
|
||
Title: title,
|
||
Content: text,
|
||
})
|
||
}
|
||
|
||
if len(chapters) == 0 {
|
||
return nil, fmt.Errorf("no readable chapters found in EPUB")
|
||
}
|
||
return chapters, nil
|
||
}
|
||
|
||
// epubRootfilePath parses META-INF/container.xml and returns the full-path
|
||
// of the OPF package document.
|
||
func epubRootfilePath(zr *zip.Reader) (string, error) {
|
||
f := zipFile(zr, "META-INF/container.xml")
|
||
if f == nil {
|
||
return "", fmt.Errorf("META-INF/container.xml not found")
|
||
}
|
||
rc, err := f.Open()
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
defer rc.Close()
|
||
|
||
doc, err := html.Parse(rc)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
var path string
|
||
var walk func(*html.Node)
|
||
walk = func(n *html.Node) {
|
||
if n.Type == html.ElementNode && strings.EqualFold(n.Data, "rootfile") {
|
||
for _, a := range n.Attr {
|
||
if strings.EqualFold(a.Key, "full-path") {
|
||
path = a.Val
|
||
return
|
||
}
|
||
}
|
||
}
|
||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
walk(c)
|
||
}
|
||
}
|
||
walk(doc)
|
||
|
||
if path == "" {
|
||
return "", fmt.Errorf("rootfile full-path not found in container.xml")
|
||
}
|
||
return path, nil
|
||
}
|
||
|
||
// epubSpine parses the OPF document and returns the spine item hrefs in order,
// plus a map from href → nav title (if available from NCX/NAV).
//
// The hrefs are relative to the OPF file's directory; the title map may be
// empty if the package has no usable NCX. Errors are returned only for a
// missing/unreadable OPF — a missing NCX is silently tolerated.
func epubSpine(zr *zip.Reader, opfPath string) ([]string, map[string]string, error) {
	f := zipFile(zr, opfPath)
	if f == nil {
		return nil, nil, fmt.Errorf("OPF file %q not found in EPUB", opfPath)
	}
	rc, err := f.Open()
	if err != nil {
		return nil, nil, err
	}
	defer rc.Close()

	opfData, err := io.ReadAll(rc)
	if err != nil {
		return nil, nil, err
	}

	// Build id→href map from <manifest>.
	idToHref := make(map[string]string)
	// Also keep a href→navTitle map (populated from NCX later).
	hrefTitle := make(map[string]string)

	// Parse OPF XML with html.Parse (lenient; handles malformed XML too).
	doc, _ := html.Parse(bytes.NewReader(opfData))

	var manifestItems []struct{ id, href, mediaType string }
	var spineIdrefs []string
	var ncxID string // value of <spine toc="…">, names the NCX manifest item

	// One pass over the tree collects manifest items, spine idrefs and the
	// NCX id; element/attribute names are matched case-insensitively.
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			tag := strings.ToLower(n.Data)
			switch tag {
			case "item":
				var id, href, mt string
				for _, a := range n.Attr {
					switch strings.ToLower(a.Key) {
					case "id":
						id = a.Val
					case "href":
						href = a.Val
					case "media-type":
						mt = a.Val
					}
				}
				if id != "" && href != "" {
					manifestItems = append(manifestItems, struct{ id, href, mediaType string }{id, href, mt})
					idToHref[id] = href
				}
			case "itemref":
				for _, a := range n.Attr {
					if strings.ToLower(a.Key) == "idref" {
						spineIdrefs = append(spineIdrefs, a.Val)
					}
				}
			case "spine":
				for _, a := range n.Attr {
					if strings.ToLower(a.Key) == "toc" {
						ncxID = a.Val
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	// Build ordered spine href list; idrefs without a manifest entry are dropped.
	var spineHrefs []string
	for _, idref := range spineIdrefs {
		if href, ok := idToHref[idref]; ok {
			spineHrefs = append(spineHrefs, href)
		}
	}

	// If no explicit spine, fall back to all (X)HTML manifest items sorted
	// alphabetically by href (a deterministic, if approximate, reading order).
	if len(spineHrefs) == 0 {
		sort.Slice(manifestItems, func(i, j int) bool {
			return manifestItems[i].href < manifestItems[j].href
		})
		for _, it := range manifestItems {
			mt := strings.ToLower(it.mediaType)
			if strings.Contains(mt, "html") || strings.HasSuffix(strings.ToLower(it.href), ".html") || strings.HasSuffix(strings.ToLower(it.href), ".xhtml") {
				spineHrefs = append(spineHrefs, it.href)
			}
		}
	}

	// Try to get chapter titles from NCX (toc.ncx); best-effort only.
	opfDir := ""
	if idx := strings.LastIndex(opfPath, "/"); idx >= 0 {
		opfDir = opfPath[:idx+1]
	}
	if ncxHref, ok := idToHref[ncxID]; ok {
		ncxPath := opfDir + ncxHref
		if ncxFile := zipFile(zr, ncxPath); ncxFile != nil {
			if ncxRC, err := ncxFile.Open(); err == nil {
				defer ncxRC.Close()
				parseNCXTitles(ncxRC, hrefTitle)
			}
		}
	}

	return spineHrefs, hrefTitle, nil
}
|
||
|
||
// parseNCXTitles extracts navPoint label→src mappings from a toc.ncx.
// Results are written into out keyed by the content src with any fragment
// stripped, so lookups by spine href work directly. Parse failures are
// silently ignored — titles are a nice-to-have, not required.
func parseNCXTitles(r io.Reader, out map[string]string) {
	doc, err := html.Parse(r)
	if err != nil {
		return
	}

	// Collect navPoints: each has a <navLabel><text>…</text></navLabel> and
	// a <content src="…"/> child.
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && strings.EqualFold(n.Data, "navpoint") {
			var label, src string
			// inner scans this navPoint's subtree for its label and src.
			var inner func(*html.Node)
			inner = func(c *html.Node) {
				if c.Type == html.ElementNode {
					// First <text> found wins as the label.
					if strings.EqualFold(c.Data, "text") && label == "" {
						if c.FirstChild != nil && c.FirstChild.Type == html.TextNode {
							label = strings.TrimSpace(c.FirstChild.Data)
						}
					}
					if strings.EqualFold(c.Data, "content") {
						for _, a := range c.Attr {
							if strings.EqualFold(a.Key, "src") {
								// Strip fragment identifier (#...).
								src = strings.SplitN(a.Val, "#", 2)[0]
							}
						}
					}
				}
				for child := c.FirstChild; child != nil; child = child.NextSibling {
					inner(child)
				}
			}
			inner(n)
			if label != "" && src != "" {
				out[src] = label
			}
		}
		// Keep walking so nested navPoints are also visited.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
}
|
||
|
||
// epubFileContent returns the raw bytes of a file inside the EPUB zip.
|
||
func epubFileContent(zr *zip.Reader, path string) ([]byte, error) {
|
||
f := zipFile(zr, path)
|
||
if f == nil {
|
||
return nil, fmt.Errorf("file %q not in EPUB", path)
|
||
}
|
||
rc, err := f.Open()
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
defer rc.Close()
|
||
return io.ReadAll(rc)
|
||
}
|
||
|
||
// zipFile finds a file by name (case-insensitive) in a zip.Reader.
|
||
func zipFile(zr *zip.Reader, name string) *zip.File {
|
||
nameLower := strings.ToLower(name)
|
||
for _, f := range zr.File {
|
||
if strings.ToLower(f.Name) == nameLower {
|
||
return f
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// htmlToText converts HTML/XHTML content to plain text suitable for storage.
// Block-level elements become newlines, script/style/head subtrees are
// dropped entirely, and runs of blank lines are collapsed to one.
// On a parse error the raw input is returned unchanged (html.Parse is very
// lenient, so this path is rarely taken).
func htmlToText(data []byte) string {
	doc, err := html.Parse(bytes.NewReader(data))
	if err != nil {
		return string(data)
	}

	var sb strings.Builder
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.TextNode {
			text := strings.TrimSpace(n.Data)
			if text != "" {
				sb.WriteString(text)
				sb.WriteByte(' ')
			}
		}
		if n.Type == html.ElementNode {
			switch strings.ToLower(n.Data) {
			case "p", "div", "br", "h1", "h2", "h3", "h4", "h5", "h6", "li", "tr":
				// Block-level: ensure newline before content.
				// (Builder.String() is zero-copy, so peeking the last byte is cheap.)
				if sb.Len() > 0 {
					s := sb.String()
					if s[len(s)-1] != '\n' {
						sb.WriteByte('\n')
					}
				}
			case "script", "style", "head":
				// Skip entirely.
				return
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
		if n.Type == html.ElementNode {
			// Newline after block-level elements ("br" is void — handled above).
			switch strings.ToLower(n.Data) {
			case "p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "li", "tr":
				sb.WriteByte('\n')
			}
		}
	}
	walk(doc)

	// Collapse multiple blank lines.
	lines := strings.Split(sb.String(), "\n")
	var out []string
	blanks := 0
	for _, l := range lines {
		l = strings.TrimSpace(l)
		if l == "" {
			blanks++
			if blanks <= 1 {
				out = append(out, "")
			}
		} else {
			blanks = 0
			out = append(out, l)
		}
	}
	return strings.TrimSpace(strings.Join(out, "\n"))
}
|
||
|
||
// ── Chapter ingestion ─────────────────────────────────────────────────────────
|
||
|
||
// IngestChapters stores extracted chapters for a book.
|
||
// Each chapter is written as a markdown file in the chapters MinIO bucket
|
||
// and its index record is upserted in PocketBase via WriteChapter.
|
||
func (s *Store) IngestChapters(ctx context.Context, slug string, chapters []bookstore.Chapter) error {
|
||
for _, ch := range chapters {
|
||
var mdContent string
|
||
if ch.Title != "" && ch.Title != fmt.Sprintf("Chapter %d", ch.Number) {
|
||
mdContent = fmt.Sprintf("# %s\n\n%s", ch.Title, ch.Content)
|
||
} else {
|
||
mdContent = fmt.Sprintf("# Chapter %d\n\n%s", ch.Number, ch.Content)
|
||
}
|
||
domainCh := domain.Chapter{
|
||
Ref: domain.ChapterRef{Number: ch.Number, Title: ch.Title},
|
||
Text: mdContent,
|
||
}
|
||
if err := s.WriteChapter(ctx, slug, domainCh); err != nil {
|
||
return fmt.Errorf("ingest chapter %d: %w", ch.Number, err)
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// GetImportObjectKey returns the MinIO object key for an uploaded import
// file: the filename placed under the "imports/" prefix.
func GetImportObjectKey(filename string) string {
	return "imports/" + filename
}