libnovel/backend/internal/storage/import.go

package storage

import (
	"archive/zip"
	"bytes"
	"context"
	"fmt"
	"io"
	"os"
	"sort"
	"strconv"
	"strings"

	"github.com/libnovel/backend/internal/bookstore"
	"github.com/libnovel/backend/internal/domain"
	minio "github.com/minio/minio-go/v7"
	"github.com/pdfcpu/pdfcpu/pkg/api"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
	"golang.org/x/net/html"
)

type importer struct {
	mc *minioClient
}

// NewBookImporter creates a BookImporter that reads files from MinIO.
func NewBookImporter(s *Store) bookstore.BookImporter {
	return &importer{mc: s.mc}
}

func (i *importer) Import(ctx context.Context, objectKey, fileType string) ([]bookstore.Chapter, error) {
	if fileType != "pdf" && fileType != "epub" {
		return nil, fmt.Errorf("unsupported file type: %s", fileType)
	}

	obj, err := i.mc.client.GetObject(ctx, "imports", objectKey, minio.GetObjectOptions{})
	if err != nil {
		return nil, fmt.Errorf("get object from minio: %w", err)
	}
	defer obj.Close()

	data, err := io.ReadAll(obj)
	if err != nil {
		return nil, fmt.Errorf("read object: %w", err)
	}

	if fileType == "pdf" {
		return parsePDF(data)
	}
	return parseEPUB(data)
}

// AnalyzeFile parses the given PDF or EPUB data and returns the detected
// chapter count and up to 3 preview lines (first non-empty line of each of
// the first 3 chapters). It is used by the analyze-only endpoint so users
// can preview chapter count before committing the import.
// Note: uses parsePDF which is backed by pdfcpu ExtractContent — fast, no hang risk.
func AnalyzeFile(data []byte, fileType string) (chapterCount int, firstLines []string, err error) {
	var chapters []bookstore.Chapter
	switch fileType {
	case "pdf":
		chapters, err = parsePDF(data)
	case "epub":
		chapters, err = parseEPUB(data)
	default:
		return 0, nil, fmt.Errorf("unsupported file type: %s", fileType)
	}
	if err != nil {
		return 0, nil, err
	}
	chapterCount = len(chapters)
	for i, ch := range chapters {
		if i >= 3 {
			break
		}
		line := strings.TrimSpace(ch.Content)
		if nl := strings.Index(line, "\n"); nl > 0 {
			line = line[:nl]
		}
		if len(line) > 120 {
			line = line[:120] + "…"
		}
		firstLines = append(firstLines, line)
	}
	return chapterCount, firstLines, nil
}


// decryptPDF strips encryption from a PDF using an empty user password.
// Returns the decrypted bytes, or an error if decryption is not possible.
// This handles the common case of "owner-only" encrypted PDFs (copy/print
// restrictions) which use an empty user password and open normally in readers.
func decryptPDF(data []byte) ([]byte, error) {
	conf := model.NewDefaultConfiguration()
	conf.UserPW = ""
	conf.OwnerPW = ""

	var out bytes.Buffer
	err := api.Decrypt(bytes.NewReader(data), &out, conf)
	if err != nil {
		return nil, err
	}
	return out.Bytes(), nil
}

// ParseImportFile parses a PDF or EPUB and returns chapters.
// Unlike AnalyzeFile it respects ctx cancellation so callers can apply a timeout.
// For PDFs it first attempts to strip encryption with an empty password.
func ParseImportFile(ctx context.Context, data []byte, fileType string) ([]bookstore.Chapter, error) {
	type result struct {
		chapters []bookstore.Chapter
		err      error
	}
	ch := make(chan result, 1)
	go func() {
		var chapters []bookstore.Chapter
		var err error
		switch fileType {
		case "pdf":
			chapters, err = parsePDF(data)
		case "epub":
			chapters, err = parseEPUB(data)
		default:
			err = fmt.Errorf("unsupported file type: %s", fileType)
		}
		ch <- result{chapters, err}
	}()
	select {
	case <-ctx.Done():
		return nil, fmt.Errorf("parse timed out: %w", ctx.Err())
	case r := <-ch:
		return r.chapters, r.err
	}
}

// pdfSkipBookmarks lists bookmark titles that are front/back matter, not story chapters.
// These are skipped when building the chapter list.
var pdfSkipBookmarks = map[string]bool{
	"cover": true, "insert": true, "title page": true, "copyright": true,
	"appendix": true, "color insert": true, "color illustrations": true,
}

// parsePDF extracts text from PDF bytes and returns it as a single chapter.
//
// The full readable text is returned as one chapter so the admin can manually
// split it into chapters via the UI using --- markers.
//
// Strategy:
//  1. Decrypt owner-protected PDFs (empty user password).
//  2. Extract raw content streams for every page using pdfcpu ExtractContent.
//  3. Concatenate text from all pages in order, skipping front matter
//     (cover, title page, copyright — typically the first 10 pages).
func parsePDF(data []byte) ([]bookstore.Chapter, error) {
	// Decrypt owner-protected PDFs (empty user password).
	decrypted, err := decryptPDF(data)
	if err == nil {
		data = decrypted
	}

	conf := model.NewDefaultConfiguration()
	conf.UserPW = ""
	conf.OwnerPW = ""

	// Extract all page content streams to a temp directory.
	tmpDir, err := os.MkdirTemp("", "pdf-extract-*")
	if err != nil {
		return nil, fmt.Errorf("create temp dir: %w", err)
	}
	defer os.RemoveAll(tmpDir)

	if err := api.ExtractContent(bytes.NewReader(data), tmpDir, "out", nil, conf); err != nil {
		return nil, fmt.Errorf("extract PDF content: %w", err)
	}

	entries, err := os.ReadDir(tmpDir)
	if err != nil || len(entries) == 0 {
		return nil, fmt.Errorf("PDF has no content pages")
	}

	// Parse page number from filename and build ordered text map.
	pageTexts := make(map[int]string, len(entries))
	maxPage := 0
	for _, e := range entries {
		pageNum := pageNumFromFilename(e.Name())
		if pageNum <= 0 {
			continue
		}
		raw, readErr := os.ReadFile(tmpDir + "/" + e.Name())
		if readErr != nil {
			continue
		}
		pageTexts[pageNum] = fixWin1252(extractTextFromContentStream(raw))
		if pageNum > maxPage {
			maxPage = pageNum
		}
	}

	// Determine front-matter cutoff using bookmarks if available,
	// otherwise skip the first 10 pages (cover/title/copyright).
	bodyStart := 1
	bookmarks, bmErr := api.Bookmarks(bytes.NewReader(data), conf)
	if bmErr == nil {
		for _, bm := range bookmarks {
			title := strings.ToLower(strings.TrimSpace(bm.Title))
			if !pdfSkipBookmarks[title] && bm.PageFrom > 0 {
				// First non-front-matter bookmark — body starts here.
				bodyStart = bm.PageFrom
				break
			}
		}
	} else if maxPage > 10 {
		bodyStart = 11
	}

	// Concatenate all body pages.
	var sb strings.Builder
	for p := bodyStart; p <= maxPage; p++ {
		t := strings.TrimSpace(pageTexts[p])
		if t == "" {
			continue
		}
		sb.WriteString(t)
		sb.WriteString("\n\n")
	}

	text := strings.TrimSpace(sb.String())
	if text == "" {
		return nil, fmt.Errorf("could not extract any text from PDF")
	}

	return []bookstore.Chapter{{
		Number:  1,
		Title:   "Full Text",
		Content: text,
	}}, nil
}

// pageNumFromFilename extracts the page number from a pdfcpu content-stream
// filename like "out_Content_page_42.txt". Returns 0 if not parseable.
func pageNumFromFilename(name string) int {
	// Strip directory prefix and extension.
	base := name
	if idx := strings.LastIndex(base, "/"); idx >= 0 {
		base = base[idx+1:]
	}
	if idx := strings.LastIndex(base, "."); idx >= 0 {
		base = base[:idx]
	}
	// Find last "_" and parse the number after it.
	if idx := strings.LastIndex(base, "_"); idx >= 0 {
		n, err := strconv.Atoi(base[idx+1:])
		if err == nil && n > 0 {
			return n
		}
	}
	return 0
}

// win1252ToUnicode maps the Windows-1252 control range 0x80–0x9F to the
// Unicode characters they actually represent in that encoding.
// Standard Latin-1 maps these bytes to control characters; Win-1252 maps
// them to typographic symbols that appear in publisher PDFs.
var win1252ToUnicode = map[byte]rune{
	0x80: '\u20AC', // €
	0x82: '\u201A', // ‚
	0x83: '\u0192', // ƒ
	0x84: '\u201E', // „
	0x85: '\u2026', // …
	0x86: '\u2020', // †
	0x87: '\u2021', // ‡
	0x88: '\u02C6', // ˆ
	0x89: '\u2030', // ‰
	0x8A: '\u0160', // Š
	0x8B: '\u2039', // ‹
	0x8C: '\u0152', // Œ
	0x8E: '\u017D', // Ž
	0x91: '\u2018', // ' (left single quotation mark)
	0x92: '\u2019', // ' (right single quotation mark / apostrophe)
	0x93: '\u201C', // " (left double quotation mark)
	0x94: '\u201D', // " (right double quotation mark)
	0x95: '\u2022', // • (bullet)
	0x96: '\u2013', // – (en dash)
	0x97: '\u2014', // — (em dash)
	0x98: '\u02DC', // ˜
	0x99: '\u2122', // ™
	0x9A: '\u0161', // š
	0x9B: '\u203A', // ›
	0x9C: '\u0153', // œ
	0x9E: '\u017E', // ž
	0x9F: '\u0178', // Ÿ
}

// fixWin1252 replaces Windows-1252 specific bytes (0x80–0x9F) in a string
// that was decoded as raw Latin-1 bytes with their proper Unicode equivalents.
func fixWin1252(s string) string {
	// Fast path: if no bytes in 0x80–0x9F range, return unchanged.
	needsFix := false
	for i := 0; i < len(s); i++ {
		b := s[i]
		if b >= 0x80 && b <= 0x9F {
			needsFix = true
			break
		}
	}
	if !needsFix {
		return s
	}
	var sb strings.Builder
	sb.Grow(len(s))
	for i := 0; i < len(s); i++ {
		b := s[i]
		if b >= 0x80 && b <= 0x9F {
			if r, ok := win1252ToUnicode[b]; ok {
				sb.WriteRune(r)
				continue
			}
		}
		sb.WriteByte(b)
	}
	return sb.String()
}

// extractTextFromContentStream parses a raw PDF content stream and extracts
// readable text from Tj and TJ operators.
//
// TJ arrays may contain a mix of literal strings (parenthesised) and hex glyph
// arrays. Only the literal strings are decoded — hex arrays require per-font
// ToUnicode CMaps and are skipped. Kerning adjustment numbers inside TJ arrays
// are also ignored (they're just spacing hints).
//
// Line breaks are inserted on ET / Td / TD / T* operators.
func extractTextFromContentStream(stream []byte) string {
	s := string(stream)
	var sb strings.Builder
	i := 0
	n := len(s)
	for i < n {
		// TJ array: [ ... ]TJ — collect all literal strings, skip hex & numbers.
		if s[i] == '[' {
			j := i + 1
			for j < n && s[j] != ']' {
				if s[j] == '(' {
					// Literal string inside TJ array.
					k := j + 1
					depth := 1
					for k < n && depth > 0 {
						if s[k] == '\\' {
							k += 2
							continue
						}
						if s[k] == '(' {
							depth++
						} else if s[k] == ')' {
							depth--
						}
						k++
					}
					lit := pdfUnescapeString(s[j+1 : k-1])
					if hasPrintableASCII(lit) {
						sb.WriteString(lit)
					}
					j = k
					continue
				}
				j++
			}
			// Check if this is a TJ operator (skip whitespace after ']').
			end := j + 1
			for end < n && (s[end] == ' ' || s[end] == '\t' || s[end] == '\r' || s[end] == '\n') {
				end++
			}
			if end+2 <= n && s[end:end+2] == "TJ" && (end+2 == n || !isAlphaNum(s[end+2])) {
				i = end + 2
				continue
			}
			i = j + 1
			continue
		}
		// Single string: (string) Tj
		if s[i] == '(' {
			j := i + 1
			depth := 1
			for j < n && depth > 0 {
				if s[j] == '\\' {
					j += 2
					continue
				}
				if s[j] == '(' {
					depth++
				} else if s[j] == ')' {
					depth--
				}
				j++
			}
			lit := pdfUnescapeString(s[i+1 : j-1])
			if hasPrintableASCII(lit) {
				// Check for Tj operator.
				end := j
				for end < n && (s[end] == ' ' || s[end] == '\t') {
					end++
				}
				if end+2 <= n && s[end:end+2] == "Tj" && (end+2 == n || !isAlphaNum(s[end+2])) {
					sb.WriteString(lit)
					i = end + 2
					continue
				}
			}
			i = j
			continue
		}
		// Detect end of text object (ET) — add a newline.
		if i+2 <= n && s[i:i+2] == "ET" && (i+2 == n || !isAlphaNum(s[i+2])) {
			sb.WriteByte('\n')
			i += 2
			continue
		}
		// Detect Td / TD / T* — newline within text block.
		if i+2 <= n && (s[i:i+2] == "Td" || s[i:i+2] == "TD" || s[i:i+2] == "T*") &&
			(i+2 == n || !isAlphaNum(s[i+2])) {
			sb.WriteByte('\n')
			i += 2
			continue
		}
		i++
	}
	return sb.String()
}

func isAlphaNum(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9') || b == '_'
}

func hasPrintableASCII(s string) bool {
	for _, c := range s {
		if c >= 0x20 && c < 0x7F {
			return true
		}
	}
	return false
}

// pdfUnescapeString handles PDF string escape sequences.
func pdfUnescapeString(s string) string {
	if !strings.ContainsRune(s, '\\') {
		return s
	}
	var sb strings.Builder
	i := 0
	for i < len(s) {
		if s[i] == '\\' && i+1 < len(s) {
			switch s[i+1] {
			case 'n':
				sb.WriteByte('\n')
			case 'r':
				sb.WriteByte('\r')
			case 't':
				sb.WriteByte('\t')
			case '(', ')', '\\':
				sb.WriteByte(s[i+1])
			default:
				// Octal escape \ddd
				if s[i+1] >= '0' && s[i+1] <= '7' {
					end := i + 2
					for end < i+5 && end < len(s) && s[end] >= '0' && s[end] <= '7' {
						end++
					}
					val, _ := strconv.ParseInt(s[i+1:end], 8, 16)
					sb.WriteByte(byte(val))
					i = end
					continue
				}
				sb.WriteByte(s[i+1])
			}
			i += 2
		} else {
			sb.WriteByte(s[i])
			i++
		}
	}
	return sb.String()
}

// ── EPUB parsing ──────────────────────────────────────────────────────────────

func parseEPUB(data []byte) ([]bookstore.Chapter, error) {
	zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return nil, fmt.Errorf("open EPUB zip: %w", err)
	}

	// 1. Read META-INF/container.xml → find rootfile (content.opf path).
	opfPath, err := epubRootfilePath(zr)
	if err != nil {
		return nil, fmt.Errorf("epub container: %w", err)
	}

	// 2. Parse content.opf → spine order of chapter files.
	spineFiles, titleMap, err := epubSpine(zr, opfPath)
	if err != nil {
		return nil, fmt.Errorf("epub spine: %w", err)
	}

	if len(spineFiles) == 0 {
		return nil, fmt.Errorf("EPUB spine is empty")
	}

	// Base directory of the OPF file for resolving relative hrefs.
	opfDir := ""
	if idx := strings.LastIndex(opfPath, "/"); idx >= 0 {
		opfDir = opfPath[:idx+1]
	}

	var chapters []bookstore.Chapter
	chNum := 0
	for i, href := range spineFiles {
		fullPath := opfDir + href
		content, err := epubFileContent(zr, fullPath)
		if err != nil {
			continue
		}
		text := htmlToText(content)
		if strings.TrimSpace(text) == "" {
			continue
		}
		chNum++
		title := titleMap[href]
		if title == "" {
			title = fmt.Sprintf("Chapter %d", chNum)
		}
		_ = i // spine index unused for numbering
		chapters = append(chapters, bookstore.Chapter{
			Number:  chNum,
			Title:   title,
			Content: text,
		})
	}

	if len(chapters) == 0 {
		return nil, fmt.Errorf("no readable chapters found in EPUB")
	}
	return chapters, nil
}

// epubRootfilePath parses META-INF/container.xml and returns the full-path
// of the OPF package document.
func epubRootfilePath(zr *zip.Reader) (string, error) {
	f := zipFile(zr, "META-INF/container.xml")
	if f == nil {
		return "", fmt.Errorf("META-INF/container.xml not found")
	}
	rc, err := f.Open()
	if err != nil {
		return "", err
	}
	defer rc.Close()

	doc, err := html.Parse(rc)
	if err != nil {
		return "", err
	}

	var path string
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && strings.EqualFold(n.Data, "rootfile") {
			for _, a := range n.Attr {
				if strings.EqualFold(a.Key, "full-path") {
					path = a.Val
					return
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	if path == "" {
		return "", fmt.Errorf("rootfile full-path not found in container.xml")
	}
	return path, nil
}

// epubSpine parses the OPF document and returns the spine item hrefs in order,
// plus a map from href → nav title (if available from NCX/NAV).
func epubSpine(zr *zip.Reader, opfPath string) ([]string, map[string]string, error) {
	f := zipFile(zr, opfPath)
	if f == nil {
		return nil, nil, fmt.Errorf("OPF file %q not found in EPUB", opfPath)
	}
	rc, err := f.Open()
	if err != nil {
		return nil, nil, err
	}
	defer rc.Close()

	opfData, err := io.ReadAll(rc)
	if err != nil {
		return nil, nil, err
	}

	// Build id→href map from <manifest>.
	idToHref := make(map[string]string)
	// Also keep a href→navTitle map (populated from NCX later).
	hrefTitle := make(map[string]string)

	// Parse OPF XML with html.Parse (handles malformed XML too).
	doc, _ := html.Parse(bytes.NewReader(opfData))

	var manifestItems []struct{ id, href, mediaType string }
	var spineIdrefs []string
	var ncxID string

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			tag := strings.ToLower(n.Data)
			switch tag {
			case "item":
				var id, href, mt string
				for _, a := range n.Attr {
					switch strings.ToLower(a.Key) {
					case "id":
						id = a.Val
					case "href":
						href = a.Val
					case "media-type":
						mt = a.Val
					}
				}
				if id != "" && href != "" {
					manifestItems = append(manifestItems, struct{ id, href, mediaType string }{id, href, mt})
					idToHref[id] = href
				}
			case "itemref":
				for _, a := range n.Attr {
					if strings.ToLower(a.Key) == "idref" {
						spineIdrefs = append(spineIdrefs, a.Val)
					}
				}
			case "spine":
				for _, a := range n.Attr {
					if strings.ToLower(a.Key) == "toc" {
						ncxID = a.Val
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	// Build ordered spine href list.
	var spineHrefs []string
	for _, idref := range spineIdrefs {
		if href, ok := idToHref[idref]; ok {
			spineHrefs = append(spineHrefs, href)
		}
	}

	// If no explicit spine, fall back to all XHTML items in manifest order.
	if len(spineHrefs) == 0 {
		sort.Slice(manifestItems, func(i, j int) bool {
			return manifestItems[i].href < manifestItems[j].href
		})
		for _, it := range manifestItems {
			mt := strings.ToLower(it.mediaType)
			if strings.Contains(mt, "html") || strings.HasSuffix(strings.ToLower(it.href), ".html") || strings.HasSuffix(strings.ToLower(it.href), ".xhtml") {
				spineHrefs = append(spineHrefs, it.href)
			}
		}
	}

	// Try to get chapter titles from NCX (toc.ncx).
	opfDir := ""
	if idx := strings.LastIndex(opfPath, "/"); idx >= 0 {
		opfDir = opfPath[:idx+1]
	}
	if ncxHref, ok := idToHref[ncxID]; ok {
		ncxPath := opfDir + ncxHref
		if ncxFile := zipFile(zr, ncxPath); ncxFile != nil {
			if ncxRC, err := ncxFile.Open(); err == nil {
				defer ncxRC.Close()
				parseNCXTitles(ncxRC, hrefTitle)
			}
		}
	}

	return spineHrefs, hrefTitle, nil
}

// parseNCXTitles extracts navPoint label→src mappings from a toc.ncx.
func parseNCXTitles(r io.Reader, out map[string]string) {
	doc, err := html.Parse(r)
	if err != nil {
		return
	}

	// Collect navPoints: each has a <navLabel><text>…</text></navLabel> and
	// a <content src="…"/> child.
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && strings.EqualFold(n.Data, "navpoint") {
			var label, src string
			var inner func(*html.Node)
			inner = func(c *html.Node) {
				if c.Type == html.ElementNode {
					if strings.EqualFold(c.Data, "text") && label == "" {
						if c.FirstChild != nil && c.FirstChild.Type == html.TextNode {
							label = strings.TrimSpace(c.FirstChild.Data)
						}
					}
					if strings.EqualFold(c.Data, "content") {
						for _, a := range c.Attr {
							if strings.EqualFold(a.Key, "src") {
								// Strip fragment identifier (#...).
								src = strings.SplitN(a.Val, "#", 2)[0]
							}
						}
					}
				}
				for child := c.FirstChild; child != nil; child = child.NextSibling {
					inner(child)
				}
			}
			inner(n)
			if label != "" && src != "" {
				out[src] = label
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
}

// epubFileContent returns the raw bytes of a file inside the EPUB zip.
func epubFileContent(zr *zip.Reader, path string) ([]byte, error) {
	f := zipFile(zr, path)
	if f == nil {
		return nil, fmt.Errorf("file %q not in EPUB", path)
	}
	rc, err := f.Open()
	if err != nil {
		return nil, err
	}
	defer rc.Close()
	return io.ReadAll(rc)
}

// zipFile finds a file by name (case-insensitive) in a zip.Reader.
func zipFile(zr *zip.Reader, name string) *zip.File {
	nameLower := strings.ToLower(name)
	for _, f := range zr.File {
		if strings.ToLower(f.Name) == nameLower {
			return f
		}
	}
	return nil
}

// htmlToText converts HTML/XHTML content to plain text suitable for storage.
func htmlToText(data []byte) string {
	doc, err := html.Parse(bytes.NewReader(data))
	if err != nil {
		return string(data)
	}

	var sb strings.Builder
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.TextNode {
			text := strings.TrimSpace(n.Data)
			if text != "" {
				sb.WriteString(text)
				sb.WriteByte(' ')
			}
		}
		if n.Type == html.ElementNode {
			switch strings.ToLower(n.Data) {
			case "p", "div", "br", "h1", "h2", "h3", "h4", "h5", "h6", "li", "tr":
				// Block-level: ensure newline before content.
				if sb.Len() > 0 {
					s := sb.String()
					if s[len(s)-1] != '\n' {
						sb.WriteByte('\n')
					}
				}
			case "script", "style", "head":
				// Skip entirely.
				return
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
		if n.Type == html.ElementNode {
			switch strings.ToLower(n.Data) {
			case "p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "li", "tr":
				sb.WriteByte('\n')
			}
		}
	}
	walk(doc)

	// Collapse multiple blank lines.
	lines := strings.Split(sb.String(), "\n")
	var out []string
	blanks := 0
	for _, l := range lines {
		l = strings.TrimSpace(l)
		if l == "" {
			blanks++
			if blanks <= 1 {
				out = append(out, "")
			}
		} else {
			blanks = 0
			out = append(out, l)
		}
	}
	return strings.TrimSpace(strings.Join(out, "\n"))
}

// ── Chapter ingestion ─────────────────────────────────────────────────────────

// IngestChapters stores extracted chapters for a book.
// Each chapter is written as a markdown file in the chapters MinIO bucket
// and its index record is upserted in PocketBase via WriteChapter.
func (s *Store) IngestChapters(ctx context.Context, slug string, chapters []bookstore.Chapter) error {
	for _, ch := range chapters {
		var mdContent string
		if ch.Title != "" && ch.Title != fmt.Sprintf("Chapter %d", ch.Number) {
			mdContent = fmt.Sprintf("# %s\n\n%s", ch.Title, ch.Content)
		} else {
			mdContent = fmt.Sprintf("# Chapter %d\n\n%s", ch.Number, ch.Content)
		}
		domainCh := domain.Chapter{
			Ref:  domain.ChapterRef{Number: ch.Number, Title: ch.Title},
			Text: mdContent,
		}
		if err := s.WriteChapter(ctx, slug, domainCh); err != nil {
			return fmt.Errorf("ingest chapter %d: %w", ch.Number, err)
		}
	}
	return nil
}

// GetImportObjectKey returns the MinIO object key for an uploaded import file.
func GetImportObjectKey(filename string) string {
	return fmt.Sprintf("imports/%s", filename)
}