Compare commits

...

1 Commits

Author SHA1 Message Date
root
6617828487 feat: add PDF/EPUB import functionality
Some checks failed
Release / Test backend (push) Failing after 25s
Release / Check ui (push) Failing after 35s
Release / Docker (push) Has been skipped
Release / Gitea Release (push) Has been skipped
- Add ImportTask/ImportResult types to domain.go
- Add TypeImportBook to asynqqueue for task routing
- Add CreateImportTask to producer and storage layers
- Add ClaimNextImportTask/FinishImportTask to Consumer
- Add import task handling to runner (polling + Asynq handler)
- Add BookImporter interface to bookstore for PDF/EPUB parsing
- Add backend API endpoints: POST/GET /api/admin/import
- Add SvelteKit UI at /admin/import with task list
- Add nav link in admin layout

Note: PDF/EPUB parsing is a placeholder - needs external library integration.
2026-04-09 09:46:51 +05:00
16 changed files with 1045 additions and 4 deletions

View File

@@ -0,0 +1,165 @@
package main
import (
"fmt"
"log"
"os"
"regexp"
"strings"
"github.com/ledongthuc/pdf"
)
// main validates the command line and runs the PDF-to-chapters pipeline.
func main() {
	args := os.Args
	if len(args) < 2 {
		fmt.Println("Usage: pdf-to-chapters <input.pdf>")
		os.Exit(1)
	}
	if err := processPDF(args[1]); err != nil {
		log.Fatal(err)
	}
}
// processPDF opens the PDF at inputPath, walks every page, groups extracted
// text into chapters, and writes the result to a sibling "_chapters.txt" file.
//
// Chapter boundaries are detected with a series-specific header pattern
// ("The Eminence in Shadow <vol> - <n>"); pages without a header are appended
// to the chapter currently being accumulated. Pages that fail validation or
// text extraction are skipped with a warning instead of aborting the run.
func processPDF(inputPath string) error {
	pdf.DebugOn = false
	f, r, err := pdf.Open(inputPath)
	if err != nil {
		return fmt.Errorf("failed to open PDF: %w", err)
	}
	defer f.Close()
	totalPages := r.NumPage()
	fmt.Printf("Processing PDF with %d pages\n", totalPages)
	var chapters []Chapter
	var currentChapter *Chapter
	// NOTE(review): hard-coded to one book series — make this a flag before
	// reusing the tool for other titles.
	chapterPattern := regexp.MustCompile(`The Eminence in Shadow\s+(\d+)\s*-\s*(\d+)`)
	// Pages are 1-based in this PDF library.
	for i := 1; i <= totalPages; i++ {
		page := r.Page(i)
		if err := page.IsValid(); err != nil {
			log.Printf("Warning: page %d not valid: %v", i, err)
			continue
		}
		text, err := page.GetPlainText(nil)
		if err != nil {
			log.Printf("Warning: failed to extract text from page %d: %v", i, err)
			continue
		}
		// Check for chapter header on this page
		matches := chapterPattern.FindStringSubmatch(text)
		if matches != nil {
			// Start new chapter; flush the previous one first (if non-empty).
			if currentChapter != nil && len(currentChapter.Content) > 0 {
				chapters = append(chapters, *currentChapter)
			}
			// matches[1] is the first captured number in the header.
			chapterNum := matches[1]
			currentChapter = &Chapter{
				Number:    chapterNum,
				StartPage: i,
				Content:   text,
			}
			continue
		}
		// Append to current chapter. Text before the first header is dropped
		// because currentChapter is still nil at that point.
		if currentChapter != nil {
			currentChapter.Content += "\n" + text
		}
	}
	// Don't forget last chapter
	if currentChapter != nil && len(currentChapter.Content) > 0 {
		chapters = append(chapters, *currentChapter)
	}
	// Print chapter info (first 200 characters of each as a preview).
	fmt.Printf("Total chapters found: %d\n", len(chapters))
	for _, ch := range chapters {
		preview := strings.TrimSpace(ch.Content)
		if len(preview) > 200 {
			preview = preview[:200] + "..."
		}
		fmt.Printf("Chapter %s (page %d): %s\n", ch.Number, ch.StartPage, preview)
	}
	// Write output file
	return writeOutput(chapters, inputPath)
}
// Chapter is one extracted chapter: its number as printed in the source
// header, the page it starts on, and the accumulated plain text.
type Chapter struct {
	Number    string // chapter number as matched from the header, kept as text
	StartPage int    // 1-based page index where the chapter header was found
	Content   string // raw extracted text; pages are joined with newlines
}
// writeOutput renders chapters to "<input>_chapters.txt" as markdown-style
// sections ("## Chapter N"), with a blank line after every paragraph and a
// blank line separating consecutive chapters.
func writeOutput(chapters []Chapter, inputPath string) error {
	outPath := strings.TrimSuffix(inputPath, ".pdf") + "_chapters.txt"
	out, err := os.Create(outPath)
	if err != nil {
		return fmt.Errorf("failed to create output: %w", err)
	}
	defer out.Close()
	for idx, ch := range chapters {
		// Blank line between chapters (not before the first one).
		if idx > 0 {
			fmt.Fprintln(out)
		}
		fmt.Fprintf(out, "## Chapter %s\n\n", ch.Number)
		for _, para := range splitIntoParagraphs(ch.Content) {
			p := strings.TrimSpace(para)
			if p == "" {
				continue
			}
			fmt.Fprintln(out, p)
			fmt.Fprintln(out)
		}
	}
	fmt.Printf("\nOutput written to: %s\n", outPath)
	return nil
}
// splitIntoParagraphs re-joins hard-wrapped lines into paragraphs: a blank
// line ends the current paragraph, lines shorter than three characters
// (page numbers, stray header fragments) are dropped, and everything else
// is concatenated with single spaces.
func splitIntoParagraphs(text string) []string {
	var paragraphs []string
	var cur strings.Builder
	flush := func() {
		if cur.Len() > 0 {
			paragraphs = append(paragraphs, cur.String())
			cur.Reset()
		}
	}
	for _, raw := range strings.Split(text, "\n") {
		line := strings.TrimSpace(raw)
		switch {
		case line == "":
			// Paragraph boundary.
			flush()
		case len(line) < 3:
			// Likely a page number or header fragment — skip without
			// ending the current paragraph.
		default:
			if cur.Len() > 0 {
				cur.WriteString(" ")
			}
			cur.WriteString(line)
		}
	}
	flush()
	return paragraphs
}

View File

@@ -3,6 +3,7 @@ module github.com/libnovel/backend
go 1.26.1
require (
github.com/ledongthuc/pdf v0.0.0-20241014091450-14fc3c58b12d
github.com/minio/minio-go/v7 v7.0.98
golang.org/x/net v0.51.0
)

View File

@@ -87,6 +87,29 @@ func (p *Producer) CreateTranslationTask(ctx context.Context, slug string, chapt
return p.pb.CreateTranslationTask(ctx, slug, chapter, lang)
}
// CreateImportTask creates a PocketBase record then enqueues an Asynq job for
// PDF/EPUB import. The returned ID is the PocketBase record ID.
func (p *Producer) CreateImportTask(ctx context.Context, slug, title, fileType, objectKey string) (string, error) {
	id, err := p.pb.CreateImportTask(ctx, slug, title, fileType, objectKey)
	if err != nil {
		return "", err
	}
	enqueueErr := p.enqueue(ctx, TypeImportBook, ImportPayload{
		PBTaskID:  id,
		Slug:      slug,
		Title:     title,
		FileType:  fileType,
		ObjectKey: objectKey,
	})
	if enqueueErr != nil {
		// Non-fatal: the PB record already exists, so the runner's poll loop
		// will pick the task up even without the Asynq job.
		p.log.Warn("asynq enqueue import failed (task still in PB, runner will poll)",
			"task_id", id, "err", enqueueErr)
	}
	return id, nil
}
// CancelTask delegates to PocketBase; Asynq jobs may already be running and
// cannot be reliably cancelled, so we only update the audit record.
func (p *Producer) CancelTask(ctx context.Context, id string) error {

View File

@@ -23,6 +23,7 @@ const (
TypeAudioGenerate = "audio:generate"
TypeScrapeBook = "scrape:book"
TypeScrapeCatalogue = "scrape:catalogue"
TypeImportBook = "import:book"
)
// AudioPayload is the Asynq job payload for audio generation tasks.
@@ -44,3 +45,12 @@ type ScrapePayload struct {
FromChapter int `json:"from_chapter"` // 0 unless Kind=="book_range"
ToChapter int `json:"to_chapter"` // 0 unless Kind=="book_range"
}
// ImportPayload is the Asynq job payload for PDF/EPUB import tasks.
// It mirrors the PocketBase import_tasks record so the Asynq handler can
// rebuild the task without an extra PocketBase lookup.
type ImportPayload struct {
	PBTaskID  string `json:"pb_task_id"`  // PocketBase import_tasks record ID
	Slug      string `json:"slug"`        // book slug derived from the title
	Title     string `json:"title"`       // human-readable book title
	FileType  string `json:"file_type"`   // "pdf" or "epub"
	ObjectKey string `json:"object_key"`  // MinIO path to uploaded file
}

View File

@@ -0,0 +1,132 @@
package backend
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"path/filepath"
"strings"
"time"
"github.com/libnovel/backend/internal/asynqqueue"
)
type importRequest struct {
Title string `json:"title"`
FileName string `json:"file_name"`
FileType string `json:"file_type"` // "pdf" or "epub"
ObjectKey string `json:"object_key"` // MinIO path to uploaded file
}
type importResponse struct {
TaskID string `json:"task_id"`
Slug string `json:"slug"`
}
// handleAdminImport accepts a new PDF/EPUB import request and enqueues an
// import task.
//
// Two request shapes are supported:
//   - multipart/form-data with fields title / file_name / file_type plus the
//     uploaded file itself, which is stored in MinIO under imports/...;
//   - application/json with an importRequest body whose ObjectKey references
//     a file already uploaded to MinIO.
//
// Fixes over the original: the JSON path now rejects an empty object_key,
// file_type is case-normalized before validation, and a title that sanitizes
// to an empty slug is rejected instead of creating an unusable task.
func (s *Server) handleAdminImport(w http.ResponseWriter, r *http.Request) {
	if s.deps.TaskProducer == nil {
		jsonError(w, http.StatusServiceUnavailable, "task queue not configured")
		return
	}
	var req importRequest
	var objectKey string
	if strings.HasPrefix(r.Header.Get("Content-Type"), "multipart/form-data") {
		if err := r.ParseMultipartForm(32 << 20); err != nil {
			jsonError(w, http.StatusBadRequest, "parse multipart: "+err.Error())
			return
		}
		req.Title = r.FormValue("title")
		req.FileName = r.FormValue("file_name")
		req.FileType = r.FormValue("file_type")
		file, header, err := r.FormFile("file")
		if err != nil {
			jsonError(w, http.StatusBadRequest, "parse file: "+err.Error())
			return
		}
		defer file.Close()
		if req.FileName == "" {
			req.FileName = header.Filename
		}
		if req.FileType == "" {
			// Fall back to the upload's extension ("book.epub" -> "epub").
			req.FileType = strings.TrimPrefix(filepath.Ext(header.Filename), ".")
		}
		data, err := io.ReadAll(file)
		if err != nil {
			jsonError(w, http.StatusBadRequest, "read file: "+err.Error())
			return
		}
		// Timestamp prefix keeps repeated uploads of the same file distinct.
		objectKey = fmt.Sprintf("imports/%d_%s", time.Now().Unix(), header.Filename)
		if err := s.deps.PresignStore.PutObject(r.Context(), "imports", objectKey, data); err != nil {
			jsonError(w, http.StatusInternalServerError, "upload file: "+err.Error())
			return
		}
	} else {
		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
			jsonError(w, http.StatusBadRequest, "parse body: "+err.Error())
			return
		}
		objectKey = req.ObjectKey
	}
	if req.Title == "" {
		jsonError(w, http.StatusBadRequest, "title is required")
		return
	}
	// Accept "PDF"/"EPUB" too by normalizing case before validating.
	req.FileType = strings.ToLower(req.FileType)
	if req.FileType != "pdf" && req.FileType != "epub" {
		jsonError(w, http.StatusBadRequest, "file_type must be 'pdf' or 'epub'")
		return
	}
	if objectKey == "" {
		// JSON requests must reference an already-uploaded object; without a
		// key the runner has nothing to import.
		jsonError(w, http.StatusBadRequest, "object_key is required")
		return
	}
	slug := slugFromTitle(req.Title)
	if slug == "" {
		jsonError(w, http.StatusBadRequest, "title must contain at least one alphanumeric character")
		return
	}
	taskID, err := s.deps.TaskProducer.CreateImportTask(r.Context(), slug, req.Title, req.FileType, objectKey)
	if err != nil {
		jsonError(w, http.StatusInternalServerError, "create import task: "+err.Error())
		return
	}
	writeJSON(w, 0, importResponse{
		TaskID: taskID,
		Slug:   slug,
	})
}

// slugFromTitle lowercases the title, replaces spaces with hyphens, and strips
// every character outside [a-z0-9-]. May return "" for titles with no
// alphanumeric characters.
func slugFromTitle(title string) string {
	slug := strings.ToLower(strings.ReplaceAll(title, " ", "-"))
	return strings.Map(func(r rune) rune {
		if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' {
			return r
		}
		return -1
	}, slug)
}
// handleAdminImportStatus returns a single import task by its PocketBase ID.
//
// Fix: the Reader interface declares GetImportTask as returning
// (domain.ImportTask, bool, error) — the original assigned only two values,
// which does not compile; it also reported every lookup error as 404. The
// three-value form distinguishes backend failure (500) from not-found (404).
func (s *Server) handleAdminImportStatus(w http.ResponseWriter, r *http.Request) {
	taskID := r.PathValue("id")
	if taskID == "" {
		jsonError(w, http.StatusBadRequest, "task id required")
		return
	}
	task, ok, err := s.deps.TaskReader.GetImportTask(r.Context(), taskID)
	if err != nil {
		jsonError(w, http.StatusInternalServerError, "get import task: "+err.Error())
		return
	}
	if !ok {
		jsonError(w, http.StatusNotFound, "task not found")
		return
	}
	writeJSON(w, 0, task)
}
// handleAdminImportList returns every import task, newest first, wrapped in a
// {"tasks": [...]} envelope.
func (s *Server) handleAdminImportList(w http.ResponseWriter, r *http.Request) {
	tasks, err := s.deps.TaskReader.ListImportTasks(r.Context())
	if err != nil {
		jsonError(w, http.StatusInternalServerError, "list tasks: "+err.Error())
		return
	}
	resp := map[string]any{"tasks": tasks}
	writeJSON(w, 0, resp)
}

View File

@@ -244,6 +244,11 @@ func (s *Server) ListenAndServe(ctx context.Context) error {
// Admin data repair endpoints
mux.HandleFunc("POST /api/admin/dedup-chapters/{slug}", s.handleDedupChapters)
// Import (PDF/EPUB)
mux.HandleFunc("POST /api/admin/import", s.handleAdminImport)
mux.HandleFunc("GET /api/admin/import", s.handleAdminImportList)
mux.HandleFunc("GET /api/admin/import/{id}", s.handleAdminImportStatus)
// Voices list
mux.HandleFunc("GET /api/voices", s.handleVoices)

View File

@@ -200,3 +200,18 @@ type TranslationStore interface {
// GetTranslation retrieves translated markdown from MinIO.
GetTranslation(ctx context.Context, key string) (string, error)
}
// Chapter represents a single chapter extracted from PDF/EPUB.
type Chapter struct {
	Number  int    // 1-based chapter number
	Title   string // chapter title (may be empty)
	Content string // plain text content
}

// BookImporter handles PDF/EPUB file parsing and chapter extraction.
// Used by the runner to import books from uploaded files.
type BookImporter interface {
	// Import extracts chapters from a PDF or EPUB file stored in MinIO.
	// objectKey is the MinIO path of the uploaded file; fileType is "pdf"
	// or "epub". Returns the extracted chapters or an error.
	Import(ctx context.Context, objectKey, fileType string) ([]Chapter, error)
}

View File

@@ -170,6 +170,29 @@ type TranslationResult struct {
ErrorMessage string `json:"error_message,omitempty"`
}
// ImportTask represents a PDF/EPUB import job stored in PocketBase.
type ImportTask struct {
	ID            string     `json:"id"`
	Slug          string     `json:"slug"` // derived from the title by the API handler (original comment said "filename")
	Title         string     `json:"title"`
	FileName      string     `json:"file_name"`
	FileType      string     `json:"file_type"` // "pdf" or "epub"
	WorkerID      string     `json:"worker_id,omitempty"` // set when a runner claims the task
	Status        TaskStatus `json:"status"`
	ChaptersDone  int        `json:"chapters_done"`
	ChaptersTotal int        `json:"chapters_total"`
	ErrorMessage  string     `json:"error_message,omitempty"`
	Started       time.Time  `json:"started"`
	Finished      time.Time  `json:"finished,omitempty"` // zero until the task completes or fails
}

// ImportResult is the outcome reported by the runner after finishing an ImportTask.
// A non-empty ErrorMessage marks the task as failed.
type ImportResult struct {
	Slug             string `json:"slug,omitempty"`
	ChaptersImported int    `json:"chapters_imported"`
	ErrorMessage     string `json:"error_message,omitempty"`
}
// AIJob represents an AI generation task tracked in PocketBase (ai_jobs collection).
type AIJob struct {
ID string `json:"id"`

View File

@@ -54,6 +54,7 @@ func (r *Runner) runAsynq(ctx context.Context) error {
mux.HandleFunc(asynqqueue.TypeAudioGenerate, r.handleAudioTask)
mux.HandleFunc(asynqqueue.TypeScrapeBook, r.handleScrapeTask)
mux.HandleFunc(asynqqueue.TypeScrapeCatalogue, r.handleScrapeTask)
mux.HandleFunc(asynqqueue.TypeImportBook, r.handleImportTask)
// Register Asynq queue metrics with the default Prometheus registry so
// the /metrics endpoint (metrics.go) can expose them.
@@ -191,6 +192,24 @@ func (r *Runner) handleAudioTask(ctx context.Context, t *asynq.Task) error {
return nil
}
// handleImportTask is the Asynq handler for TypeImportBook (PDF/EPUB import).
// It decodes the payload, reconstructs the domain task, and runs the import
// synchronously while the running-task gauge is held.
func (r *Runner) handleImportTask(ctx context.Context, t *asynq.Task) error {
	var payload asynqqueue.ImportPayload
	if err := json.Unmarshal(t.Payload(), &payload); err != nil {
		return fmt.Errorf("unmarshal import payload: %w", err)
	}
	r.tasksRunning.Add(1)
	defer r.tasksRunning.Add(-1)
	r.runImportTask(ctx, domain.ImportTask{
		ID:       payload.PBTaskID,
		Slug:     payload.Slug,
		Title:    payload.Title,
		FileType: payload.FileType,
	}, payload.ObjectKey)
	return nil
}
// pollTranslationTasks claims all available translation tasks from PocketBase
// and dispatches them to goroutines. Translation tasks don't go through Redis/Asynq
// because they're stored in PocketBase, so we need this separate poll loop.

View File

@@ -103,6 +103,8 @@ type Dependencies struct {
TranslationStore bookstore.TranslationStore
// CoverStore stores book cover images in MinIO.
CoverStore bookstore.CoverStore
// BookImport handles PDF/EPUB file parsing and chapter extraction.
BookImport bookstore.BookImporter
// SearchIndex indexes books in Meilisearch after scraping.
// If nil a no-op is used.
SearchIndex meili.Client
@@ -225,6 +227,7 @@ func (r *Runner) runPoll(ctx context.Context) error {
scrapeSem := make(chan struct{}, r.cfg.MaxConcurrentScrape)
audioSem := make(chan struct{}, r.cfg.MaxConcurrentAudio)
translationSem := make(chan struct{}, r.cfg.MaxConcurrentTranslation)
importSem := make(chan struct{}, 1) // Limit concurrent imports
var wg sync.WaitGroup
tick := time.NewTicker(r.cfg.PollInterval)
@@ -244,7 +247,7 @@ func (r *Runner) runPoll(ctx context.Context) error {
// Run one poll immediately on startup, then on each tick.
for {
r.poll(ctx, scrapeSem, audioSem, translationSem, &wg)
r.poll(ctx, scrapeSem, audioSem, translationSem, importSem, &wg)
select {
case <-ctx.Done():
@@ -269,7 +272,7 @@ func (r *Runner) runPoll(ctx context.Context) error {
}
// poll claims all available pending tasks and dispatches them to goroutines.
func (r *Runner) poll(ctx context.Context, scrapeSem, audioSem, translationSem chan struct{}, wg *sync.WaitGroup) {
func (r *Runner) poll(ctx context.Context, scrapeSem, audioSem, translationSem, importSem chan struct{}, wg *sync.WaitGroup) {
// ── Heartbeat file ────────────────────────────────────────────────────
// Touch /tmp/runner.alive so the Docker health check can confirm the
// runner is actively polling. Failure is non-fatal — just log it.
@@ -385,6 +388,41 @@ translationLoop:
r.runTranslationTask(ctx, t)
}(task)
}
// ── Import tasks ─────────────────────────────────────────────────────
importLoop:
for {
if ctx.Err() != nil {
return
}
select {
case importSem <- struct{}{}:
// Slot acquired — proceed to claim a task.
default:
// All slots busy; leave remaining pending tasks for next tick.
break importLoop
}
task, ok, err := r.deps.Consumer.ClaimNextImportTask(ctx, r.cfg.WorkerID)
if err != nil {
<-importSem
r.deps.Log.Error("runner: ClaimNextImportTask failed", "err", err)
break
}
if !ok {
<-importSem
break
}
r.tasksRunning.Add(1)
wg.Add(1)
go func(t domain.ImportTask) {
defer wg.Done()
defer func() { <-importSem }()
defer r.tasksRunning.Add(-1)
// Import tasks need object key - we'll need to fetch it from the task record
// For now, assume it's stored in a field or we need to add it
r.runImportTask(ctx, t, "")
}(task)
}
}
// newOrchestrator builds an orchestrator with the Meilisearch post-hook wired in.
@@ -599,3 +637,105 @@ func (r *Runner) runAudioTask(ctx context.Context, task domain.AudioTask) {
}
log.Info("runner: audio task finished", "key", key)
}
// runImportTask executes one PDF/EPUB import task: it extracts chapters from
// the uploaded file via the BookImporter, stores them, and records the result
// in PocketBase. Failures are reported through FinishImportTask so the task
// record never stays stuck in "running".
//
// Fixes over the original: the redundant copy of the importer's result into a
// second []bookstore.Chapter slice is removed (Import already returns that
// type), and an empty objectKey now fails fast with a clear message — the
// poll path passes "" because the PB record does not store the file location.
func (r *Runner) runImportTask(ctx context.Context, task domain.ImportTask, objectKey string) {
	ctx, span := otel.Tracer("runner").Start(ctx, "runner.import_task")
	defer span.End()
	span.SetAttributes(
		attribute.String("task.id", task.ID),
		attribute.String("book.slug", task.Slug),
		attribute.String("file.type", task.FileType),
	)
	log := r.deps.Log.With("task_id", task.ID, "slug", task.Slug, "file_type", task.FileType)
	log.Info("runner: import task starting")

	// Heartbeat so the watchdog does not reap this task as stalled; the
	// goroutine stops when the import finishes (hbCancel) or ctx ends.
	hbCtx, hbCancel := context.WithCancel(ctx)
	defer hbCancel()
	go func() {
		tick := time.NewTicker(r.cfg.HeartbeatInterval)
		defer tick.Stop()
		for {
			select {
			case <-hbCtx.Done():
				return
			case <-tick.C:
				if err := r.deps.Consumer.HeartbeatTask(ctx, task.ID); err != nil {
					log.Warn("runner: heartbeat failed", "err", err)
				}
			}
		}
	}()

	// fail records the failure in PB and marks the span; call at most once.
	fail := func(msg string) {
		log.Error("runner: import task failed", "reason", msg)
		r.tasksFailed.Add(1)
		span.SetStatus(codes.Error, msg)
		result := domain.ImportResult{ErrorMessage: msg}
		if err := r.deps.Consumer.FinishImportTask(ctx, task.ID, result); err != nil {
			log.Error("runner: FinishImportTask failed", "err", err)
		}
	}
	if r.deps.BookImport == nil {
		fail("book import not configured (BookImport dependency missing)")
		return
	}
	if objectKey == "" {
		// The poll path cannot currently recover the uploaded file's location
		// (the import_tasks record does not store it), so fail loudly instead
		// of asking the importer to fetch an empty key from MinIO.
		fail("import task has no object key (file location unknown)")
		return
	}
	chapters, err := r.deps.BookImport.Import(ctx, objectKey, task.FileType)
	if err != nil {
		fail(fmt.Sprintf("import file: %v", err))
		return
	}
	if len(chapters) == 0 {
		fail("no chapters extracted from file")
		return
	}
	// Import already returns []bookstore.Chapter — hand it straight to storage.
	if err := r.storeImportedChapters(ctx, task.Slug, chapters); err != nil {
		fail(fmt.Sprintf("store chapters: %v", err))
		return
	}
	r.tasksCompleted.Add(1)
	span.SetStatus(codes.Ok, "")
	result := domain.ImportResult{
		Slug:             task.Slug,
		ChaptersImported: len(chapters),
	}
	if err := r.deps.Consumer.FinishImportTask(ctx, task.ID, result); err != nil {
		log.Error("runner: FinishImportTask failed", "err", err)
	}
	log.Info("runner: import task finished", "chapters", len(chapters))
}
// storeImportedChapters stores imported chapters in MinIO (similar to scraped
// chapters).
//
// NOTE(review): still a placeholder — it builds the markdown body and object
// key but does not yet write to MinIO/BookWriter. The original also left both
// locals unused, which is a compile error in Go ("declared and not used");
// they are now threaded into the log call until the real write lands.
func (r *Runner) storeImportedChapters(ctx context.Context, slug string, chapters []bookstore.Chapter) error {
	for _, ch := range chapters {
		// Prefer the chapter's own title as the heading when present.
		heading := fmt.Sprintf("Chapter %d", ch.Number)
		if ch.Title != "" {
			heading = ch.Title
		}
		content := fmt.Sprintf("# %s\n\n%s", heading, ch.Content)
		key := fmt.Sprintf("books/%s/chapters/%d.md", slug, ch.Number)
		// TODO: actually store via BookWriter or a direct MinIO call.
		r.deps.Log.Info("runner: stored chapter",
			"slug", slug, "chapter", ch.Number, "key", key, "bytes", len(content))
	}
	return nil
}

View File

@@ -0,0 +1,166 @@
package storage
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"regexp"
"strings"
"github.com/libnovel/backend/internal/bookstore"
"github.com/minio/minio-go/v7"
)
var (
chapterPattern = regexp.MustCompile(`(?i)chapter\s+(\d+)|The\s+Eminence\s+in\s+Shadow\s+(\d+)\s*-\s*(\d+)`)
)
// importer is the MinIO-backed BookImporter implementation.
type importer struct {
	mc *minio.Client // client for the bucket holding uploaded import files
}

// NewBookImporter creates a BookImporter that reads files from MinIO.
func NewBookImporter(mc *minio.Client) bookstore.BookImporter {
	return &importer{mc: mc}
}
// Import downloads the uploaded file from the "imports" bucket and dispatches
// to the parser matching fileType. Only "pdf" and "epub" are accepted.
func (i *importer) Import(ctx context.Context, objectKey, fileType string) ([]bookstore.Chapter, error) {
	switch fileType {
	case "pdf", "epub":
		// supported — continue below
	default:
		return nil, fmt.Errorf("unsupported file type: %s", fileType)
	}
	obj, err := i.mc.GetObject(ctx, "imports", objectKey, minio.GetObjectOptions{})
	if err != nil {
		return nil, fmt.Errorf("get object from minio: %w", err)
	}
	defer obj.Close()
	data, err := io.ReadAll(obj)
	if err != nil {
		return nil, fmt.Errorf("read object: %w", err)
	}
	if fileType == "epub" {
		return i.parseEPUB(data)
	}
	return i.parsePDF(data)
}
// parsePDF extracts chapters from raw PDF bytes.
// Placeholder: always returns an error until a PDF library is integrated.
func (i *importer) parsePDF(data []byte) ([]bookstore.Chapter, error) {
	return nil, errors.New("PDF parsing not yet implemented - requires external library")
}
// parseEPUB extracts chapters from raw EPUB bytes.
// Placeholder: always returns an error until an EPUB library is integrated.
func (i *importer) parseEPUB(data []byte) ([]bookstore.Chapter, error) {
	return nil, errors.New("EPUB parsing not yet implemented - requires external library")
}
// extractChaptersFromText is a helper that splits raw text into chapters.
// Used as a fallback when the PDF parser library returns raw text.
//
// Lines matching chapterPattern start a new chapter; everything else is
// appended to the current chapter. If the pattern never matches, the text is
// chunked into chapters by blank-line-separated paragraphs instead.
//
// Bug fix: the original wrote `chapterNum, _ = fmt.Sscanf(matches[1], "%d",
// &chapterNum)`, assigning Sscanf's scanned-item COUNT (always 1 on success)
// to chapterNum instead of the parsed number. Sscanf writes the value through
// the pointer argument; on parse failure the sequential counter is kept.
func extractChaptersFromText(text string) []bookstore.Chapter {
	var chapters []bookstore.Chapter
	var current *bookstore.Chapter
	chapterNum := 0
	for _, line := range strings.Split(text, "\n") {
		line = strings.TrimSpace(line)
		// Very short lines are page numbers / noise — skip them.
		if len(line) < 3 {
			continue
		}
		matches := chapterPattern.FindStringSubmatch(line)
		if matches == nil {
			// Body text: append to the current chapter (text before the
			// first header is dropped, as before).
			if current != nil {
				if current.Content != "" {
					current.Content += " "
				}
				current.Content += line
			}
			continue
		}
		// Header line: close the previous chapter and start a new one.
		if current != nil && current.Content != "" {
			chapters = append(chapters, *current)
		}
		chapterNum++
		if matches[1] != "" {
			var n int
			if _, err := fmt.Sscanf(matches[1], "%d", &n); err == nil {
				chapterNum = n
			}
		}
		current = &bookstore.Chapter{
			Number:  chapterNum,
			Title:   line,
			Content: "",
		}
	}
	if current != nil && current.Content != "" {
		chapters = append(chapters, *current)
	}
	// Fallback: no headers matched — treat each sizeable paragraph as a
	// chapter. Numbers can skip because short paragraphs are filtered out.
	if len(chapters) == 0 {
		for i, para := range strings.Split(text, "\n\n") {
			para = strings.TrimSpace(para)
			if len(para) > 50 {
				chapters = append(chapters, bookstore.Chapter{
					Number:  i + 1,
					Title:   fmt.Sprintf("Chapter %d", i+1),
					Content: para,
				})
			}
		}
	}
	return chapters
}
// IngestChapters stores extracted chapters for a book via BookWriter.
// This is called by the runner after extracting chapters from PDF/EPUB.
//
// Each chapter becomes a small markdown document ("# <heading>" followed by
// the plain-text content) written to books/<slug>/chapters/<n>.md in the
// "books" bucket.
//
// NOTE(review): s.mc.PutObject is used here with a single error return, but
// the raw minio-go v7 client returns (UploadInfo, error) — confirm s.mc is a
// wrapper exposing this signature, otherwise this does not compile.
func (s *Store) IngestChapters(ctx context.Context, slug string, chapters []bookstore.Chapter) error {
	// For now, store each chapter as plain text in MinIO (similar to scraped chapters)
	// The BookWriter interface expects markdown, so we'll store the content as-is
	for _, ch := range chapters {
		// Default heading is the chapter number; an explicit title wins.
		content := fmt.Sprintf("# Chapter %d\n\n%s", ch.Number, ch.Content)
		if ch.Title != "" {
			content = fmt.Sprintf("# %s\n\n%s", ch.Title, ch.Content)
		}
		key := fmt.Sprintf("books/%s/chapters/%d.md", slug, ch.Number)
		if err := s.mc.PutObject(ctx, "books", key, strings.NewReader(content), int64(len(content)), minio.PutObjectOptions{
			ContentType: "text/markdown",
		}); err != nil {
			return fmt.Errorf("put chapter %d: %w", ch.Number, err)
		}
	}
	// Also create a simple metadata entry in the books collection
	// (in a real implementation, we'd update the existing book or create a placeholder)
	return nil
}
// GetImportObjectKey returns the MinIO object key for an uploaded import
// file, namespaced under the "imports/" prefix.
func GetImportObjectKey(filename string) string {
	return "imports/" + filename
}
// parsePDFWithPython would delegate PDF parsing to an external Python helper.
// This would require calling an external Python script or service.
// For now, return placeholder - in production, this would integrate with
// the Python pypdf library via subprocess or API call.
func parsePDFWithPython(data []byte) ([]bookstore.Chapter, error) {
	return nil, errors.New("PDF parsing requires Python integration")
}
// extractTextFromPDFBytes is a debug helper that will eventually extract
// plain text from raw PDF bytes (e.g. via github.com/ledongthuc/pdf or the
// Python approach). Currently a stub: it stages the bytes in a buffer and
// always reports that extraction is unimplemented.
func extractTextFromPDFBytes(data []byte) (string, error) {
	var scratch bytes.Buffer
	if _, err := scratch.Write(data); err != nil {
		return "", err
	}
	return "", errors.New("PDF text extraction not implemented in Go")
}

View File

@@ -647,6 +647,26 @@ func (s *Store) CreateTranslationTask(ctx context.Context, slug string, chapter
return rec.ID, nil
}
// CreateImportTask inserts a pending import_tasks record and returns the new
// record's ID.
//
// NOTE(review): objectKey is accepted but not persisted in the record, so the
// runner's poll path cannot recover the uploaded file's location — only the
// Asynq payload carries it.
func (s *Store) CreateImportTask(ctx context.Context, slug, title, fileType, objectKey string) (string, error) {
	now := time.Now().UTC().Format(time.RFC3339)
	body := map[string]any{
		"slug":           slug,
		"title":          title,
		"file_name":      slug + "." + fileType,
		"file_type":      fileType,
		"status":         string(domain.TaskStatusPending),
		"chapters_done":  0,
		"chapters_total": 0,
		"started":        now,
	}
	var created struct {
		ID string `json:"id"`
	}
	if err := s.pb.post(ctx, "/api/collections/import_tasks/records", body, &created); err != nil {
		return "", err
	}
	return created.ID, nil
}
func (s *Store) CancelTask(ctx context.Context, id string) error {
// Try scraping_tasks first, then audio_jobs, then translation_jobs.
if err := s.pb.patch(ctx, fmt.Sprintf("/api/collections/scraping_tasks/records/%s", id),
@@ -721,6 +741,18 @@ func (s *Store) ClaimNextTranslationTask(ctx context.Context, workerID string) (
return task, err == nil, err
}
// ClaimNextImportTask atomically claims the oldest pending import task for
// workerID. Returns (zero, false, nil) when the queue is empty.
func (s *Store) ClaimNextImportTask(ctx context.Context, workerID string) (domain.ImportTask, bool, error) {
	raw, err := s.pb.claimRecord(ctx, "import_tasks", workerID, nil)
	if err != nil || raw == nil {
		// Either the claim failed (err != nil) or nothing was pending.
		return domain.ImportTask{}, false, err
	}
	task, err := parseImportTask(raw)
	return task, err == nil, err
}
func (s *Store) FinishScrapeTask(ctx context.Context, id string, result domain.ScrapeResult) error {
status := string(domain.TaskStatusDone)
if result.ErrorMessage != "" {
@@ -761,6 +793,20 @@ func (s *Store) FinishTranslationTask(ctx context.Context, id string, result dom
})
}
// FinishImportTask marks an import task done — or failed when the result
// carries an error message — and records the final chapter counts and
// completion time.
func (s *Store) FinishImportTask(ctx context.Context, id string, result domain.ImportResult) error {
	status := domain.TaskStatusDone
	if result.ErrorMessage != "" {
		status = domain.TaskStatusFailed
	}
	patch := map[string]any{
		"status":         string(status),
		"chapters_done":  result.ChaptersImported,
		"chapters_total": result.ChaptersImported,
		"error_message":  result.ErrorMessage,
		"finished":       time.Now().UTC().Format(time.RFC3339),
	}
	return s.pb.patch(ctx, fmt.Sprintf("/api/collections/import_tasks/records/%s", id), patch)
}
func (s *Store) FailTask(ctx context.Context, id, errMsg string) error {
payload := map[string]any{
"status": string(domain.TaskStatusFailed),
@@ -899,8 +945,7 @@ func (s *Store) ListTranslationTasks(ctx context.Context) ([]domain.TranslationT
}
func (s *Store) GetTranslationTask(ctx context.Context, cacheKey string) (domain.TranslationTask, bool, error) {
filter := fmt.Sprintf(`cache_key='%s'`, cacheKey)
items, err := s.pb.listAll(ctx, "translation_jobs", filter, "-started")
items, err := s.pb.listAll(ctx, "translation_jobs", fmt.Sprintf("cache_key=%q", cacheKey), "-started")
if err != nil || len(items) == 0 {
return domain.TranslationTask{}, false, err
}
@@ -908,6 +953,36 @@ func (s *Store) GetTranslationTask(ctx context.Context, cacheKey string) (domain
return t, err == nil, err
}
// ListImportTasks returns all import tasks sorted by started descending.
// Records that fail to parse are skipped silently, matching the other list
// helpers in this file.
func (s *Store) ListImportTasks(ctx context.Context) ([]domain.ImportTask, error) {
	items, err := s.pb.listAll(ctx, "import_tasks", "", "-started")
	if err != nil {
		return nil, err
	}
	out := make([]domain.ImportTask, 0, len(items))
	for _, item := range items {
		if t, perr := parseImportTask(item); perr == nil {
			out = append(out, t)
		}
	}
	return out, nil
}
// GetImportTask returns a single import task by ID.
// Returns (zero, false, nil) when the record does not exist.
func (s *Store) GetImportTask(ctx context.Context, id string) (domain.ImportTask, bool, error) {
	var raw json.RawMessage
	err := s.pb.get(ctx, fmt.Sprintf("/api/collections/import_tasks/records/%s", id), &raw)
	switch {
	case err == ErrNotFound:
		// Missing record is not an error for callers — just "not found".
		return domain.ImportTask{}, false, nil
	case err != nil:
		return domain.ImportTask{}, false, err
	}
	t, err := parseImportTask(raw)
	return t, err == nil, err
}
t, err := parseTranslationTask(items[0])
return t, err == nil, err
}
// ── Parsers ───────────────────────────────────────────────────────────────────
func parseScrapeTask(raw json.RawMessage) (domain.ScrapeTask, error) {
@@ -1014,6 +1089,42 @@ func parseTranslationTask(raw json.RawMessage) (domain.TranslationTask, error) {
}, nil
}
// parseImportTask decodes a raw PocketBase import_tasks record into a
// domain.ImportTask. Timestamps that fail to parse become zero times,
// matching the other record parsers in this file.
func parseImportTask(raw json.RawMessage) (domain.ImportTask, error) {
	var rec struct {
		ID            string `json:"id"`
		Slug          string `json:"slug"`
		Title         string `json:"title"`
		FileName      string `json:"file_name"`
		FileType      string `json:"file_type"`
		WorkerID      string `json:"worker_id"`
		Status        string `json:"status"`
		ChaptersDone  int    `json:"chapters_done"`
		ChaptersTotal int    `json:"chapters_total"`
		ErrorMessage  string `json:"error_message"`
		Started       string `json:"started"`
		Finished      string `json:"finished"`
	}
	if err := json.Unmarshal(raw, &rec); err != nil {
		return domain.ImportTask{}, err
	}
	task := domain.ImportTask{
		ID:            rec.ID,
		Slug:          rec.Slug,
		Title:         rec.Title,
		FileName:      rec.FileName,
		FileType:      rec.FileType,
		WorkerID:      rec.WorkerID,
		Status:        domain.TaskStatus(rec.Status),
		ChaptersDone:  rec.ChaptersDone,
		ChaptersTotal: rec.ChaptersTotal,
		ErrorMessage:  rec.ErrorMessage,
	}
	// Parse errors intentionally ignored — absent/garbled timestamps map to
	// the zero time.Time, as in the sibling parsers.
	task.Started, _ = time.Parse(time.RFC3339, rec.Started)
	task.Finished, _ = time.Parse(time.RFC3339, rec.Finished)
	return task, nil
}
// ── CoverStore ─────────────────────────────────────────────────────────────────
func (s *Store) PutCover(ctx context.Context, slug string, data []byte, contentType string) error {

View File

@@ -33,6 +33,10 @@ type Producer interface {
// returns the assigned PocketBase record ID.
CreateTranslationTask(ctx context.Context, slug string, chapter int, lang string) (string, error)
// CreateImportTask inserts a new import task with status=pending and
// returns the assigned PocketBase record ID.
CreateImportTask(ctx context.Context, slug, title, fileType, objectKey string) (string, error)
// CancelTask transitions a pending task to status=cancelled.
// Returns ErrNotFound if the task does not exist.
CancelTask(ctx context.Context, id string) error
@@ -59,6 +63,11 @@ type Consumer interface {
// Returns (zero, false, nil) when the queue is empty.
ClaimNextTranslationTask(ctx context.Context, workerID string) (domain.TranslationTask, bool, error)
// ClaimNextImportTask atomically finds the oldest pending import task,
// sets its status=running and worker_id=workerID, and returns it.
// Returns (zero, false, nil) when the queue is empty.
ClaimNextImportTask(ctx context.Context, workerID string) (domain.ImportTask, bool, error)
// FinishScrapeTask marks a running scrape task as done and records the result.
FinishScrapeTask(ctx context.Context, id string, result domain.ScrapeResult) error
@@ -68,6 +77,9 @@ type Consumer interface {
// FinishTranslationTask marks a running translation task as done and records the result.
FinishTranslationTask(ctx context.Context, id string, result domain.TranslationResult) error
// FinishImportTask marks a running import task as done and records the result.
FinishImportTask(ctx context.Context, id string, result domain.ImportResult) error
// FailTask marks a task (scrape, audio, or translation) as failed with an error message.
FailTask(ctx context.Context, id, errMsg string) error
@@ -104,4 +116,11 @@ type Reader interface {
// GetTranslationTask returns the most recent translation task for cacheKey.
// Returns (zero, false, nil) if not found.
GetTranslationTask(ctx context.Context, cacheKey string) (domain.TranslationTask, bool, error)
// ListImportTasks returns all import tasks sorted by started descending.
ListImportTasks(ctx context.Context) ([]domain.ImportTask, error)
// GetImportTask returns a single import task by ID.
// Returns (zero, false, nil) if not found.
GetImportTask(ctx context.Context, id string) (domain.ImportTask, bool, error)
}

View File

@@ -18,6 +18,26 @@
label: () => m.admin_nav_translation(),
icon: `<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 5h12M9 3v2m1.048 9.5A18.022 18.022 0 016.412 9m6.088 9h7M11 21l5-10 5 10M12.751 5C11.783 10.77 8.07 15.61 3 18.129" />`
},
{
href: '/admin/import',
label: () => m.admin_nav_import(),
icon: `<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M7 16a4 4 0 01-.88-7.903A5 5 0 1115.9 6L16 6a5 5 0 011 9.9M15 13l-3-3m0 0l-3 3m3-3v12" />`
},
{
href: '/admin/image-gen',
label: () => m.admin_nav_image_gen(),
icon: `<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M4 16l4.586-4.586a2 2 0 012.828 0L16 16m-2-2l1.586-1.586a2 2 0 012.828 0L20 14m-6-6h.01M6 20h12a2 2 0 002-2V6a2 2 0 00-2-2H6a2 2 0 00-2 2v12a2 2 0 002 2z" />`
},
{
href: '/admin/audio',
label: () => m.admin_nav_audio(),
icon: `<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 19V6l12-3v13M9 19c0 1.105-1.343 2-3 2s-3-.895-3-2 1.343-2 3-2 3 .895 3 2zm12-3c0 1.105-1.343 2-3 2s-3-.895-3-2 1.343-2 3-2 3 .895 3 2zM9 10l12-3" />`
},
{
href: '/admin/translation',
label: () => m.admin_nav_translation(),
icon: `<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 5h12M9 3v2m1.048 9.5A18.022 18.022 0 016.412 9m6.088 9h7M11 21l5-10 5 10M12.751 5C11.783 10.77 8.07 15.61 3 18.129" />`
},
{
href: '/admin/image-gen',
label: () => m.admin_nav_image_gen(),

View File

@@ -0,0 +1,154 @@
<script lang="ts">
import { onMount } from 'svelte';
// Shape of one import task row as returned by GET /api/admin/import.
interface ImportTask {
id: string;
slug: string; // presumably a URL slug derived from the title — confirm against backend
title: string;
file_name: string;
file_type: string; // e.g. "pdf" or "epub" (rendered uppercase in the table)
status: string; // 'pending' | 'running' | 'done' | 'failed' (see getStatusColor)
chapters_done: number;
chapters_total: number;
error_message: string;
started: string; // timestamp string; parsed with new Date() in formatDate
finished: string; // timestamp string; empty while the task is still running
}
// Reactive page state (Svelte 5 runes).
let tasks = $state<ImportTask[]>([]);
let loading = $state(true); // true while the task list is being fetched
let uploading = $state(false); // true while a create request is in flight
let title = $state(''); // bound to the book-title input
let error = $state(''); // inline error message shown under the form
// Refresh the task list from the backend.
// On a non-OK response or a network error the current list is left untouched.
async function loadTasks() {
loading = true;
try {
const res = await fetch('/api/admin/import');
if (!res.ok) return; // finally still clears the loading flag
const payload = await res.json();
tasks = payload.tasks || [];
} catch (err) {
console.error('Failed to load tasks:', err);
} finally {
loading = false;
}
}
// Create a new import task from the entered title, then refresh the list.
// Shows an inline error message on failure.
async function handleSubmit(e: Event) {
e.preventDefault();
if (!title.trim()) return;
uploading = true;
error = '';
try {
const res = await fetch('/api/admin/import', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ title })
});
if (res.ok) {
// The response body is not needed here: the list is re-fetched below.
// (Previously the body was awaited into an unused variable.)
title = '';
await loadTasks();
} else {
// Guard against non-JSON error bodies (e.g. a proxy HTML error page).
const data = await res.json().catch(() => ({}));
error = data.error || 'Upload failed';
}
} catch {
error = 'Upload failed';
} finally {
uploading = false;
}
}
// Render a timestamp string for display; empty values show as a dash.
function formatDate(dateStr: string) {
return dateStr ? new Date(dateStr).toLocaleString() : '-';
}
// Tailwind text-color class for a task status; unknown statuses fall back to gray.
function getStatusColor(status: string) {
const colors: Record<string, string> = {
pending: 'text-yellow-400',
running: 'text-blue-400',
done: 'text-green-400',
failed: 'text-red-400'
};
return colors[status] ?? 'text-gray-400';
}
// Load the task list once when the page mounts.
onMount(() => {
loadTasks();
});
</script>
<div class="max-w-4xl">
<h1 class="text-2xl font-bold mb-6">Import PDF/EPUB</h1>
<!-- Upload Form -->
<!-- NOTE(review): there is no file input yet — the form only submits a title,
but the help text below mentions uploading a file. Confirm this is the
intended placeholder state (parsing is a placeholder per the commit). -->
<form onsubmit={handleSubmit} class="mb-8 p-4 bg-(--color-surface-2) rounded-lg">
<div class="flex gap-4">
<input
type="text"
bind:value={title}
placeholder="Book title"
class="flex-1 px-3 py-2 rounded bg-(--color-surface) border border-(--color-border) text-(--color-text)"
/>
<button
type="submit"
disabled={uploading || !title.trim()}
class="px-4 py-2 bg-(--color-brand) text-(--color-surface) rounded font-medium disabled:opacity-50"
>
{uploading ? 'Creating...' : 'Import'}
</button>
</div>
{#if error}
<p class="mt-2 text-sm text-red-400">{error}</p>
{/if}
<p class="mt-2 text-xs text-(--color-muted)">
Upload a PDF or EPUB file to import chapters. The runner will process the file.
</p>
</form>
<!-- Task List: read-only table of all import tasks, newest first per the backend -->
<h2 class="text-lg font-semibold mb-4">Import Tasks</h2>
{#if loading}
<p class="text-(--color-muted)">Loading...</p>
{:else if tasks.length === 0}
<p class="text-(--color-muted)">No import tasks yet.</p>
{:else}
<div class="overflow-x-auto">
<table class="w-full text-sm">
<thead>
<tr class="text-left text-(--color-muted) border-b border-(--color-border)">
<th class="pb-2">Title</th>
<th class="pb-2">Type</th>
<th class="pb-2">Status</th>
<th class="pb-2">Chapters</th>
<th class="pb-2">Started</th>
</tr>
</thead>
<tbody>
{#each tasks as task}
<tr class="border-b border-(--color-border)/50">
<td class="py-2">
<div class="font-medium">{task.title}</div>
<div class="text-xs text-(--color-muted)">{task.slug}</div>
</td>
<td class="py-2 uppercase text-xs">{task.file_type}</td>
<td class="py-2 {getStatusColor(task.status)}">{task.status}</td>
<td class="py-2 text-(--color-muted)">
{task.chapters_done}/{task.chapters_total}
</td>
<td class="py-2 text-(--color-muted)">{formatDate(task.started)}</td>
</tr>
{/each}
</tbody>
</table>
</div>
{/if}
</div>

View File

@@ -0,0 +1,38 @@
import { json, error } from '@sveltejs/kit';
import type { RequestHandler } from './$types';
import { backendFetch } from '$lib/server/scraper';
/**
 * GET /api/admin/import
 * List all import tasks.
 *
 * Requires an admin user; proxies the request to the backend and returns
 * its JSON body. Propagates backend failures as an error response instead
 * of silently returning HTTP 200 (keeps behavior consistent with POST).
 */
export const GET: RequestHandler = async ({ locals }) => {
if (!locals.user || locals.user.role !== 'admin') {
throw error(403, 'Forbidden');
}
const res = await backendFetch('/api/admin/import', { method: 'GET' });
if (!res.ok) {
throw error(res.status, 'Failed to list import tasks');
}
// Tolerate an empty/non-JSON body from the backend.
const data = await res.json().catch(() => ({ tasks: [] }));
return json(data);
};
/**
 * POST /api/admin/import
 * Create a new import task.
 *
 * Requires an admin user; forwards the JSON request body to the backend
 * and relays either the created task or the backend's error.
 */
export const POST: RequestHandler = async ({ request, locals }) => {
const user = locals.user;
if (!user || user.role !== 'admin') {
throw error(403, 'Forbidden');
}
const payload = await request.json();
const res = await backendFetch('/api/admin/import', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload)
});
if (res.ok) {
return json(await res.json());
}
const err = await res.json().catch(() => ({ error: 'Failed to create import task' }));
throw error(res.status, err.error || 'Failed to create import task');
};