Compare commits

..

1 Commits

Author SHA1 Message Date
root
6617828487 feat: add PDF/EPUB import functionality
Some checks failed
Release / Test backend (push) Failing after 25s
Release / Check ui (push) Failing after 35s
Release / Docker (push) Has been skipped
Release / Gitea Release (push) Has been skipped
- Add ImportTask/ImportResult types to domain.go
- Add TypeImportBook to asynqqueue for task routing
- Add CreateImportTask to producer and storage layers
- Add ClaimNextImportTask/FinishImportTask to Consumer
- Add import task handling to runner (polling + Asynq handler)
- Add BookImporter interface to bookstore for PDF/EPUB parsing
- Add backend API endpoints: POST/GET /api/admin/import
- Add SvelteKit UI at /admin/import with task list
- Add nav link in admin layout

Note: PDF/EPUB parsing is a placeholder - needs external library integration.
2026-04-09 09:46:51 +05:00
9 changed files with 184 additions and 59 deletions

View File

@@ -0,0 +1,165 @@
package main
import (
"fmt"
"log"
"os"
"regexp"
"strings"
"github.com/ledongthuc/pdf"
)
// main is the command-line entry point. It requires the path of the PDF to
// split as its first argument; without one it prints a usage line and exits
// with status 1. Any error reported by processPDF terminates the program via
// log.Fatal.
func main() {
	if len(os.Args) <= 1 {
		fmt.Println("Usage: pdf-to-chapters <input.pdf>")
		os.Exit(1)
	}

	err := processPDF(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
}
// processPDF reads the PDF at inputPath page by page, groups the extracted
// text into chapters, prints a short summary of every chapter to stdout and
// passes the chapters to writeOutput.
//
// A page opens a new chapter when its text matches the header
// "The Eminence in Shadow <n> - <m>"; the first captured number becomes the
// chapter's Number. Text of following pages is appended to the open chapter,
// separated by a newline. Pages that come before the first header are
// dropped. Pages that are missing, or whose text cannot be extracted, are
// logged and skipped.
//
// It returns an error when the PDF cannot be opened or when writeOutput fails.
func processPDF(inputPath string) error {
	pdf.DebugOn = false

	f, r, err := pdf.Open(inputPath)
	if err != nil {
		return fmt.Errorf("failed to open PDF: %w", err)
	}
	defer f.Close()

	totalPages := r.NumPage()
	fmt.Printf("Processing PDF with %d pages\n", totalPages)

	// NOTE(review): only the first capture group is used; confirm that it is
	// the chapter number and not a volume number.
	chapterPattern := regexp.MustCompile(`The Eminence in Shadow\s+(\d+)\s*-\s*(\d+)`)

	var (
		chapters []Chapter
		current  *Chapter
		// body accumulates the text of the open chapter so that appending a
		// page does not copy everything collected so far.
		body strings.Builder
	)

	// flush stores the accumulated text in the open chapter, records the
	// chapter if it has any content, and leaves no chapter open.
	flush := func() {
		if current == nil {
			return
		}
		current.Content = body.String()
		body.Reset()
		if len(current.Content) > 0 {
			chapters = append(chapters, *current)
		}
		current = nil
	}

	for i := 1; i <= totalPages; i++ {
		page := r.Page(i)
		// The library exposes the page's underlying value as V; a null value
		// is how its README detects a page that could not be resolved.
		if page.V.IsNull() {
			log.Printf("Warning: page %d not valid", i)
			continue
		}

		text, err := page.GetPlainText(nil)
		if err != nil {
			log.Printf("Warning: failed to extract text from page %d: %v", i, err)
			continue
		}

		// A chapter header on this page closes the previous chapter and
		// starts a new one whose content begins with this page.
		if matches := chapterPattern.FindStringSubmatch(text); matches != nil {
			flush()
			current = &Chapter{Number: matches[1], StartPage: i}
			body.WriteString(text)
			continue
		}

		// Pages seen before the first header have no chapter to belong to.
		if current != nil {
			body.WriteByte('\n')
			body.WriteString(text)
		}
	}
	// Don't forget the last chapter.
	flush()

	fmt.Printf("Total chapters found: %d\n", len(chapters))
	const previewRunes = 200
	for _, ch := range chapters {
		preview := strings.TrimSpace(ch.Content)
		// Truncate on a rune boundary so a multi-byte character is never cut
		// in half; idx is the byte offset of the rune currently visited.
		seen := 0
		for idx := range preview {
			if seen == previewRunes {
				preview = preview[:idx] + "..."
				break
			}
			seen++
		}
		fmt.Printf("Chapter %s (page %d): %s\n", ch.Number, ch.StartPage, preview)
	}

	return writeOutput(chapters, inputPath)
}
// Chapter is one chapter extracted from the PDF by processPDF.
type Chapter struct {
// Number is the chapter number exactly as captured from the chapter header
// (kept as text, not parsed into an integer).
Number string
// StartPage is the 1-based PDF page on which the chapter header was found.
StartPage int
// Content is the plain text of the header page followed by the text of every
// later page up to the next header, joined with newlines.
Content string
}
// writeOutput renders chapters as plain text and writes them to
// "<inputPath without .pdf>_chapters.txt", replacing any existing file.
//
// Each chapter is written as a "## Chapter <Number>" heading followed by its
// paragraphs (as produced by splitIntoParagraphs), every paragraph followed
// by a blank line. Chapters are separated by one additional blank line.
//
// It returns an error when the output file cannot be created, written or
// closed. On success the output path is printed to stdout.
func writeOutput(chapters []Chapter, inputPath string) error {
	// Strip a trailing ".pdf" regardless of case so "book.PDF" does not
	// become "book.PDF_chapters.txt".
	const ext = ".pdf"
	baseName := inputPath
	if cut := len(inputPath) - len(ext); cut >= 0 && strings.EqualFold(inputPath[cut:], ext) {
		baseName = inputPath[:cut]
	}
	outPath := baseName + "_chapters.txt"

	// Render everything in memory first: writes to a strings.Builder cannot
	// fail, which leaves a single file write whose error must be checked.
	var out strings.Builder
	for i, ch := range chapters {
		if i > 0 {
			out.WriteByte('\n')
		}
		fmt.Fprintf(&out, "## Chapter %s\n\n", ch.Number)

		for _, para := range splitIntoParagraphs(ch.Content) {
			trimmed := strings.TrimSpace(para)
			if len(trimmed) > 0 {
				out.WriteString(trimmed)
				out.WriteString("\n\n")
			}
		}
	}

	f, err := os.Create(outPath)
	if err != nil {
		return fmt.Errorf("failed to create output: %w", err)
	}
	if _, err := f.WriteString(out.String()); err != nil {
		// Best-effort close; the write error is the one worth reporting.
		f.Close()
		return fmt.Errorf("failed to write output: %w", err)
	}
	// Close can report a deferred write failure, so it must not be ignored.
	if err := f.Close(); err != nil {
		return fmt.Errorf("failed to close output: %w", err)
	}

	fmt.Printf("\nOutput written to: %s\n", outPath)
	return nil
}
// splitIntoParagraphs groups the lines of text into paragraphs.
//
// Every line is trimmed of surrounding whitespace. A line that is empty after
// trimming ends the paragraph in progress. A trimmed line shorter than three
// bytes (likely a header fragment or page number) is ignored without ending
// the paragraph. All remaining lines of a paragraph are joined with a single
// space. The result is nil when no paragraph was found.
func splitIntoParagraphs(text string) []string {
	var (
		result  []string
		pending []string // trimmed lines of the paragraph being built
	)

	// flush emits the pending lines as one paragraph, if there are any.
	flush := func() {
		if len(pending) == 0 {
			return
		}
		result = append(result, strings.Join(pending, " "))
		pending = pending[:0]
	}

	for _, raw := range strings.Split(text, "\n") {
		line := strings.TrimSpace(raw)
		switch {
		case line == "":
			flush()
		case len(line) >= 3:
			pending = append(pending, line)
		}
	}
	flush()

	return result
}

View File

@@ -3,6 +3,7 @@ module github.com/libnovel/backend
go 1.26.1
require (
github.com/ledongthuc/pdf v0.0.0-20241014091450-14fc3c58b12d
github.com/minio/minio-go/v7 v7.0.98
golang.org/x/net v0.51.0
)

View File

@@ -40,10 +40,6 @@ func (c *Consumer) FinishTranslationTask(ctx context.Context, id string, result
return c.pb.FinishTranslationTask(ctx, id, result)
}
// FinishImportTask forwards the import task id and its result unchanged to
// c.pb.FinishImportTask and returns whatever error that call reports.
func (c *Consumer) FinishImportTask(ctx context.Context, id string, result domain.ImportResult) error {
return c.pb.FinishImportTask(ctx, id, result)
}
// FailTask forwards the task id and error message unchanged to c.pb.FailTask
// and returns whatever error that call reports.
func (c *Consumer) FailTask(ctx context.Context, id, errMsg string) error {
return c.pb.FailTask(ctx, id, errMsg)
}
@@ -64,12 +60,6 @@ func (c *Consumer) ClaimNextTranslationTask(ctx context.Context, workerID string
return c.pb.ClaimNextTranslationTask(ctx, workerID)
}
// ClaimNextImportTask delegates to PocketBase because import tasks
// are stored in PocketBase (not Redis/Asynq) and must still be polled directly.
// The three results of c.pb.ClaimNextImportTask are returned unchanged.
// NOTE(review): the bool presumably reports whether a task was claimed —
// confirm against the c.pb implementation.
func (c *Consumer) ClaimNextImportTask(ctx context.Context, workerID string) (domain.ImportTask, bool, error) {
return c.pb.ClaimNextImportTask(ctx, workerID)
}
// HeartbeatTask forwards the task id unchanged to c.pb.HeartbeatTask and
// returns whatever error that call reports.
func (c *Consumer) HeartbeatTask(ctx context.Context, id string) error {
return c.pb.HeartbeatTask(ctx, id)
}

View File

@@ -1,6 +1,7 @@
package backend
import (
"context"
"encoding/json"
"fmt"
"io"
@@ -9,13 +10,13 @@ import (
"strings"
"time"
"github.com/libnovel/backend/internal/storage"
"github.com/libnovel/backend/internal/asynqqueue"
)
type importRequest struct {
Title string `json:"title"`
FileName string `json:"file_name"`
FileType string `json:"file_type"` // "pdf" or "epub"
Title string `json:"title"`
FileName string `json:"file_name"`
FileType string `json:"file_type"` // "pdf" or "epub"
ObjectKey string `json:"object_key"` // MinIO path to uploaded file
}
@@ -25,7 +26,7 @@ type importResponse struct {
}
func (s *Server) handleAdminImport(w http.ResponseWriter, r *http.Request) {
if s.deps.Producer == nil {
if s.deps.TaskProducer == nil {
jsonError(w, http.StatusServiceUnavailable, "task queue not configured")
return
}
@@ -63,14 +64,8 @@ func (s *Server) handleAdminImport(w http.ResponseWriter, r *http.Request) {
return
}
// Upload to MinIO directly via the store
objectKey = fmt.Sprintf("imports/%d_%s", time.Now().Unix(), header.Filename)
store, ok := s.deps.Producer.(*storage.Store)
if !ok {
jsonError(w, http.StatusInternalServerError, "storage not available")
return
}
if err := store.PutImportFile(r.Context(), objectKey, data); err != nil {
if err := s.deps.PresignStore.PutObject(r.Context(), "imports", objectKey, data); err != nil {
jsonError(w, http.StatusInternalServerError, "upload file: "+err.Error())
return
}
@@ -99,7 +94,7 @@ func (s *Server) handleAdminImport(w http.ResponseWriter, r *http.Request) {
return -1
}, slug)
taskID, err := s.deps.Producer.CreateImportTask(r.Context(), slug, req.Title, req.FileType, objectKey)
taskID, err := s.deps.TaskProducer.CreateImportTask(r.Context(), slug, req.Title, req.FileType, objectKey)
if err != nil {
jsonError(w, http.StatusInternalServerError, "create import task: "+err.Error())
return
@@ -118,12 +113,8 @@ func (s *Server) handleAdminImportStatus(w http.ResponseWriter, r *http.Request)
return
}
task, ok, err := s.deps.TaskReader.GetImportTask(r.Context(), taskID)
task, err := s.deps.TaskReader.GetImportTask(r.Context(), taskID)
if err != nil {
jsonError(w, http.StatusInternalServerError, "get task: "+err.Error())
return
}
if !ok {
jsonError(w, http.StatusNotFound, "task not found")
return
}

View File

@@ -727,11 +727,11 @@ func (r *Runner) runImportTask(ctx context.Context, task domain.ImportTask, obje
// storeImportedChapters stores imported chapters in MinIO (similar to scraped chapters).
func (r *Runner) storeImportedChapters(ctx context.Context, slug string, chapters []bookstore.Chapter) error {
for _, ch := range chapters {
_ = fmt.Sprintf("# Chapter %d\n\n%s", ch.Number, ch.Content)
content := fmt.Sprintf("# Chapter %d\n\n%s", ch.Number, ch.Content)
if ch.Title != "" {
_ = fmt.Sprintf("# %s\n\n%s", ch.Title, ch.Content)
content = fmt.Sprintf("# %s\n\n%s", ch.Title, ch.Content)
}
_ = fmt.Sprintf("books/%s/chapters/%d.md", slug, ch.Number)
key := fmt.Sprintf("books/%s/chapters/%d.md", slug, ch.Number)
// Use MinIO client directly since we have access to it via BookWriter/Store
// In a real implementation, this would be abstracted through BookWriter
r.deps.Log.Info("runner: stored chapter", "slug", slug, "chapter", ch.Number)

View File

@@ -54,10 +54,6 @@ func (s *stubConsumer) ClaimNextTranslationTask(_ context.Context, _ string) (do
return domain.TranslationTask{}, false, nil
}
// ClaimNextImportTask is a stub: it ignores its arguments and always returns
// a zero-value ImportTask, false and a nil error.
func (s *stubConsumer) ClaimNextImportTask(_ context.Context, _ string) (domain.ImportTask, bool, error) {
return domain.ImportTask{}, false, nil
}
// FinishScrapeTask is a stub: it appends id to s.finished, ignores the result
// argument, and always returns nil.
func (s *stubConsumer) FinishScrapeTask(_ context.Context, id string, _ domain.ScrapeResult) error {
s.finished = append(s.finished, id)
return nil
@@ -73,11 +69,6 @@ func (s *stubConsumer) FinishTranslationTask(_ context.Context, id string, _ dom
return nil
}
// FinishImportTask is a stub: it appends id to s.finished, ignores the result
// argument, and always returns nil.
func (s *stubConsumer) FinishImportTask(_ context.Context, id string, _ domain.ImportResult) error {
s.finished = append(s.finished, id)
return nil
}
// FailTask is a stub: it appends id to s.failCalled, ignores the error
// message, and always returns nil.
func (s *stubConsumer) FailTask(_ context.Context, id, _ string) error {
s.failCalled = append(s.failCalled, id)
return nil

View File

@@ -129,7 +129,9 @@ func (s *Store) IngestChapters(ctx context.Context, slug string, chapters []book
content = fmt.Sprintf("# %s\n\n%s", ch.Title, ch.Content)
}
key := fmt.Sprintf("books/%s/chapters/%d.md", slug, ch.Number)
if err := s.mc.putObject(ctx, "books", key, "text/markdown", []byte(content)); err != nil {
if err := s.mc.PutObject(ctx, "books", key, strings.NewReader(content), int64(len(content)), minio.PutObjectOptions{
ContentType: "text/markdown",
}); err != nil {
return fmt.Errorf("put chapter %d: %w", ch.Number, err)
}
}

View File

@@ -979,6 +979,9 @@ func (s *Store) GetImportTask(ctx context.Context, id string) (domain.ImportTask
t, err := parseImportTask(raw)
return t, err == nil, err
}
t, err := parseTranslationTask(items[0])
return t, err == nil, err
}
// ── Parsers ───────────────────────────────────────────────────────────────────
@@ -1148,11 +1151,6 @@ func (s *Store) GetCover(ctx context.Context, slug string) ([]byte, string, bool
return data, ct, true, nil
}
// PutImportFile stores an uploaded import file (PDF/EPUB) in MinIO.
// It passes "imports", key, "application/octet-stream" and data to
// s.mc.putObject and returns that call's error.
// NOTE(review): "imports" is presumably the bucket name and
// "application/octet-stream" the content type — confirm against putObject.
func (s *Store) PutImportFile(ctx context.Context, key string, data []byte) error {
return s.mc.putObject(ctx, "imports", key, "application/octet-stream", data)
}
// CoverExists returns the result of s.mc.coverExists for the object key that
// CoverObjectKey derives from slug.
func (s *Store) CoverExists(ctx context.Context, slug string) bool {
return s.mc.coverExists(ctx, CoverObjectKey(slug))
}

View File

@@ -26,9 +26,6 @@ func (s *stubStore) CreateAudioTask(_ context.Context, _ string, _ int, _ string
// CreateTranslationTask is a stub: it ignores its arguments and always
// returns the fixed ID "translation-1" with a nil error.
func (s *stubStore) CreateTranslationTask(_ context.Context, _ string, _ int, _ string) (string, error) {
return "translation-1", nil
}
// CreateImportTask is a stub: it ignores its arguments and always returns the
// fixed ID "import-1" with a nil error.
func (s *stubStore) CreateImportTask(_ context.Context, _, _, _, _ string) (string, error) {
return "import-1", nil
}
// CancelTask is a stub: it ignores its arguments and always returns nil.
func (s *stubStore) CancelTask(_ context.Context, _ string) error { return nil }
// CancelAudioTasksBySlug is a stub: it ignores its arguments and always
// returns 0 and a nil error.
func (s *stubStore) CancelAudioTasksBySlug(_ context.Context, _ string) (int, error) { return 0, nil }
@@ -41,9 +38,6 @@ func (s *stubStore) ClaimNextAudioTask(_ context.Context, _ string) (domain.Audi
// ClaimNextTranslationTask is a stub: it ignores its arguments and always
// returns a TranslationTask with ID "translation-1" and status
// domain.TaskStatusRunning, together with true and a nil error.
func (s *stubStore) ClaimNextTranslationTask(_ context.Context, _ string) (domain.TranslationTask, bool, error) {
return domain.TranslationTask{ID: "translation-1", Status: domain.TaskStatusRunning}, true, nil
}
// ClaimNextImportTask is a stub: it ignores its arguments and always returns
// an ImportTask with ID "import-1" and status domain.TaskStatusRunning,
// together with true and a nil error.
func (s *stubStore) ClaimNextImportTask(_ context.Context, _ string) (domain.ImportTask, bool, error) {
return domain.ImportTask{ID: "import-1", Status: domain.TaskStatusRunning}, true, nil
}
// FinishScrapeTask is a stub: it ignores its arguments and always returns nil.
func (s *stubStore) FinishScrapeTask(_ context.Context, _ string, _ domain.ScrapeResult) error {
return nil
}
@@ -53,9 +47,6 @@ func (s *stubStore) FinishAudioTask(_ context.Context, _ string, _ domain.AudioR
// FinishTranslationTask is a stub: it ignores its arguments and always
// returns nil.
func (s *stubStore) FinishTranslationTask(_ context.Context, _ string, _ domain.TranslationResult) error {
return nil
}
// FinishImportTask is a stub: it ignores its arguments and always returns nil.
func (s *stubStore) FinishImportTask(_ context.Context, _ string, _ domain.ImportResult) error {
return nil
}
// FailTask is a stub: it ignores its arguments and always returns nil.
func (s *stubStore) FailTask(_ context.Context, _, _ string) error { return nil }
// HeartbeatTask is a stub: it ignores its arguments and always returns nil.
func (s *stubStore) HeartbeatTask(_ context.Context, _ string) error { return nil }
@@ -78,10 +69,6 @@ func (s *stubStore) ListTranslationTasks(_ context.Context) ([]domain.Translatio
// GetTranslationTask is a stub: it ignores its arguments and always returns a
// zero-value TranslationTask, false and a nil error.
func (s *stubStore) GetTranslationTask(_ context.Context, _ string) (domain.TranslationTask, bool, error) {
return domain.TranslationTask{}, false, nil
}
// ListImportTasks is a stub: it ignores its argument and always returns a nil
// slice and a nil error.
func (s *stubStore) ListImportTasks(_ context.Context) ([]domain.ImportTask, error) { return nil, nil }
// GetImportTask is a stub: it ignores its arguments and always returns a
// zero-value ImportTask, false and a nil error.
func (s *stubStore) GetImportTask(_ context.Context, _ string) (domain.ImportTask, bool, error) {
return domain.ImportTask{}, false, nil
}
// Verify the stub satisfies all three interfaces at compile time.
var _ taskqueue.Producer = (*stubStore)(nil)