Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6617828487 |
165
backend/cmd/pdf-to-chapters/main.go
Normal file
165
backend/cmd/pdf-to-chapters/main.go
Normal file
@@ -0,0 +1,165 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/ledongthuc/pdf"
|
||||
)
|
||||
|
||||
func main() {
|
||||
if len(os.Args) < 2 {
|
||||
fmt.Println("Usage: pdf-to-chapters <input.pdf>")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
inputPath := os.Args[1]
|
||||
|
||||
if err := processPDF(inputPath); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func processPDF(inputPath string) error {
|
||||
pdf.DebugOn = false
|
||||
|
||||
f, r, err := pdf.Open(inputPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open PDF: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
totalPages := r.NumPage()
|
||||
fmt.Printf("Processing PDF with %d pages\n", totalPages)
|
||||
|
||||
var chapters []Chapter
|
||||
var currentChapter *Chapter
|
||||
|
||||
chapterPattern := regexp.MustCompile(`The Eminence in Shadow\s+(\d+)\s*-\s*(\d+)`)
|
||||
|
||||
for i := 1; i <= totalPages; i++ {
|
||||
page := r.Page(i)
|
||||
if err := page.IsValid(); err != nil {
|
||||
log.Printf("Warning: page %d not valid: %v", i, err)
|
||||
continue
|
||||
}
|
||||
|
||||
text, err := page.GetPlainText(nil)
|
||||
if err != nil {
|
||||
log.Printf("Warning: failed to extract text from page %d: %v", i, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Check for chapter header on this page
|
||||
matches := chapterPattern.FindStringSubmatch(text)
|
||||
if matches != nil {
|
||||
// Start new chapter
|
||||
if currentChapter != nil && len(currentChapter.Content) > 0 {
|
||||
chapters = append(chapters, *currentChapter)
|
||||
}
|
||||
|
||||
chapterNum := matches[1]
|
||||
currentChapter = &Chapter{
|
||||
Number: chapterNum,
|
||||
StartPage: i,
|
||||
Content: text,
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Append to current chapter
|
||||
if currentChapter != nil {
|
||||
currentChapter.Content += "\n" + text
|
||||
}
|
||||
}
|
||||
|
||||
// Don't forget last chapter
|
||||
if currentChapter != nil && len(currentChapter.Content) > 0 {
|
||||
chapters = append(chapters, *currentChapter)
|
||||
}
|
||||
|
||||
// Print chapter info
|
||||
fmt.Printf("Total chapters found: %d\n", len(chapters))
|
||||
for _, ch := range chapters {
|
||||
preview := strings.TrimSpace(ch.Content)
|
||||
if len(preview) > 200 {
|
||||
preview = preview[:200] + "..."
|
||||
}
|
||||
fmt.Printf("Chapter %s (page %d): %s\n", ch.Number, ch.StartPage, preview)
|
||||
}
|
||||
|
||||
// Write output file
|
||||
return writeOutput(chapters, inputPath)
|
||||
}
|
||||
|
||||
// Chapter is one chapter recovered from the PDF.
type Chapter struct {
	// Number is the chapter number, kept as the text captured from the
	// chapter header.
	Number string

	// StartPage is the 1-based number of the page on which the chapter
	// header was found.
	StartPage int

	// Content is the extracted text of the header page followed by the text
	// of every later page of the chapter, separated by newlines.
	Content string
}
|
||||
|
||||
func writeOutput(chapters []Chapter, inputPath string) error {
|
||||
baseName := strings.TrimSuffix(inputPath, ".pdf")
|
||||
outPath := baseName + "_chapters.txt"
|
||||
|
||||
f, err := os.Create(outPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create output: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
for i, ch := range chapters {
|
||||
if i > 0 {
|
||||
fmt.Fprintln(f)
|
||||
}
|
||||
fmt.Fprintf(f, "## Chapter %s\n\n", ch.Number)
|
||||
|
||||
// Split content into paragraphs
|
||||
paragraphs := splitIntoParagraphs(ch.Content)
|
||||
for _, para := range paragraphs {
|
||||
trimmed := strings.TrimSpace(para)
|
||||
if len(trimmed) > 0 {
|
||||
fmt.Fprintln(f, trimmed)
|
||||
fmt.Fprintln(f)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("\nOutput written to: %s\n", outPath)
|
||||
return nil
|
||||
}
|
||||
|
||||
// splitIntoParagraphs groups the lines of text into paragraphs.
//
// Every line is trimmed of surrounding whitespace. A line that is empty after
// trimming ends the paragraph in progress. A non-empty line shorter than
// three bytes (likely a header fragment or page number) is dropped without
// ending the paragraph. All remaining lines of a paragraph are joined with a
// single space. The result is nil when text yields no paragraph.
func splitIntoParagraphs(text string) []string {
	var (
		result  []string
		pending []string // trimmed lines of the paragraph in progress
	)

	// flush turns the pending lines into one paragraph, if there are any.
	flush := func() {
		if len(pending) == 0 {
			return
		}
		result = append(result, strings.Join(pending, " "))
		pending = pending[:0]
	}

	for _, raw := range strings.Split(text, "\n") {
		switch line := strings.TrimSpace(raw); {
		case line == "":
			flush()
		case len(line) < 3:
			// Too short to be body text; skip it but keep the paragraph open.
		default:
			pending = append(pending, line)
		}
	}
	flush()

	return result
}
|
||||
@@ -3,6 +3,7 @@ module github.com/libnovel/backend
|
||||
go 1.26.1
|
||||
|
||||
require (
|
||||
github.com/ledongthuc/pdf v0.0.0-20241014091450-14fc3c58b12d
|
||||
github.com/minio/minio-go/v7 v7.0.98
|
||||
golang.org/x/net v0.51.0
|
||||
)
|
||||
|
||||
@@ -40,10 +40,6 @@ func (c *Consumer) FinishTranslationTask(ctx context.Context, id string, result
|
||||
return c.pb.FinishTranslationTask(ctx, id, result)
|
||||
}
|
||||
|
||||
func (c *Consumer) FinishImportTask(ctx context.Context, id string, result domain.ImportResult) error {
|
||||
return c.pb.FinishImportTask(ctx, id, result)
|
||||
}
|
||||
|
||||
func (c *Consumer) FailTask(ctx context.Context, id, errMsg string) error {
|
||||
return c.pb.FailTask(ctx, id, errMsg)
|
||||
}
|
||||
@@ -64,12 +60,6 @@ func (c *Consumer) ClaimNextTranslationTask(ctx context.Context, workerID string
|
||||
return c.pb.ClaimNextTranslationTask(ctx, workerID)
|
||||
}
|
||||
|
||||
// ClaimNextImportTask delegates to PocketBase because import tasks
|
||||
// are stored in PocketBase (not Redis/Asynq) and must still be polled directly.
|
||||
func (c *Consumer) ClaimNextImportTask(ctx context.Context, workerID string) (domain.ImportTask, bool, error) {
|
||||
return c.pb.ClaimNextImportTask(ctx, workerID)
|
||||
}
|
||||
|
||||
func (c *Consumer) HeartbeatTask(ctx context.Context, id string) error {
|
||||
return c.pb.HeartbeatTask(ctx, id)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -9,13 +10,13 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/libnovel/backend/internal/storage"
|
||||
"github.com/libnovel/backend/internal/asynqqueue"
|
||||
)
|
||||
|
||||
type importRequest struct {
|
||||
Title string `json:"title"`
|
||||
FileName string `json:"file_name"`
|
||||
FileType string `json:"file_type"` // "pdf" or "epub"
|
||||
Title string `json:"title"`
|
||||
FileName string `json:"file_name"`
|
||||
FileType string `json:"file_type"` // "pdf" or "epub"
|
||||
ObjectKey string `json:"object_key"` // MinIO path to uploaded file
|
||||
}
|
||||
|
||||
@@ -25,7 +26,7 @@ type importResponse struct {
|
||||
}
|
||||
|
||||
func (s *Server) handleAdminImport(w http.ResponseWriter, r *http.Request) {
|
||||
if s.deps.Producer == nil {
|
||||
if s.deps.TaskProducer == nil {
|
||||
jsonError(w, http.StatusServiceUnavailable, "task queue not configured")
|
||||
return
|
||||
}
|
||||
@@ -63,14 +64,8 @@ func (s *Server) handleAdminImport(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
// Upload to MinIO directly via the store
|
||||
objectKey = fmt.Sprintf("imports/%d_%s", time.Now().Unix(), header.Filename)
|
||||
store, ok := s.deps.Producer.(*storage.Store)
|
||||
if !ok {
|
||||
jsonError(w, http.StatusInternalServerError, "storage not available")
|
||||
return
|
||||
}
|
||||
if err := store.PutImportFile(r.Context(), objectKey, data); err != nil {
|
||||
if err := s.deps.PresignStore.PutObject(r.Context(), "imports", objectKey, data); err != nil {
|
||||
jsonError(w, http.StatusInternalServerError, "upload file: "+err.Error())
|
||||
return
|
||||
}
|
||||
@@ -99,7 +94,7 @@ func (s *Server) handleAdminImport(w http.ResponseWriter, r *http.Request) {
|
||||
return -1
|
||||
}, slug)
|
||||
|
||||
taskID, err := s.deps.Producer.CreateImportTask(r.Context(), slug, req.Title, req.FileType, objectKey)
|
||||
taskID, err := s.deps.TaskProducer.CreateImportTask(r.Context(), slug, req.Title, req.FileType, objectKey)
|
||||
if err != nil {
|
||||
jsonError(w, http.StatusInternalServerError, "create import task: "+err.Error())
|
||||
return
|
||||
@@ -118,12 +113,8 @@ func (s *Server) handleAdminImportStatus(w http.ResponseWriter, r *http.Request)
|
||||
return
|
||||
}
|
||||
|
||||
task, ok, err := s.deps.TaskReader.GetImportTask(r.Context(), taskID)
|
||||
task, err := s.deps.TaskReader.GetImportTask(r.Context(), taskID)
|
||||
if err != nil {
|
||||
jsonError(w, http.StatusInternalServerError, "get task: "+err.Error())
|
||||
return
|
||||
}
|
||||
if !ok {
|
||||
jsonError(w, http.StatusNotFound, "task not found")
|
||||
return
|
||||
}
|
||||
|
||||
@@ -727,11 +727,11 @@ func (r *Runner) runImportTask(ctx context.Context, task domain.ImportTask, obje
|
||||
// storeImportedChapters stores imported chapters in MinIO (similar to scraped chapters).
|
||||
func (r *Runner) storeImportedChapters(ctx context.Context, slug string, chapters []bookstore.Chapter) error {
|
||||
for _, ch := range chapters {
|
||||
_ = fmt.Sprintf("# Chapter %d\n\n%s", ch.Number, ch.Content)
|
||||
content := fmt.Sprintf("# Chapter %d\n\n%s", ch.Number, ch.Content)
|
||||
if ch.Title != "" {
|
||||
_ = fmt.Sprintf("# %s\n\n%s", ch.Title, ch.Content)
|
||||
content = fmt.Sprintf("# %s\n\n%s", ch.Title, ch.Content)
|
||||
}
|
||||
_ = fmt.Sprintf("books/%s/chapters/%d.md", slug, ch.Number)
|
||||
key := fmt.Sprintf("books/%s/chapters/%d.md", slug, ch.Number)
|
||||
// Use MinIO client directly since we have access to it via BookWriter/Store
|
||||
// In a real implementation, this would be abstracted through BookWriter
|
||||
r.deps.Log.Info("runner: stored chapter", "slug", slug, "chapter", ch.Number)
|
||||
|
||||
@@ -54,10 +54,6 @@ func (s *stubConsumer) ClaimNextTranslationTask(_ context.Context, _ string) (do
|
||||
return domain.TranslationTask{}, false, nil
|
||||
}
|
||||
|
||||
func (s *stubConsumer) ClaimNextImportTask(_ context.Context, _ string) (domain.ImportTask, bool, error) {
|
||||
return domain.ImportTask{}, false, nil
|
||||
}
|
||||
|
||||
func (s *stubConsumer) FinishScrapeTask(_ context.Context, id string, _ domain.ScrapeResult) error {
|
||||
s.finished = append(s.finished, id)
|
||||
return nil
|
||||
@@ -73,11 +69,6 @@ func (s *stubConsumer) FinishTranslationTask(_ context.Context, id string, _ dom
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *stubConsumer) FinishImportTask(_ context.Context, id string, _ domain.ImportResult) error {
|
||||
s.finished = append(s.finished, id)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *stubConsumer) FailTask(_ context.Context, id, _ string) error {
|
||||
s.failCalled = append(s.failCalled, id)
|
||||
return nil
|
||||
|
||||
@@ -129,7 +129,9 @@ func (s *Store) IngestChapters(ctx context.Context, slug string, chapters []book
|
||||
content = fmt.Sprintf("# %s\n\n%s", ch.Title, ch.Content)
|
||||
}
|
||||
key := fmt.Sprintf("books/%s/chapters/%d.md", slug, ch.Number)
|
||||
if err := s.mc.putObject(ctx, "books", key, "text/markdown", []byte(content)); err != nil {
|
||||
if err := s.mc.PutObject(ctx, "books", key, strings.NewReader(content), int64(len(content)), minio.PutObjectOptions{
|
||||
ContentType: "text/markdown",
|
||||
}); err != nil {
|
||||
return fmt.Errorf("put chapter %d: %w", ch.Number, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -979,6 +979,9 @@ func (s *Store) GetImportTask(ctx context.Context, id string) (domain.ImportTask
|
||||
t, err := parseImportTask(raw)
|
||||
return t, err == nil, err
|
||||
}
|
||||
t, err := parseTranslationTask(items[0])
|
||||
return t, err == nil, err
|
||||
}
|
||||
|
||||
// ── Parsers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -1148,11 +1151,6 @@ func (s *Store) GetCover(ctx context.Context, slug string) ([]byte, string, bool
|
||||
return data, ct, true, nil
|
||||
}
|
||||
|
||||
// PutImportFile stores an uploaded import file (PDF/EPUB) in MinIO.
|
||||
func (s *Store) PutImportFile(ctx context.Context, key string, data []byte) error {
|
||||
return s.mc.putObject(ctx, "imports", key, "application/octet-stream", data)
|
||||
}
|
||||
|
||||
func (s *Store) CoverExists(ctx context.Context, slug string) bool {
|
||||
return s.mc.coverExists(ctx, CoverObjectKey(slug))
|
||||
}
|
||||
|
||||
@@ -26,9 +26,6 @@ func (s *stubStore) CreateAudioTask(_ context.Context, _ string, _ int, _ string
|
||||
func (s *stubStore) CreateTranslationTask(_ context.Context, _ string, _ int, _ string) (string, error) {
|
||||
return "translation-1", nil
|
||||
}
|
||||
func (s *stubStore) CreateImportTask(_ context.Context, _, _, _, _ string) (string, error) {
|
||||
return "import-1", nil
|
||||
}
|
||||
func (s *stubStore) CancelTask(_ context.Context, _ string) error { return nil }
|
||||
func (s *stubStore) CancelAudioTasksBySlug(_ context.Context, _ string) (int, error) { return 0, nil }
|
||||
|
||||
@@ -41,9 +38,6 @@ func (s *stubStore) ClaimNextAudioTask(_ context.Context, _ string) (domain.Audi
|
||||
func (s *stubStore) ClaimNextTranslationTask(_ context.Context, _ string) (domain.TranslationTask, bool, error) {
|
||||
return domain.TranslationTask{ID: "translation-1", Status: domain.TaskStatusRunning}, true, nil
|
||||
}
|
||||
func (s *stubStore) ClaimNextImportTask(_ context.Context, _ string) (domain.ImportTask, bool, error) {
|
||||
return domain.ImportTask{ID: "import-1", Status: domain.TaskStatusRunning}, true, nil
|
||||
}
|
||||
func (s *stubStore) FinishScrapeTask(_ context.Context, _ string, _ domain.ScrapeResult) error {
|
||||
return nil
|
||||
}
|
||||
@@ -53,9 +47,6 @@ func (s *stubStore) FinishAudioTask(_ context.Context, _ string, _ domain.AudioR
|
||||
func (s *stubStore) FinishTranslationTask(_ context.Context, _ string, _ domain.TranslationResult) error {
|
||||
return nil
|
||||
}
|
||||
func (s *stubStore) FinishImportTask(_ context.Context, _ string, _ domain.ImportResult) error {
|
||||
return nil
|
||||
}
|
||||
func (s *stubStore) FailTask(_ context.Context, _, _ string) error { return nil }
|
||||
|
||||
func (s *stubStore) HeartbeatTask(_ context.Context, _ string) error { return nil }
|
||||
@@ -78,10 +69,6 @@ func (s *stubStore) ListTranslationTasks(_ context.Context) ([]domain.Translatio
|
||||
func (s *stubStore) GetTranslationTask(_ context.Context, _ string) (domain.TranslationTask, bool, error) {
|
||||
return domain.TranslationTask{}, false, nil
|
||||
}
|
||||
func (s *stubStore) ListImportTasks(_ context.Context) ([]domain.ImportTask, error) { return nil, nil }
|
||||
func (s *stubStore) GetImportTask(_ context.Context, _ string) (domain.ImportTask, bool, error) {
|
||||
return domain.ImportTask{}, false, nil
|
||||
}
|
||||
|
||||
// Verify the stub satisfies all three interfaces at compile time.
|
||||
var _ taskqueue.Producer = (*stubStore)(nil)
|
||||
|
||||
Reference in New Issue
Block a user