Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a8a7151fee |
@@ -1,165 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/ledongthuc/pdf"
|
||||
)
|
||||
|
||||
func main() {
|
||||
if len(os.Args) < 2 {
|
||||
fmt.Println("Usage: pdf-to-chapters <input.pdf>")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
inputPath := os.Args[1]
|
||||
|
||||
if err := processPDF(inputPath); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func processPDF(inputPath string) error {
|
||||
pdf.DebugOn = false
|
||||
|
||||
f, r, err := pdf.Open(inputPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open PDF: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
totalPages := r.NumPage()
|
||||
fmt.Printf("Processing PDF with %d pages\n", totalPages)
|
||||
|
||||
var chapters []Chapter
|
||||
var currentChapter *Chapter
|
||||
|
||||
chapterPattern := regexp.MustCompile(`The Eminence in Shadow\s+(\d+)\s*-\s*(\d+)`)
|
||||
|
||||
for i := 1; i <= totalPages; i++ {
|
||||
page := r.Page(i)
|
||||
if err := page.IsValid(); err != nil {
|
||||
log.Printf("Warning: page %d not valid: %v", i, err)
|
||||
continue
|
||||
}
|
||||
|
||||
text, err := page.GetPlainText(nil)
|
||||
if err != nil {
|
||||
log.Printf("Warning: failed to extract text from page %d: %v", i, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Check for chapter header on this page
|
||||
matches := chapterPattern.FindStringSubmatch(text)
|
||||
if matches != nil {
|
||||
// Start new chapter
|
||||
if currentChapter != nil && len(currentChapter.Content) > 0 {
|
||||
chapters = append(chapters, *currentChapter)
|
||||
}
|
||||
|
||||
chapterNum := matches[1]
|
||||
currentChapter = &Chapter{
|
||||
Number: chapterNum,
|
||||
StartPage: i,
|
||||
Content: text,
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Append to current chapter
|
||||
if currentChapter != nil {
|
||||
currentChapter.Content += "\n" + text
|
||||
}
|
||||
}
|
||||
|
||||
// Don't forget last chapter
|
||||
if currentChapter != nil && len(currentChapter.Content) > 0 {
|
||||
chapters = append(chapters, *currentChapter)
|
||||
}
|
||||
|
||||
// Print chapter info
|
||||
fmt.Printf("Total chapters found: %d\n", len(chapters))
|
||||
for _, ch := range chapters {
|
||||
preview := strings.TrimSpace(ch.Content)
|
||||
if len(preview) > 200 {
|
||||
preview = preview[:200] + "..."
|
||||
}
|
||||
fmt.Printf("Chapter %s (page %d): %s\n", ch.Number, ch.StartPage, preview)
|
||||
}
|
||||
|
||||
// Write output file
|
||||
return writeOutput(chapters, inputPath)
|
||||
}
|
||||
|
||||
// Chapter holds the text collected for one chapter of the input document.
type Chapter struct {
	Number    string // chapter number, kept as the digit string captured from the header
	StartPage int    // index of the page on which the chapter header was found
	Content   string // text of the chapter's pages, joined with newlines
}
|
||||
|
||||
func writeOutput(chapters []Chapter, inputPath string) error {
|
||||
baseName := strings.TrimSuffix(inputPath, ".pdf")
|
||||
outPath := baseName + "_chapters.txt"
|
||||
|
||||
f, err := os.Create(outPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create output: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
for i, ch := range chapters {
|
||||
if i > 0 {
|
||||
fmt.Fprintln(f)
|
||||
}
|
||||
fmt.Fprintf(f, "## Chapter %s\n\n", ch.Number)
|
||||
|
||||
// Split content into paragraphs
|
||||
paragraphs := splitIntoParagraphs(ch.Content)
|
||||
for _, para := range paragraphs {
|
||||
trimmed := strings.TrimSpace(para)
|
||||
if len(trimmed) > 0 {
|
||||
fmt.Fprintln(f, trimmed)
|
||||
fmt.Fprintln(f)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("\nOutput written to: %s\n", outPath)
|
||||
return nil
|
||||
}
|
||||
|
||||
// splitIntoParagraphs breaks text into paragraphs. Lines are trimmed of
// surrounding whitespace; consecutive kept lines are joined with single
// spaces, and a blank line ends the current paragraph. Non-blank lines shorter
// than three bytes (likely headers or page numbers) are dropped without ending
// the paragraph. The result is nil when no paragraph is found.
func splitIntoParagraphs(text string) []string {
	var (
		result  []string
		pending []string // kept lines of the paragraph being assembled
	)

	// flush emits the pending lines as one paragraph, if there are any.
	flush := func() {
		if len(pending) == 0 {
			return
		}
		result = append(result, strings.Join(pending, " "))
		pending = pending[:0]
	}

	for _, raw := range strings.Split(text, "\n") {
		switch line := strings.TrimSpace(raw); {
		case line == "":
			// Blank line: paragraph boundary.
			flush()
		case len(line) >= 3:
			pending = append(pending, line)
		}
		// Anything else is a very short line and is ignored.
	}
	flush()

	return result
}
|
||||
Reference in New Issue
Block a user