- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text
- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes) with local-library annotation and one-click scrape buttons
- Add StrategyDirect (plain HTTP client) as a new browser strategy; the default strategy is now 'direct' for chapter fetching and 'content' for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)
- Fix chapter numbering bug: numbers are now derived from the URL path (/chapter-N) rather than list position, correcting newest-first ordering
- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved source_url without knowing the original URL
- Extend NovelScraper interface with RankingProvider (ScrapeRanking)
- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions timeout set to 60 s, content/scrape client defaults raised to 90 s
- Add cover extraction fix (figure.cover > img rather than bare img.cover)
- Add AGENTS.md and .aiignore for AI tooling context
- Add integration tests for browser client and novelfire scraper (build tag: integration) and unit tests for chapterNumberFromURL and pagination
429 lines
13 KiB
Go
429 lines
13 KiB
Go
// Package writer handles persistence of scraped chapters and metadata.
//
// Directory layout:
//
//	static/books/
//	├── {book-slug}/
//	│   ├── metadata.yaml
//	│   ├── vol-0/                (no volume grouping)
//	│   │   ├── 1-50/
//	│   │   │   ├── chapter-1.md
//	│   │   │   └── …
//	│   │   └── 51-100/
//	│   │       └── …
//	│   └── vol-1/
//	│       └── …
|
|
package writer
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/libnovel/scraper/internal/scraper"
|
|
"gopkg.in/yaml.v3"
|
|
)
|
|
|
|
// chaptersPerFolder is the number of chapters grouped into each range
// directory (e.g. "1-50", "51-100") when chapterPath builds a file path.
const chaptersPerFolder = 50
|
|
|
|
// Writer stores scraped content on disk. Every path it produces is built
// beneath a single configurable root directory.
type Writer struct {
	// root is the base directory for all output, e.g. "./static/books".
	root string
}
|
|
|
|
// New creates a Writer that stores files under root.
|
|
func New(root string) *Writer {
|
|
return &Writer{root: root}
|
|
}
|
|
|
|
// ─── Metadata ─────────────────────────────────────────────────────────────────
|
|
|
|
// WriteMetadata serialises meta to static/books/{slug}/metadata.yaml.
|
|
// It creates the directory if it does not exist and overwrites any existing file.
|
|
func (w *Writer) WriteMetadata(meta scraper.BookMeta) error {
|
|
dir := w.bookDir(meta.Slug)
|
|
if err := os.MkdirAll(dir, 0o755); err != nil {
|
|
return fmt.Errorf("writer: mkdir %s: %w", dir, err)
|
|
}
|
|
|
|
path := filepath.Join(dir, "metadata.yaml")
|
|
f, err := os.Create(path)
|
|
if err != nil {
|
|
return fmt.Errorf("writer: create metadata %s: %w", path, err)
|
|
}
|
|
defer f.Close()
|
|
|
|
enc := yaml.NewEncoder(f)
|
|
enc.SetIndent(2)
|
|
if err := enc.Encode(meta); err != nil {
|
|
return fmt.Errorf("writer: encode metadata: %w", err)
|
|
}
|
|
return enc.Close()
|
|
}
|
|
|
|
// ReadMetadata reads the metadata.yaml for slug if it exists.
|
|
// Returns (zero-value, false, nil) when the file does not exist.
|
|
func (w *Writer) ReadMetadata(slug string) (scraper.BookMeta, bool, error) {
|
|
path := filepath.Join(w.bookDir(slug), "metadata.yaml")
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return scraper.BookMeta{}, false, nil
|
|
}
|
|
return scraper.BookMeta{}, false, fmt.Errorf("writer: read metadata %s: %w", path, err)
|
|
}
|
|
|
|
var meta scraper.BookMeta
|
|
if err := yaml.Unmarshal(data, &meta); err != nil {
|
|
return scraper.BookMeta{}, true, fmt.Errorf("writer: unmarshal metadata %s: %w", path, err)
|
|
}
|
|
return meta, true, nil
|
|
}
|
|
|
|
// ─── Chapters ─────────────────────────────────────────────────────────────────
|
|
|
|
// ChapterExists returns true if the markdown file for ref already exists on disk.
|
|
func (w *Writer) ChapterExists(slug string, ref scraper.ChapterRef) bool {
|
|
_, err := os.Stat(w.chapterPath(slug, ref))
|
|
return err == nil
|
|
}
|
|
|
|
// WriteChapter writes chapter.Text to the appropriate markdown file.
|
|
// The parent directories are created on demand.
|
|
func (w *Writer) WriteChapter(slug string, chapter scraper.Chapter) error {
|
|
path := w.chapterPath(slug, chapter.Ref)
|
|
dir := filepath.Dir(path)
|
|
|
|
if err := os.MkdirAll(dir, 0o755); err != nil {
|
|
return fmt.Errorf("writer: mkdir %s: %w", dir, err)
|
|
}
|
|
|
|
// Build the markdown document.
|
|
var sb strings.Builder
|
|
sb.WriteString("# ")
|
|
sb.WriteString(chapter.Ref.Title)
|
|
sb.WriteString("\n\n")
|
|
sb.WriteString(chapter.Text)
|
|
sb.WriteString("\n")
|
|
|
|
if err := os.WriteFile(path, []byte(sb.String()), 0o644); err != nil {
|
|
return fmt.Errorf("writer: write chapter %s: %w", path, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ─── Catalogue helpers ────────────────────────────────────────────────────────
|
|
|
|
// ListBooks returns metadata for every book that has a metadata.yaml under root.
|
|
// Books with unreadable metadata files are silently skipped.
|
|
func (w *Writer) ListBooks() ([]scraper.BookMeta, error) {
|
|
entries, err := os.ReadDir(w.root)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil, nil
|
|
}
|
|
return nil, fmt.Errorf("writer: list books: %w", err)
|
|
}
|
|
var books []scraper.BookMeta
|
|
for _, e := range entries {
|
|
if !e.IsDir() {
|
|
continue
|
|
}
|
|
meta, ok, _ := w.ReadMetadata(e.Name())
|
|
if !ok {
|
|
continue
|
|
}
|
|
books = append(books, meta)
|
|
}
|
|
sort.Slice(books, func(i, j int) bool {
|
|
return books[i].Title < books[j].Title
|
|
})
|
|
return books, nil
|
|
}
|
|
|
|
// LocalSlugs returns the set of book slugs that have a metadata.yaml on disk.
|
|
// It is cheaper than ListBooks because it only checks for file existence rather
|
|
// than fully parsing every YAML file.
|
|
func (w *Writer) LocalSlugs() map[string]bool {
|
|
entries, err := os.ReadDir(w.root)
|
|
if err != nil {
|
|
return map[string]bool{}
|
|
}
|
|
slugs := make(map[string]bool, len(entries))
|
|
for _, e := range entries {
|
|
if !e.IsDir() {
|
|
continue
|
|
}
|
|
metaPath := filepath.Join(w.root, e.Name(), "metadata.yaml")
|
|
if _, err := os.Stat(metaPath); err == nil {
|
|
slugs[e.Name()] = true
|
|
}
|
|
}
|
|
return slugs
|
|
}
|
|
|
|
// ChapterInfo is a lightweight description of one chapter, reconstructed
// from the files found on disk.
type ChapterInfo struct {
	// Number is the chapter number parsed from the file name.
	Number int
	// Title is the chapter name, cleaned of number prefix and trailing date.
	Title string
	// Date is the relative date scraped alongside the title, e.g. "1 year ago".
	Date string
}
|
|
|
|
// ListChapters returns all chapters on disk for slug, sorted by number.
|
|
func (w *Writer) ListChapters(slug string) ([]ChapterInfo, error) {
|
|
bookDir := w.bookDir(slug)
|
|
var chapters []ChapterInfo
|
|
|
|
// Walk vol-*/range-*/ directories.
|
|
volDirs, err := filepath.Glob(filepath.Join(bookDir, "vol-*"))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("writer: list chapters glob: %w", err)
|
|
}
|
|
for _, vd := range volDirs {
|
|
rangeDirs, _ := filepath.Glob(filepath.Join(vd, "*-*"))
|
|
for _, rd := range rangeDirs {
|
|
files, _ := filepath.Glob(filepath.Join(rd, "chapter-*.md"))
|
|
for _, f := range files {
|
|
base := filepath.Base(f) // chapter-N.md
|
|
numStr := strings.TrimSuffix(strings.TrimPrefix(base, "chapter-"), ".md")
|
|
n, err := strconv.Atoi(numStr)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
title, date := chapterTitle(f, n)
|
|
chapters = append(chapters, ChapterInfo{Number: n, Title: title, Date: date})
|
|
}
|
|
}
|
|
}
|
|
sort.Slice(chapters, func(i, j int) bool {
|
|
return chapters[i].Number < chapters[j].Number
|
|
})
|
|
return chapters, nil
|
|
}
|
|
|
|
// chapterTitle reads the first non-empty line of a markdown file and strips
|
|
// the leading "# " heading marker. Falls back to "Chapter N".
|
|
func chapterTitle(path string, n int) (title, date string) {
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return fmt.Sprintf("Chapter %d", n), ""
|
|
}
|
|
for _, line := range strings.SplitN(string(data), "\n", 10) {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
continue
|
|
}
|
|
line = strings.TrimPrefix(line, "# ")
|
|
return splitChapterTitle(line)
|
|
}
|
|
return fmt.Sprintf("Chapter %d", n), ""
|
|
}
|
|
|
|
// relativeDateRe matches a trailing relative date of the form
// "<n> <unit>[s] ago" (plus any whitespace around it) at the end of a heading.
// It is compiled once at package level because splitChapterTitle runs once
// per chapter when listing a book.
var relativeDateRe = regexp.MustCompile(`\s*(\d+\s+(?:second|minute|hour|day|week|month|year)s?\s+ago)\s*$`)

// splitChapterTitle separates the human-readable chapter name from the
// trailing relative-date string that novelfire.net appends to the heading.
// Examples of raw heading text (after stripping "# "):
//
//	"1 Chapter 1 - 1: The Academy's Weakest1 year ago"
//	"2 Chapter 2 - Enter the Storm3 months ago"
//
// The pattern is: optional leading number+whitespace, then the real title,
// then a date that matches /\d+\s+(second|minute|hour|day|week|month|year)s?\s+ago$/
//
// It returns the cleaned title and the date; date is empty when the heading
// has no trailing relative date.
func splitChapterTitle(raw string) (title, date string) {
	raw = strings.TrimSpace(raw)

	// Strip a leading chapter-number index that novelfire sometimes prepends.
	// It looks like "1 " or "12 " at the very start.
	if idx := strings.IndexAny(raw, " \t"); idx > 0 {
		allDigit := true
		for _, c := range raw[:idx] {
			if c < '0' || c > '9' {
				allDigit = false
				break
			}
		}
		if allDigit {
			raw = strings.TrimSpace(raw[idx:])
		}
	}

	// m[0] is where the date (and any whitespace before it) starts;
	// m[2]:m[3] is the date text itself.
	if m := relativeDateRe.FindStringSubmatchIndex(raw); m != nil {
		return strings.TrimSpace(raw[:m[0]]), strings.TrimSpace(raw[m[2]:m[3]])
	}
	return raw, ""
}
|
|
|
|
// ReadChapter returns the raw markdown content for chapter number n of slug.
|
|
func (w *Writer) ReadChapter(slug string, n int) (string, error) {
|
|
// Reconstruct path using the same bucketing formula as chapterPath.
|
|
ref := scraper.ChapterRef{Number: n, Volume: 0}
|
|
path := w.chapterPath(slug, ref)
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return "", fmt.Errorf("writer: read chapter %d: %w", n, err)
|
|
}
|
|
return string(data), nil
|
|
}
|
|
|
|
// ─── Ranking ─────────────────────────────────────────────────────────────────
|
|
|
|
// RankingItem is one row of the ranking list. The struct tags define its
// YAML field names; optional fields are omitted from YAML when empty.
type RankingItem struct {
	// Rank is the position in the ranking.
	Rank int `yaml:"rank"`
	// Slug identifies the book.
	Slug string `yaml:"slug"`
	// Title is the display title of the book.
	Title string `yaml:"title"`
	// Author is optional.
	Author string `yaml:"author,omitempty"`
	// Cover is optional.
	Cover string `yaml:"cover,omitempty"`
	// Status is optional.
	Status string `yaml:"status,omitempty"`
	// Genres is optional.
	Genres []string `yaml:"genres,omitempty"`
	// SourceURL is optional.
	SourceURL string `yaml:"source_url,omitempty"`
}
|
|
|
|
// WriteRanking saves the ranking items as markdown to static/ranking.md.
|
|
func (w *Writer) WriteRanking(items []RankingItem) error {
|
|
path := filepath.Clean(w.rankingPath())
|
|
dir := filepath.Dir(path)
|
|
if err := os.MkdirAll(dir, 0o755); err != nil {
|
|
return fmt.Errorf("writer: mkdir %s: %w", dir, err)
|
|
}
|
|
|
|
var sb strings.Builder
|
|
sb.WriteString("# Novel Rankings\n\n")
|
|
sb.WriteString("| Rank | Title | Cover | Status | Genres | URL |\n")
|
|
sb.WriteString("|------|-------|-------|--------|--------|-----|\n")
|
|
for _, item := range items {
|
|
genres := strings.Join(item.Genres, ", ")
|
|
if genres == "" {
|
|
genres = "-"
|
|
}
|
|
sb.WriteString(fmt.Sprintf("| %d | %s | %s | %s | %s | %s |\n",
|
|
item.Rank, item.Title, item.Cover, item.Status, genres, item.SourceURL))
|
|
}
|
|
|
|
if err := os.WriteFile(path, []byte(sb.String()), 0o644); err != nil {
|
|
return fmt.Errorf("writer: write ranking %s: %w", path, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ReadRanking reads the ranking.md file if it exists.
|
|
func (w *Writer) ReadRanking() (string, error) {
|
|
path := w.rankingPath()
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return "", nil
|
|
}
|
|
return "", fmt.Errorf("writer: read ranking: %w", err)
|
|
}
|
|
return string(data), nil
|
|
}
|
|
|
|
// ReadRankingItems parses ranking.md back into a slice of RankingItem.
|
|
// Returns nil slice (not an error) when the file does not exist yet.
|
|
func (w *Writer) ReadRankingItems() ([]RankingItem, error) {
|
|
markdown, err := w.ReadRanking()
|
|
if err != nil || markdown == "" {
|
|
return nil, err
|
|
}
|
|
|
|
var items []RankingItem
|
|
for _, line := range strings.Split(markdown, "\n") {
|
|
// Only process data rows: start and end with '|', not header/separator rows.
|
|
line = strings.TrimSpace(line)
|
|
if !strings.HasPrefix(line, "|") || !strings.HasSuffix(line, "|") {
|
|
continue
|
|
}
|
|
// Strip leading/trailing '|' and split on '|'.
|
|
inner := strings.TrimPrefix(strings.TrimSuffix(line, "|"), "|")
|
|
cols := strings.Split(inner, "|")
|
|
if len(cols) < 6 {
|
|
continue
|
|
}
|
|
for i, c := range cols {
|
|
cols[i] = strings.TrimSpace(c)
|
|
}
|
|
// Skip header row and separator row.
|
|
if cols[0] == "Rank" || strings.HasPrefix(cols[0], "---") {
|
|
continue
|
|
}
|
|
rank, err := strconv.Atoi(cols[0])
|
|
if err != nil {
|
|
continue
|
|
}
|
|
title := cols[1]
|
|
cover := cols[2]
|
|
status := cols[3]
|
|
genresStr := cols[4]
|
|
sourceURL := cols[5]
|
|
|
|
var genres []string
|
|
if genresStr != "-" && genresStr != "" {
|
|
for _, g := range strings.Split(genresStr, ",") {
|
|
g = strings.TrimSpace(g)
|
|
if g != "" {
|
|
genres = append(genres, g)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Derive slug from source URL (last path segment).
|
|
slug := ""
|
|
if sourceURL != "" {
|
|
parts := strings.Split(strings.TrimRight(sourceURL, "/"), "/")
|
|
if len(parts) > 0 {
|
|
slug = parts[len(parts)-1]
|
|
}
|
|
}
|
|
|
|
items = append(items, RankingItem{
|
|
Rank: rank,
|
|
Slug: slug,
|
|
Title: title,
|
|
Cover: cover,
|
|
Status: status,
|
|
Genres: genres,
|
|
SourceURL: sourceURL,
|
|
})
|
|
}
|
|
return items, nil
|
|
}
|
|
|
|
// RankingFileInfo returns os.FileInfo for the ranking.md file, if it exists.
|
|
func (w *Writer) RankingFileInfo() (os.FileInfo, error) {
|
|
return os.Stat(w.rankingPath())
|
|
}
|
|
|
|
func (w *Writer) rankingPath() string {
|
|
return filepath.Join(w.root, "ranking.md")
|
|
}
|
|
|
|
// bookDir returns the root directory for a book slug.
|
|
func (w *Writer) bookDir(slug string) string {
|
|
return filepath.Join(w.root, slug)
|
|
}
|
|
|
|
// chapterPath computes the full file path for a chapter.
|
|
//
|
|
// vol-{volume}/{folderRange}/chapter-{number}.md
|
|
//
|
|
// Example: vol-0/1-50/chapter-1.md, vol-0/51-100/chapter-51.md
|
|
func (w *Writer) chapterPath(slug string, ref scraper.ChapterRef) string {
|
|
vol := ref.Volume // 0 == no volume grouping
|
|
volDir := fmt.Sprintf("vol-%d", vol)
|
|
|
|
// Folder group: chapters 1-50 → "1-50", 51-100 → "51-100", …
|
|
lo := ((ref.Number-1)/chaptersPerFolder)*chaptersPerFolder + 1
|
|
hi := lo + chaptersPerFolder - 1
|
|
rangeDir := fmt.Sprintf("%d-%d", lo, hi)
|
|
|
|
filename := fmt.Sprintf("chapter-%d.md", ref.Number)
|
|
|
|
return filepath.Join(w.bookDir(slug), volDir, rangeDir, filename)
|
|
}
|