Files
libnovel/scraper/internal/writer/writer.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Add cover extraction fix (figure.cover > img rather than bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

429 lines
13 KiB
Go

// Package writer handles persistence of scraped chapters and metadata.
//
// Directory layout:
//
// static/books/
// ├── {book-slug}/
// │ ├── metadata.yaml
// │ ├── vol-0/ (no volume grouping)
// │ │ ├── 1-50/
// │ │ │ ├── chapter-1.md
// │ │ │ └── …
// │ │ └── 51-100/
// │ │ └── …
// │ └── vol-1/
// │ └── …
package writer
import (
"fmt"
"os"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"github.com/libnovel/scraper/internal/scraper"
"gopkg.in/yaml.v3"
)
// chaptersPerFolder is the number of chapters grouped into one range
// directory (e.g. "1-50", "51-100"); chapterPath uses it to pick the folder.
const chaptersPerFolder = 50
// Writer persists scraped content under a configurable root directory.
// Book directories and the ranking file are both located by joining their
// names onto root.
type Writer struct {
root string // e.g. "./static/books"
}
// New returns a Writer whose files all live beneath root.
// The directory is not created here; the write methods create what they need.
func New(root string) *Writer {
	w := Writer{root: root}
	return &w
}
// ─── Metadata ─────────────────────────────────────────────────────────────────
// WriteMetadata serialises meta as YAML to {root}/{meta.Slug}/metadata.yaml.
//
// The book directory is created if it does not exist, and any existing
// metadata file is truncated and overwritten. An error is returned when the
// directory or file cannot be created, when encoding fails, or when flushing
// or closing the file fails.
func (w *Writer) WriteMetadata(meta scraper.BookMeta) error {
	dir := w.bookDir(meta.Slug)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("writer: mkdir %s: %w", dir, err)
	}

	path := filepath.Join(dir, "metadata.yaml")
	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("writer: create metadata %s: %w", path, err)
	}

	enc := yaml.NewEncoder(f)
	enc.SetIndent(2)
	if err := enc.Encode(meta); err != nil {
		// Best-effort close: the encode failure is the error worth reporting.
		_ = f.Close()
		return fmt.Errorf("writer: encode metadata: %w", err)
	}
	if err := enc.Close(); err != nil {
		// Best-effort close: the flush failure is the error worth reporting.
		_ = f.Close()
		return fmt.Errorf("writer: flush metadata %s: %w", path, err)
	}

	// Close explicitly instead of deferring so a failed close on a file we
	// just wrote is surfaced to the caller rather than silently dropped.
	if err := f.Close(); err != nil {
		return fmt.Errorf("writer: close metadata %s: %w", path, err)
	}
	return nil
}
// ReadMetadata loads {root}/{slug}/metadata.yaml.
//
// The boolean reports whether the file was found:
//   - file missing:       (zero value, false, nil)
//   - other read failure: (zero value, false, err)
//   - YAML decode failure: (zero value, true, err)
//   - success:            (metadata, true, nil)
func (w *Writer) ReadMetadata(slug string) (scraper.BookMeta, bool, error) {
	metaFile := filepath.Join(w.bookDir(slug), "metadata.yaml")

	raw, readErr := os.ReadFile(metaFile)
	switch {
	case readErr == nil:
		// File was read; decode it below.
	case os.IsNotExist(readErr):
		return scraper.BookMeta{}, false, nil
	default:
		return scraper.BookMeta{}, false, fmt.Errorf("writer: read metadata %s: %w", metaFile, readErr)
	}

	var decoded scraper.BookMeta
	if decodeErr := yaml.Unmarshal(raw, &decoded); decodeErr != nil {
		return scraper.BookMeta{}, true, fmt.Errorf("writer: unmarshal metadata %s: %w", metaFile, decodeErr)
	}
	return decoded, true, nil
}
// ─── Chapters ─────────────────────────────────────────────────────────────────
// ChapterExists reports whether the markdown file for ref can be stat'ed.
// Any stat failure, not only "file not found", yields false.
func (w *Writer) ChapterExists(slug string, ref scraper.ChapterRef) bool {
	if _, statErr := os.Stat(w.chapterPath(slug, ref)); statErr != nil {
		return false
	}
	return true
}
// WriteChapter stores a chapter as markdown at the location chosen by
// chapterPath, creating missing parent directories first.
//
// The file consists of a "# <title>" heading, a blank line, the chapter
// text, and a final newline. An existing file is overwritten.
func (w *Writer) WriteChapter(slug string, chapter scraper.Chapter) error {
	target := w.chapterPath(slug, chapter.Ref)

	parent := filepath.Dir(target)
	if mkErr := os.MkdirAll(parent, 0o755); mkErr != nil {
		return fmt.Errorf("writer: mkdir %s: %w", parent, mkErr)
	}

	// Heading, blank line, body, trailing newline.
	doc := "# " + chapter.Ref.Title + "\n\n" + chapter.Text + "\n"

	if wrErr := os.WriteFile(target, []byte(doc), 0o644); wrErr != nil {
		return fmt.Errorf("writer: write chapter %s: %w", target, wrErr)
	}
	return nil
}
// ─── Catalogue helpers ────────────────────────────────────────────────────────
// ListBooks returns the metadata of every book directory under root that has
// a readable, parseable metadata.yaml, sorted by Title in ascending order.
//
// A missing root directory yields (nil, nil). Non-directory entries are
// ignored. Books whose metadata file is absent, unreadable, or fails to parse
// are skipped without reporting an error.
func (w *Writer) ListBooks() ([]scraper.BookMeta, error) {
	entries, err := os.ReadDir(w.root)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, nil
		}
		return nil, fmt.Errorf("writer: list books: %w", err)
	}

	var books []scraper.BookMeta
	for _, e := range entries {
		if !e.IsDir() {
			continue
		}
		// ReadMetadata reports ok=true together with an error when the file
		// exists but cannot be decoded, so the error must be checked as well;
		// otherwise a corrupt book would be listed as an empty BookMeta.
		meta, ok, err := w.ReadMetadata(e.Name())
		if err != nil || !ok {
			continue
		}
		books = append(books, meta)
	}

	sort.Slice(books, func(i, j int) bool {
		return books[i].Title < books[j].Title
	})
	return books, nil
}
// LocalSlugs returns the set of directory names under root that contain a
// metadata.yaml file. Only the file's existence is checked; the YAML is never
// read or parsed, which makes this cheaper than ListBooks.
//
// If root cannot be listed, an empty (non-nil) map is returned.
func (w *Writer) LocalSlugs() map[string]bool {
	dirEntries, readErr := os.ReadDir(w.root)
	if readErr != nil {
		return map[string]bool{}
	}

	found := make(map[string]bool, len(dirEntries))
	for _, entry := range dirEntries {
		if entry.IsDir() {
			name := entry.Name()
			if _, statErr := os.Stat(filepath.Join(w.root, name, "metadata.yaml")); statErr == nil {
				found[name] = true
			}
		}
	}
	return found
}
// ChapterInfo is a lightweight chapter descriptor derived from on-disk files.
// ListChapters builds one per chapter file from the file name and the first
// non-empty line of the file.
type ChapterInfo struct {
Number int // chapter number parsed from the "chapter-N.md" file name
Title string // chapter name, cleaned of number prefix and trailing date
Date string // relative date scraped alongside the title, e.g. "1 year ago"
}
// ListChapters scans every vol-*/<lo>-<hi>/chapter-N.md file of the book and
// returns one ChapterInfo per file, ordered by ascending chapter number.
//
// Files whose name does not carry an integer N are skipped. Titles and dates
// come from chapterTitle. Only a failure of the top-level volume glob is
// reported as an error.
func (w *Writer) ListChapters(slug string) ([]ChapterInfo, error) {
	volumes, err := filepath.Glob(filepath.Join(w.bookDir(slug), "vol-*"))
	if err != nil {
		return nil, fmt.Errorf("writer: list chapters glob: %w", err)
	}

	var found []ChapterInfo
	for _, volume := range volumes {
		// Errors from the nested globs are deliberately ignored; a failing
		// glob simply contributes no entries.
		buckets, _ := filepath.Glob(filepath.Join(volume, "*-*"))
		for _, bucket := range buckets {
			mdFiles, _ := filepath.Glob(filepath.Join(bucket, "chapter-*.md"))
			for _, mdFile := range mdFiles {
				// File name has the shape chapter-N.md; isolate N.
				digits := strings.TrimPrefix(filepath.Base(mdFile), "chapter-")
				digits = strings.TrimSuffix(digits, ".md")

				num, convErr := strconv.Atoi(digits)
				if convErr != nil {
					continue
				}

				heading, when := chapterTitle(mdFile, num)
				found = append(found, ChapterInfo{Number: num, Title: heading, Date: when})
			}
		}
	}

	sort.Slice(found, func(a, b int) bool {
		return found[a].Number < found[b].Number
	})
	return found, nil
}
// chapterTitle extracts the title and relative date from a chapter file.
//
// The file content is split into at most ten newline-separated pieces; the
// first piece that is non-empty after trimming has a leading "# " removed and
// is handed to splitChapterTitle. When the file cannot be read, or no
// non-empty piece is found, the result is ("Chapter N", "").
func chapterTitle(path string, n int) (title, date string) {
	content, readErr := os.ReadFile(path)
	if readErr == nil {
		for _, candidate := range strings.SplitN(string(content), "\n", 10) {
			if trimmed := strings.TrimSpace(candidate); trimmed != "" {
				return splitChapterTitle(strings.TrimPrefix(trimmed, "# "))
			}
		}
	}
	return fmt.Sprintf("Chapter %d", n), ""
}
// relativeDateRe matches a trailing relative date of the form
// "<n> <unit>[s] ago", together with any whitespace around it. Capture group 1
// is the date itself. It is compiled once at package scope because
// splitChapterTitle runs for every chapter file that gets listed.
var relativeDateRe = regexp.MustCompile(`\s*(\d+\s+(?:second|minute|hour|day|week|month|year)s?\s+ago)\s*$`)

// splitChapterTitle separates the human-readable chapter name from the
// trailing relative-date string that novelfire.net appends to the heading.
// Examples of raw heading text (after stripping "# "):
//
//	"1 Chapter 1 - 1: The Academy's Weakest1 year ago"
//	"2 Chapter 2 - Enter the Storm3 months ago"
//
// Processing steps:
//  1. Surrounding whitespace is trimmed.
//  2. If the text before the first space or tab consists solely of ASCII
//     digits, that leading index and the whitespace after it are dropped.
//  3. A trailing "<n> <unit>[s] ago" is split off and returned as date.
//
// When no trailing date is present, date is "" and title is the text left
// after steps 1 and 2.
func splitChapterTitle(raw string) (title, date string) {
	raw = strings.TrimSpace(raw)

	// Strip a leading chapter-number index such as "1 " or "12 ".
	if idx := strings.IndexAny(raw, " \t"); idx > 0 {
		allDigit := true
		for _, c := range raw[:idx] {
			if c < '0' || c > '9' {
				allDigit = false
				break
			}
		}
		if allDigit {
			raw = strings.TrimSpace(raw[idx:])
		}
	}

	// m[0] is where the date match (including leading whitespace) begins;
	// m[2]:m[3] bounds the captured date text.
	if m := relativeDateRe.FindStringSubmatchIndex(raw); m != nil {
		return strings.TrimSpace(raw[:m[0]]), strings.TrimSpace(raw[m[2]:m[3]])
	}
	return raw, ""
}
// ReadChapter returns the raw markdown of chapter n of the given book.
//
// The file is located with chapterPath using Volume 0, so only chapters
// stored under vol-0 are found by this method.
func (w *Writer) ReadChapter(slug string, n int) (string, error) {
	location := w.chapterPath(slug, scraper.ChapterRef{Number: n, Volume: 0})

	content, readErr := os.ReadFile(location)
	if readErr == nil {
		return string(content), nil
	}
	return "", fmt.Errorf("writer: read chapter %d: %w", n, readErr)
}
// ─── Ranking ─────────────────────────────────────────────────────────────────
// RankingItem represents a single entry in the ranking.
//
// WriteRanking stores Rank, Title, Cover, Status, Genres and SourceURL in the
// markdown table; Author and Slug are not written. ReadRankingItems
// re-derives Slug from the last path segment of SourceURL and leaves Author
// empty.
type RankingItem struct {
Rank int `yaml:"rank"`
Slug string `yaml:"slug"`
Title string `yaml:"title"`
Author string `yaml:"author,omitempty"`
Cover string `yaml:"cover,omitempty"`
Status string `yaml:"status,omitempty"`
Genres []string `yaml:"genres,omitempty"` // joined with ", " in ranking.md; "-" when empty
SourceURL string `yaml:"source_url,omitempty"`
}
// WriteRanking saves the ranking items as a markdown table to
// {root}/ranking.md, creating the directory if needed and overwriting any
// existing file.
//
// Each item becomes one row with the columns Rank, Title, Cover, Status,
// Genres and URL. Genres are joined with ", " and written as "-" when empty.
// Author and Slug are not written.
//
// Because the table is parsed back by splitting rows on '|', text cells are
// sanitised before writing: a literal '|' is written as the HTML entity
// "&#124;" and line breaks are replaced by a space. Without this, one stray
// delimiter in a title would shift every following column of that row.
func (w *Writer) WriteRanking(items []RankingItem) error {
	path := w.rankingPath()
	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("writer: mkdir %s: %w", dir, err)
	}

	// "\r\n" is listed before "\r" so a CRLF collapses into a single space.
	cell := strings.NewReplacer(
		"|", "&#124;",
		"\r\n", " ",
		"\n", " ",
		"\r", " ",
	).Replace

	var sb strings.Builder
	sb.WriteString("# Novel Rankings\n\n")
	sb.WriteString("| Rank | Title | Cover | Status | Genres | URL |\n")
	sb.WriteString("|------|-------|-------|--------|--------|-----|\n")
	for _, item := range items {
		genres := strings.Join(item.Genres, ", ")
		if genres == "" {
			genres = "-"
		}
		fmt.Fprintf(&sb, "| %d | %s | %s | %s | %s | %s |\n",
			item.Rank, cell(item.Title), cell(item.Cover), cell(item.Status), cell(genres), cell(item.SourceURL))
	}

	if err := os.WriteFile(path, []byte(sb.String()), 0o644); err != nil {
		return fmt.Errorf("writer: write ranking %s: %w", path, err)
	}
	return nil
}
// ReadRanking returns the full text of the ranking.md file.
// A missing file is not an error: the result is then ("", nil).
func (w *Writer) ReadRanking() (string, error) {
	content, readErr := os.ReadFile(w.rankingPath())
	switch {
	case readErr == nil:
		return string(content), nil
	case os.IsNotExist(readErr):
		return "", nil
	default:
		return "", fmt.Errorf("writer: read ranking: %w", readErr)
	}
}
// ReadRankingItems parses the markdown table in ranking.md into RankingItems.
//
// A missing or empty file yields (nil, nil). Only lines that start and end
// with '|' and have at least six cells are considered; the header row, the
// separator row and rows whose first cell is not an integer are skipped.
// Cells are read in the order Rank, Title, Cover, Status, Genres, URL.
// Genres are split on ',' ("-" or an empty cell means none) and Slug is the
// last path segment of the URL after trailing slashes are removed.
func (w *Writer) ReadRankingItems() ([]RankingItem, error) {
	doc, err := w.ReadRanking()
	if err != nil {
		return nil, err
	}
	if doc == "" {
		return nil, nil
	}

	var parsed []RankingItem
	for _, rawLine := range strings.Split(doc, "\n") {
		row := strings.TrimSpace(rawLine)
		if !(strings.HasPrefix(row, "|") && strings.HasSuffix(row, "|")) {
			continue
		}

		// Drop the outer delimiters, then break the row into trimmed cells.
		body := strings.TrimSuffix(row, "|")
		body = strings.TrimPrefix(body, "|")
		cells := strings.Split(body, "|")
		if len(cells) < 6 {
			continue
		}
		for idx := range cells {
			cells[idx] = strings.TrimSpace(cells[idx])
		}

		// Header and separator rows carry no data.
		if cells[0] == "Rank" || strings.HasPrefix(cells[0], "---") {
			continue
		}
		rank, convErr := strconv.Atoi(cells[0])
		if convErr != nil {
			continue
		}

		var genres []string
		if list := cells[4]; list != "" && list != "-" {
			for _, piece := range strings.Split(list, ",") {
				if name := strings.TrimSpace(piece); name != "" {
					genres = append(genres, name)
				}
			}
		}

		// The slug is whatever follows the final '/' of the URL.
		link := cells[5]
		slug := ""
		if link != "" {
			trimmed := strings.TrimRight(link, "/")
			slug = trimmed[strings.LastIndex(trimmed, "/")+1:]
		}

		parsed = append(parsed, RankingItem{
			Rank:      rank,
			Slug:      slug,
			Title:     cells[1],
			Cover:     cells[2],
			Status:    cells[3],
			Genres:    genres,
			SourceURL: link,
		})
	}
	return parsed, nil
}
// RankingFileInfo stats the ranking.md file and passes the result of os.Stat
// through unchanged, including the error when the file does not exist.
func (w *Writer) RankingFileInfo() (os.FileInfo, error) {
	info, statErr := os.Stat(w.rankingPath())
	return info, statErr
}
// rankingPath returns the location of the ranking file: {root}/ranking.md.
func (w *Writer) rankingPath() string {
	const rankingFile = "ranking.md"
	return filepath.Join(w.root, rankingFile)
}
// bookDir returns the directory holding everything for one book: {root}/{slug}.
func (w *Writer) bookDir(slug string) string {
	dir := filepath.Join(w.root, slug)
	return dir
}
// chapterPath builds the file path of a chapter inside its book directory:
//
//	{root}/{slug}/vol-{volume}/{lo}-{hi}/chapter-{number}.md
//
// Chapters are bucketed into folders of chaptersPerFolder consecutive
// numbers, e.g. vol-0/1-50/chapter-1.md and vol-0/51-100/chapter-51.md.
// Volume 0 means the book has no volume grouping.
func (w *Writer) chapterPath(slug string, ref scraper.ChapterRef) string {
	// Zero-based bucket index; chapters 1..50 fall into bucket 0.
	bucket := (ref.Number - 1) / chaptersPerFolder
	first := bucket*chaptersPerFolder + 1
	last := first + chaptersPerFolder - 1

	return filepath.Join(
		w.bookDir(slug),
		fmt.Sprintf("vol-%d", ref.Volume),
		fmt.Sprintf("%d-%d", first, last),
		fmt.Sprintf("chapter-%d.md", ref.Number),
	)
}