// libnovel/scraper/internal/writer/writer.go

// Package writer handles persistence of scraped chapters and metadata.
//
// Directory layout:
//
//	static/books/
//	├── ranking.md          (written by WriteRanking)
//	├── {book-slug}/
//	│   ├── metadata.yaml
//	│   ├── vol-0/          (no volume grouping)
//	│   │   ├── 1-50/
//	│   │   │   ├── chapter-1.md
//	│   │   │   └── …
//	│   │   └── 51-100/
//	│   │       └── …
//	│   └── vol-1/
//	│       └── …
package writer

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"github.com/libnovel/scraper/internal/scraper"
	"gopkg.in/yaml.v3"
)

// chaptersPerFolder is the number of consecutive chapter numbers grouped into
// one range directory (e.g. "1-50", "51-100") by chapterPath.
const chaptersPerFolder = 50

// Writer persists scraped content under a configurable root directory.
// Every path it touches — per-book directories and the ranking file — is
// built by joining onto root.
type Writer struct {
	root string // e.g. "./static/books"
}

// New returns a Writer whose files all live beneath root.
// It performs no I/O; directories are created lazily by the write methods.
func New(root string) *Writer {
	w := new(Writer)
	w.root = root
	return w
}

// ─── Metadata ─────────────────────────────────────────────────────────────────

// WriteMetadata serialises meta as YAML to {root}/{meta.Slug}/metadata.yaml.
//
// The book directory is created if it does not exist and any existing
// metadata file is truncated and overwritten. The returned error is non-nil
// when the directory or file cannot be created, when encoding fails, or when
// flushing the encoder or closing the file fails. Every error wraps the
// underlying cause with %w.
func (w *Writer) WriteMetadata(meta scraper.BookMeta) (err error) {
	dir := w.bookDir(meta.Slug)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("writer: mkdir %s: %w", dir, err)
	}

	path := filepath.Join(dir, "metadata.yaml")
	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("writer: create metadata %s: %w", path, err)
	}
	// A failed Close on a file we just wrote can mean lost data, so surface
	// it unless an earlier error is already being returned.
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = fmt.Errorf("writer: close metadata %s: %w", path, cerr)
		}
	}()

	enc := yaml.NewEncoder(f)
	enc.SetIndent(2)
	if err := enc.Encode(meta); err != nil {
		return fmt.Errorf("writer: encode metadata: %w", err)
	}
	// Close flushes whatever the encoder still has buffered.
	if err := enc.Close(); err != nil {
		return fmt.Errorf("writer: flush metadata %s: %w", path, err)
	}
	return nil
}

// ReadMetadata loads and decodes {root}/{slug}/metadata.yaml.
//
// The boolean reports whether a metadata file was found:
//   - file missing:            (zero value, false, nil)
//   - file unreadable:         (zero value, false, wrapped error)
//   - file present but broken: (zero value, true,  wrapped error)
//   - success:                 (decoded meta, true, nil)
func (w *Writer) ReadMetadata(slug string) (scraper.BookMeta, bool, error) {
	path := filepath.Join(w.bookDir(slug), "metadata.yaml")

	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		if !os.IsNotExist(readErr) {
			return scraper.BookMeta{}, false, fmt.Errorf("writer: read metadata %s: %w", path, readErr)
		}
		// A missing file is an expected state, not a failure.
		return scraper.BookMeta{}, false, nil
	}

	var decoded scraper.BookMeta
	if decodeErr := yaml.Unmarshal(raw, &decoded); decodeErr != nil {
		return scraper.BookMeta{}, true, fmt.Errorf("writer: unmarshal metadata %s: %w", path, decodeErr)
	}
	return decoded, true, nil
}

// ─── Chapters ─────────────────────────────────────────────────────────────────

// ChapterExists reports whether the markdown file for ref is present on disk.
// Any Stat failure — not only "does not exist" — is reported as false.
func (w *Writer) ChapterExists(slug string, ref scraper.ChapterRef) bool {
	if _, err := os.Stat(w.chapterPath(slug, ref)); err != nil {
		return false
	}
	return true
}

// WriteChapter stores chapter as a markdown file at the location computed by
// chapterPath, creating parent directories as needed and overwriting any
// existing file.
//
// The document is a level-one heading holding chapter.Ref.Title, a blank
// line, chapter.Text, and a trailing newline.
func (w *Writer) WriteChapter(slug string, chapter scraper.Chapter) error {
	target := w.chapterPath(slug, chapter.Ref)

	parent := filepath.Dir(target)
	if err := os.MkdirAll(parent, 0o755); err != nil {
		return fmt.Errorf("writer: mkdir %s: %w", parent, err)
	}

	// Render heading + body in one go.
	var doc strings.Builder
	fmt.Fprintf(&doc, "# %s\n\n%s\n", chapter.Ref.Title, chapter.Text)

	if err := os.WriteFile(target, []byte(doc.String()), 0o644); err != nil {
		return fmt.Errorf("writer: write chapter %s: %w", target, err)
	}
	return nil
}

// ─── Catalogue helpers ────────────────────────────────────────────────────────

// ListBooks returns the metadata of every book directory under root that has
// a readable, well-formed metadata.yaml, sorted by title.
//
// Listing is best-effort: non-directory entries, directories without a
// metadata file, and books whose metadata cannot be read or parsed are
// skipped silently. A missing root directory yields (nil, nil); any other
// failure to read root is returned wrapped.
func (w *Writer) ListBooks() ([]scraper.BookMeta, error) {
	entries, err := os.ReadDir(w.root)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, nil
		}
		return nil, fmt.Errorf("writer: list books: %w", err)
	}
	var books []scraper.BookMeta
	for _, e := range entries {
		if !e.IsDir() {
			continue
		}
		// ReadMetadata reports ok == true together with an error when the
		// file exists but fails to parse, so both results must be checked;
		// otherwise a zero-value book would be listed.
		meta, ok, err := w.ReadMetadata(e.Name())
		if err != nil || !ok {
			continue
		}
		books = append(books, meta)
	}
	sort.Slice(books, func(i, j int) bool {
		return books[i].Title < books[j].Title
	})
	return books, nil
}

// LocalSlugs returns the set of directory names under root that contain a
// metadata.yaml. Only file existence is checked (via Stat); the YAML is never
// opened or parsed, which makes this cheaper than ListBooks.
//
// The result is always non-nil. If root cannot be read the set is empty.
func (w *Writer) LocalSlugs() map[string]bool {
	entries, err := os.ReadDir(w.root)
	if err != nil {
		return map[string]bool{}
	}

	present := make(map[string]bool, len(entries))
	for _, entry := range entries {
		if entry.IsDir() {
			name := entry.Name()
			if _, statErr := os.Stat(filepath.Join(w.root, name, "metadata.yaml")); statErr == nil {
				present[name] = true
			}
		}
	}
	return present
}

// ChapterInfo is a lightweight chapter descriptor derived from on-disk files.
// ListChapters fills Number from the file name (chapter-N.md) and Title/Date
// from the first non-empty line of the file.
type ChapterInfo struct {
	Number int
	Title  string // chapter name, cleaned of number prefix and trailing date
	Date   string // relative date scraped alongside the title, e.g. "1 year ago"; empty when the heading has none
}

// ListChapters returns every chapter found on disk for slug, ordered by
// chapter number.
//
// Files are discovered with one hierarchical pattern,
// {book}/vol-*/*-*/chapter-*.md, so every volume and range folder is covered
// and directory names are matched literally rather than re-parsed as
// patterns. Files whose name does not hold an integer between "chapter-" and
// ".md" are ignored. Title and Date of each entry come from chapterTitle.
//
// A book with no chapter files (or no directory at all) yields a nil slice
// and a nil error. The only error returned is a malformed glob pattern, which
// can occur when the book directory path itself contains broken glob syntax.
func (w *Writer) ListChapters(slug string) ([]ChapterInfo, error) {
	pattern := filepath.Join(w.bookDir(slug), "vol-*", "*-*", "chapter-*.md")
	files, err := filepath.Glob(pattern)
	if err != nil {
		return nil, fmt.Errorf("writer: list chapters glob: %w", err)
	}

	var chapters []ChapterInfo
	for _, f := range files {
		base := filepath.Base(f) // chapter-N.md
		numStr := strings.TrimSuffix(strings.TrimPrefix(base, "chapter-"), ".md")
		n, err := strconv.Atoi(numStr)
		if err != nil {
			// Not a numbered chapter file (e.g. "chapter-draft.md").
			continue
		}
		title, date := chapterTitle(f, n)
		chapters = append(chapters, ChapterInfo{Number: n, Title: title, Date: date})
	}
	sort.Slice(chapters, func(i, j int) bool {
		return chapters[i].Number < chapters[j].Number
	})
	return chapters, nil
}

// chapterTitle extracts the heading of the markdown file at path.
//
// It takes the first non-blank line, drops a leading "# " marker, and hands
// the rest to splitChapterTitle to separate the title from a trailing date.
// The content is split into at most 10 pieces, so only the start of the file
// is examined line by line. When the file cannot be read or has no non-blank
// content, the title falls back to "Chapter N" with an empty date.
func chapterTitle(path string, n int) (title, date string) {
	fallback := fmt.Sprintf("Chapter %d", n)

	raw, err := os.ReadFile(path)
	if err != nil {
		return fallback, ""
	}
	for _, candidate := range strings.SplitN(string(raw), "\n", 10) {
		if heading := strings.TrimSpace(candidate); heading != "" {
			return splitChapterTitle(strings.TrimPrefix(heading, "# "))
		}
	}
	return fallback, ""
}

// trailingRelativeDateRe matches a relative date such as "3 months ago" at
// the very end of a heading, together with any whitespace around it. It is
// compiled once at package scope because splitChapterTitle runs for every
// chapter file that is listed.
var trailingRelativeDateRe = regexp.MustCompile(`\s*(\d+\s+(?:second|minute|hour|day|week|month|year)s?\s+ago)\s*$`)

// splitChapterTitle separates the human-readable chapter name from the
// trailing relative-date string that novelfire.net appends to the heading.
// Examples of raw heading text (after stripping "# "):
//
//	"1                                    Chapter 1 - 1: The Academy's Weakest1 year ago"
//	"2  Chapter 2 - Enter the Storm3 months ago"
//
// Processing happens in two steps:
//
//  1. If the text starts with a run of ASCII digits followed by a space or
//     tab, that index prefix and the whitespace after it are removed.
//  2. If the remainder ends in "<n> <unit>[s] ago" (unit being second,
//     minute, hour, day, week, month or year), that part is returned as date
//     and everything before it, trimmed, as title.
//
// When no date is found the whole (trimmed, prefix-stripped) text is the
// title and date is empty.
func splitChapterTitle(raw string) (title, date string) {
	raw = strings.TrimSpace(raw)

	// Strip a leading chapter-number index such as "1   " or "12  ".
	// idx > 0 guarantees a non-empty prefix; trimming all digits from it
	// leaves nothing exactly when the prefix consists of digits only.
	if idx := strings.IndexAny(raw, " \t"); idx > 0 && strings.TrimLeft(raw[:idx], "0123456789") == "" {
		raw = strings.TrimSpace(raw[idx:])
	}

	// m[0] is where the date match (including leading whitespace) starts;
	// m[2]:m[3] is the captured date text itself.
	if m := trailingRelativeDateRe.FindStringSubmatchIndex(raw); m != nil {
		return strings.TrimSpace(raw[:m[0]]), strings.TrimSpace(raw[m[2]:m[3]])
	}
	return raw, ""
}

// ReadChapter returns the raw markdown of chapter number n of slug.
//
// The file is located through chapterPath using a ChapterRef with Volume 0,
// so only the vol-0 tree is consulted. A read failure (including a missing
// file) is returned wrapped with the chapter number.
func (w *Writer) ReadChapter(slug string, n int) (string, error) {
	raw, err := os.ReadFile(w.chapterPath(slug, scraper.ChapterRef{Number: n, Volume: 0}))
	if err != nil {
		return "", fmt.Errorf("writer: read chapter %d: %w", n, err)
	}
	return string(raw), nil
}

// ─── Ranking ─────────────────────────────────────────────────────────────────

// RankingItem represents a single entry in the ranking.
//
// The markdown table produced by WriteRanking stores only Rank, Title, Cover,
// Status, Genres and SourceURL. ReadRankingItems rebuilds Slug from the last
// path segment of SourceURL and leaves Author empty.
type RankingItem struct {
	Rank      int      `yaml:"rank"`
	Slug      string   `yaml:"slug"`
	Title     string   `yaml:"title"`
	Author    string   `yaml:"author,omitempty"`
	Cover     string   `yaml:"cover,omitempty"`
	Status    string   `yaml:"status,omitempty"`
	Genres    []string `yaml:"genres,omitempty"`
	SourceURL string   `yaml:"source_url,omitempty"`
}

// WriteRanking renders items as a markdown table and writes it to
// {root}/ranking.md, creating the directory if needed and replacing any
// existing file.
//
// Each row holds rank, title, cover, status, genres (comma-separated, or "-"
// when there are none) and source URL. Values are written verbatim: a "|" or
// a newline inside a field is not escaped.
func (w *Writer) WriteRanking(items []RankingItem) error {
	path := filepath.Clean(w.rankingPath())

	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("writer: mkdir %s: %w", dir, err)
	}

	var doc strings.Builder
	doc.WriteString("# Novel Rankings\n\n" +
		"| Rank | Title | Cover | Status | Genres | URL |\n" +
		"|------|-------|-------|--------|--------|-----|\n")
	for _, it := range items {
		genreCell := "-"
		if joined := strings.Join(it.Genres, ", "); joined != "" {
			genreCell = joined
		}
		fmt.Fprintf(&doc, "| %d | %s | %s | %s | %s | %s |\n",
			it.Rank, it.Title, it.Cover, it.Status, genreCell, it.SourceURL)
	}

	if err := os.WriteFile(path, []byte(doc.String()), 0o644); err != nil {
		return fmt.Errorf("writer: write ranking %s: %w", path, err)
	}
	return nil
}

// ReadRanking returns the full text of {root}/ranking.md.
// A missing file is not an error: it yields ("", nil). Any other read
// failure is returned wrapped.
func (w *Writer) ReadRanking() (string, error) {
	raw, err := os.ReadFile(w.rankingPath())
	switch {
	case err == nil:
		return string(raw), nil
	case os.IsNotExist(err):
		return "", nil
	default:
		return "", fmt.Errorf("writer: read ranking: %w", err)
	}
}

// ReadRankingItems parses the markdown table in ranking.md back into
// RankingItem values.
//
// A line counts as a data row when, after trimming, it starts and ends with
// "|", has at least six "|"-separated cells, and its first cell is an
// integer; the header and separator rows and anything else are skipped.
// Cells map, in order, to Rank, Title, Cover, Status, Genres and SourceURL.
// Genres is split on commas ("-" or empty means none) and Slug is the last
// path segment of SourceURL once trailing slashes are removed.
//
// It returns (nil, nil) when the file is missing or empty, and passes through
// any error from ReadRanking.
func (w *Writer) ReadRankingItems() ([]RankingItem, error) {
	markdown, err := w.ReadRanking()
	if err != nil {
		return nil, err
	}
	if markdown == "" {
		return nil, nil
	}

	var items []RankingItem
	for _, row := range strings.Split(markdown, "\n") {
		row = strings.TrimSpace(row)
		if !(strings.HasPrefix(row, "|") && strings.HasSuffix(row, "|")) {
			continue
		}

		// Drop the outer pipes, then cut the row into trimmed cells.
		cells := strings.Split(strings.TrimPrefix(strings.TrimSuffix(row, "|"), "|"), "|")
		if len(cells) < 6 {
			continue
		}
		for i := range cells {
			cells[i] = strings.TrimSpace(cells[i])
		}

		// Header and separator rows never carry a numeric rank.
		if cells[0] == "Rank" || strings.HasPrefix(cells[0], "---") {
			continue
		}
		rank, convErr := strconv.Atoi(cells[0])
		if convErr != nil {
			continue
		}

		item := RankingItem{
			Rank:      rank,
			Title:     cells[1],
			Cover:     cells[2],
			Status:    cells[3],
			SourceURL: cells[5],
		}

		if list := cells[4]; list != "" && list != "-" {
			for _, g := range strings.Split(list, ",") {
				if g = strings.TrimSpace(g); g != "" {
					item.Genres = append(item.Genres, g)
				}
			}
		}

		// The slug is whatever follows the final "/" of the URL.
		if item.SourceURL != "" {
			trimmed := strings.TrimRight(item.SourceURL, "/")
			item.Slug = trimmed[strings.LastIndex(trimmed, "/")+1:]
		}

		items = append(items, item)
	}
	return items, nil
}

// RankingFileInfo stats {root}/ranking.md and returns the result unchanged,
// so a missing file surfaces as the error reported by os.Stat.
func (w *Writer) RankingFileInfo() (os.FileInfo, error) {
	path := w.rankingPath()
	return os.Stat(path)
}

// rankingPath returns the location of the ranking file: {root}/ranking.md.
func (w *Writer) rankingPath() string {
	const fileName = "ranking.md"
	return filepath.Join(w.root, fileName)
}

// bookDir returns the directory that holds everything for one book:
// {root}/{slug}. The slug is joined as-is; it is not validated here, so a
// slug containing path separators or ".." resolves accordingly.
func (w *Writer) bookDir(slug string) string {
	dir := filepath.Join(w.root, slug)
	return dir
}

// chapterPath builds the on-disk location of a chapter:
//
//	{root}/{slug}/vol-{volume}/{first}-{last}/chapter-{number}.md
//
// Chapters are bucketed chaptersPerFolder at a time, so with a bucket size of
// 50 chapters 1-50 go to "1-50", 51-100 to "51-100", and so on. Volume 0
// means "no volume grouping". Because Go integer division truncates toward
// zero, chapter 0 also lands in the first bucket.
//
// Example: vol-0/1-50/chapter-1.md, vol-0/51-100/chapter-51.md
func (w *Writer) chapterPath(slug string, ref scraper.ChapterRef) string {
	// Zero-based bucket index, then the inclusive chapter range it covers.
	bucket := (ref.Number - 1) / chaptersPerFolder
	first := bucket*chaptersPerFolder + 1
	last := first + chaptersPerFolder - 1

	return filepath.Join(
		w.bookDir(slug),
		fmt.Sprintf("vol-%d", ref.Volume),
		fmt.Sprintf("%d-%d", first, last),
		fmt.Sprintf("chapter-%d.md", ref.Number),
	)
}