From 7879a51fe36605767edfb7cd0b6ba4a008444e9c Mon Sep 17 00:00:00 2001 From: Admin Date: Sun, 1 Mar 2026 12:25:16 +0500 Subject: [PATCH] feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix - Add Kokoro-FastAPI TTS integration to the chapter reader UI: - Browser-side MSE streaming with paragraph-level click-to-start - Voice selector, speed slider, auto-next with prefetch of the next chapter - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text - Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes) with local-library annotation and one-click scrape buttons - Add StrategyDirect (plain HTTP client) as a new browser strategy; the default strategy is now 'direct' for chapter fetching and 'content' for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY) - Fix chapter numbering bug: numbers are now derived from the URL path (/chapter-N) rather than list position, correcting newest-first ordering - Add 'refresh ' CLI sub-command to re-scrape a book from its saved source_url without knowing the original URL - Extend NovelScraper interface with RankingProvider (ScrapeRanking) - Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions timeout set to 60 s, content/scrape client defaults raised to 90 s - Add cover extraction fix (figure.cover > img rather than bare img.cover) - Add AGENTS.md and .aiignore for AI tooling context - Add integration tests for browser client and novelfire scraper (build tag: integration) and unit tests for chapterNumberFromURL and pagination --- .aiignore | 20 + .env.example | 22 +- .gitignore | 4 - AGENTS.md | 89 ++ docker-compose.yml | 29 +- scraper/cmd/scraper/main.go | 55 +- scraper/internal/browser/content_scrape.go | 4 +- scraper/internal/browser/http.go | 68 + scraper/internal/browser/integration_test.go | 152 ++ scraper/internal/browser/interface.go | 19 +- 
.../internal/novelfire/integration_test.go | 344 +++++ scraper/internal/novelfire/scraper.go | 277 +++- scraper/internal/novelfire/scraper_test.go | 217 +++ scraper/internal/scraper/interfaces.go | 10 + scraper/internal/server/server.go | 57 +- scraper/internal/server/ui.go | 1331 +++++++++++++++++ scraper/internal/writer/writer.go | 206 ++- 17 files changed, 2816 insertions(+), 88 deletions(-) create mode 100644 .aiignore create mode 100644 AGENTS.md create mode 100644 scraper/internal/browser/http.go create mode 100644 scraper/internal/browser/integration_test.go create mode 100644 scraper/internal/novelfire/integration_test.go create mode 100644 scraper/internal/novelfire/scraper_test.go create mode 100644 scraper/internal/server/ui.go diff --git a/.aiignore b/.aiignore new file mode 100644 index 0000000..10b81d0 --- /dev/null +++ b/.aiignore @@ -0,0 +1,20 @@ +# AI Indexing Ignore +# These directories/files are excluded from AI context indexing for speed + +# Generated/scraped content +scraper/static/ + +# Build artifacts +scraper/bin/ +*.exe + +# Dependencies (if using Go modules, the AI doesn't need vendor/) +# vendor/ + +# IDE +.idea/ +.vscode/ +*.swp + +# OS +.DS_Store diff --git a/.env.example b/.env.example index 2120f62..d1e4792 100644 --- a/.env.example +++ b/.env.example @@ -13,11 +13,29 @@ BROWSERLESS_QUEUED=100 # Per-session timeout in ms BROWSERLESS_TIMEOUT=60000 -# Which Browserless strategy the scraper uses: content | scrape | cdp -BROWSERLESS_STRATEGY=content +# Optional webhook URL for Browserless error alerts (leave empty to disable) +ERROR_ALERT_URL= + +# Which Browserless strategy the scraper uses: content | scrape | cdp | direct +BROWSERLESS_STRATEGY=direct + +# Strategy for URL retrieval (chapter list). Uses browserless content strategy by default. +# Set to direct to use plain HTTP, or content/scrape/cdp for browserless. 
+BROWSERLESS_URL_STRATEGY=content # Chapter worker goroutines (0 = NumCPU inside the container) SCRAPER_WORKERS=0 # Host path to mount as the static output directory STATIC_ROOT=./static/books + +# ── Kokoro-FastAPI TTS ──────────────────────────────────────────────────────── +# Base URL for the Kokoro-FastAPI service. When running via docker-compose the +# default (http://kokoro:8880) is wired in automatically; override here only if +# you are pointing at an external or GPU instance. +KOKORO_URL=http://kokoro:8880 + +# Default voice used for chapter narration. +# Single voices: af_bella, af_sky, af_heart, am_adam, … +# Mixed voices: af_bella+af_sky or af_bella(2)+af_sky(1) (weighted blend) +KOKORO_VOICE=af_bella diff --git a/.gitignore b/.gitignore index 9286c12..4f6ca69 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,3 @@ -# ── Compiled binary ──────────────────────────────────────────────────────────── -/scraper -/scraper-* - # ── Go toolchain ─────────────────────────────────────────────────────────────── *.test *.out diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..74ad802 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,89 @@ +# libnovel Project + +Go web scraper for novelfire.net with TTS support via Kokoro-FastAPI. 
+ +## Architecture + +``` +scraper/ +├── cmd/scraper/main.go # Entry point: 'run' (one-shot) and 'serve' (HTTP server) +├── internal/ +│ ├── orchestrator/orchestrator.go # Coordinates catalogue walk, metadata extraction, chapter scraping +│ ├── browser/ # Browser client (content/scrape/cdp strategies) via Browserless +│ ├── novelfire/scraper.go # novelfire.net specific scraping logic +│ ├── server/server.go # HTTP API (POST /scrape, POST /scrape/book) +│ ├── writer/writer.go # File writer (metadata.yaml, chapter .md files) +│ └── scraper/interfaces.go # NovelScraper interface definition +└── static/books/ # Output directory for scraped content +``` + +## Key Concepts + +- **Orchestrator**: Manages concurrency - catalogue streaming → per-book metadata goroutines → chapter worker pool +- **Browser Client**: 3 strategies (content/scrape/cdp) via Browserless Chrome container +- **Writer**: Writes metadata.yaml and chapter markdown files to `static/books/{slug}/vol-0/1-50/` +- **Server**: HTTP API with async scrape jobs, UI for browsing books/chapters, chapter-text endpoint for TTS + +## Commands + +```bash +# Build +cd scraper && go build -o bin/scraper ./cmd/scraper + +# One-shot scrape (full catalogue) +./bin/scraper run + +# Single book +./bin/scraper run --url https://novelfire.net/book/xxx + +# HTTP server +./bin/scraper serve + +# Tests +cd scraper && go test ./... 
+``` + +## Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| BROWSERLESS_URL | Browserless Chrome endpoint | http://localhost:3000 | +| BROWSERLESS_STRATEGY | content \| scrape \| cdp | content | +| SCRAPER_WORKERS | Chapter goroutines | NumCPU | +| SCRAPER_STATIC_ROOT | Output directory | ./static/books | +| SCRAPER_HTTP_ADDR | HTTP listen address | :8080 | +| KOKORO_URL | Kokoro TTS endpoint | http://localhost:8880 | +| KOKORO_VOICE | Default TTS voice | af_bella | +| LOG_LEVEL | debug \| info \| warn \| error | info | + +## Docker + +```bash +docker-compose up -d # Starts browserless, kokoro, scraper +``` + +## Code Patterns + +- Uses `log/slog` for structured logging +- Context-based cancellation throughout +- Worker pool pattern in orchestrator (channel + goroutines) +- Mutex for single async job (409 on concurrent scrape requests) + +## AI Context Tips + +- Primary files to modify: `orchestrator.go`, `server.go`, `scraper.go`, `browser/*.go` +- To add new source: implement `NovelScraper` interface from `internal/scraper/interfaces.go` +- Skip `static/` directory - generated content, not source + +## Speed Up AI Sessions (Optional) + +For faster AI context loading, use **Context7** (free, local indexing): + +```bash +# Install and index once +npx @context7/cli@latest index --path . --ignore .aiignore + +# After first run, AI tools will query the index instead of re-scanning files +``` + +VSCode extension: https://marketplace.visualstudio.com/items?itemName=context7.context7 diff --git a/docker-compose.yml b/docker-compose.yml index 0f549db..0cddbcd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,6 +16,8 @@ services: QUEUED: "${BROWSERLESS_QUEUED:-100}" # Per-session timeout in ms. TIMEOUT: "${BROWSERLESS_TIMEOUT:-60000}" + # Optional webhook URL for Browserless error alerts. + ERROR_ALERT_URL: "${ERROR_ALERT_URL:-}" ports: - "3000:3000" # Shared memory is required for Chrome. 
@@ -26,6 +28,21 @@ services: timeout: 5s retries: 5 + # ─── Kokoro-FastAPI (TTS) ──────────────────────────────────────────────────── + # CPU image; swap for ghcr.io/remsky/kokoro-fastapi-gpu:latest on NVIDIA hosts. + # Models are baked in — no volume mount required for the default voice set. + kokoro: + image: ghcr.io/remsky/kokoro-fastapi-cpu:latest + container_name: libnovel-kokoro + restart: unless-stopped + ports: + - "8880:8880" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8880/health"] + interval: 15s + timeout: 5s + retries: 5 + # ─── Scraper ───────────────────────────────────────────────────────────────── scraper: build: @@ -34,17 +51,23 @@ services: container_name: libnovel-scraper restart: unless-stopped depends_on: - browserless: + kokoro: condition: service_healthy environment: BROWSERLESS_URL: "http://browserless:3000" BROWSERLESS_TOKEN: "${BROWSERLESS_TOKEN:-}" - # content | scrape | cdp — swap to test different strategies. - BROWSERLESS_STRATEGY: "${BROWSERLESS_STRATEGY:-content}" + # content | scrape | cdp | direct — swap to test different strategies. + BROWSERLESS_STRATEGY: "${BROWSERLESS_STRATEGY:-direct}" + # Strategy for URL retrieval (chapter list). Default: content (browserless) + BROWSERLESS_URL_STRATEGY: "${BROWSERLESS_URL_STRATEGY:-content}" # 0 → defaults to NumCPU inside the container. SCRAPER_WORKERS: "${SCRAPER_WORKERS:-0}" SCRAPER_STATIC_ROOT: "/app/static/books" SCRAPER_HTTP_ADDR: ":8080" + LOG_LEVEL: "debug" + # Kokoro-FastAPI TTS endpoint. 
+ KOKORO_URL: "${KOKORO_URL:-http://localhost:8880}" + KOKORO_VOICE: "${KOKORO_VOICE:-af_bella}" ports: - "8080:8080" volumes: diff --git a/scraper/cmd/scraper/main.go b/scraper/cmd/scraper/main.go index fee9203..02c4099 100644 --- a/scraper/cmd/scraper/main.go +++ b/scraper/cmd/scraper/main.go @@ -17,6 +17,8 @@ // SCRAPER_WORKERS Chapter goroutine count (default: NumCPU) // SCRAPER_STATIC_ROOT Output directory (default: ./static/books) // SCRAPER_HTTP_ADDR HTTP listen address (default: :8080) +// KOKORO_URL Kokoro-FastAPI base URL (default: "") +// KOKORO_VOICE Default TTS voice (default: af_bella) // LOG_LEVEL debug | info | warn | error (default: info) package main @@ -30,11 +32,13 @@ import ( "strconv" "strings" "syscall" + "time" "github.com/libnovel/scraper/internal/browser" "github.com/libnovel/scraper/internal/novelfire" "github.com/libnovel/scraper/internal/orchestrator" "github.com/libnovel/scraper/internal/server" + "github.com/libnovel/scraper/internal/writer" ) func main() { @@ -73,11 +77,18 @@ func run(log *slog.Logger) error { browserCfg.MaxConcurrent = n } } + if s := os.Getenv("BROWSERLESS_TIMEOUT"); s != "" { + if n, err := strconv.Atoi(s); err == nil && n > 0 { + browserCfg.Timeout = time.Duration(n) * time.Second + } + } - strategy := browser.Strategy(strings.ToLower(envOr("BROWSERLESS_STRATEGY", string(browser.StrategyContent)))) + strategy := browser.Strategy(strings.ToLower(envOr("BROWSERLESS_STRATEGY", string(browser.StrategyDirect)))) + urlStrategy := browser.Strategy(strings.ToLower(envOr("BROWSERLESS_URL_STRATEGY", string(browser.StrategyContent)))) bc := newBrowserClient(strategy, browserCfg) + urlClient := newBrowserClient(urlStrategy, browserCfg) - nf := novelfire.New(bc, log) + nf := novelfire.New(bc, log, urlClient) workers := 0 if s := os.Getenv("SCRAPER_WORKERS"); s != "" { @@ -114,15 +125,44 @@ func run(log *slog.Logger) error { o := orchestrator.New(oCfg, nf, log) return o.Run(ctx) + case "refresh": + // refresh - re-scrape a 
book from its saved source_url + if len(args) < 2 { + return fmt.Errorf("refresh command requires a book slug argument") + } + slug := args[1] + w := writer.New(oCfg.StaticRoot) + meta, ok, err := w.ReadMetadata(slug) + if err != nil { + return fmt.Errorf("failed to read metadata for %s: %w", slug, err) + } + if !ok { + return fmt.Errorf("book %q not found in %s", slug, oCfg.StaticRoot) + } + if meta.SourceURL == "" { + return fmt.Errorf("book %q has no source_url in metadata", slug) + } + oCfg.SingleBookURL = meta.SourceURL + log.Info("refreshing book from source_url", + "slug", slug, + "source_url", meta.SourceURL, + ) + o := orchestrator.New(oCfg, nf, log) + return o.Run(ctx) + case "serve": addr := envOr("SCRAPER_HTTP_ADDR", ":8080") + kokoroURL := envOr("KOKORO_URL", "") + kokoroVoice := envOr("KOKORO_VOICE", "af_bella") log.Info("starting HTTP server", "addr", addr, "strategy", strategy, "workers", workers, "max_concurrent", browserCfg.MaxConcurrent, + "kokoro_url", kokoroURL, + "kokoro_voice", kokoroVoice, ) - srv := server.New(addr, oCfg, nf, log) + srv := server.New(addr, oCfg, nf, log, kokoroURL, kokoroVoice) return srv.ListenAndServe(ctx) default: @@ -136,6 +176,8 @@ func newBrowserClient(strategy browser.Strategy, cfg browser.Config) browser.Bro return browser.NewScrapeClient(cfg) case browser.StrategyCDP: return browser.NewCDPClient(cfg) + case browser.StrategyDirect: + return browser.NewDirectHTTPClient(cfg) default: return browser.NewContentClient(cfg) } @@ -153,16 +195,21 @@ func printUsage() { Commands: run [--url ] One-shot: scrape full catalogue, or a single book + refresh Re-scrape a book from its saved source_url serve Start HTTP server (POST /scrape, POST /scrape/book) Environment variables: BROWSERLESS_URL Browserless base URL (default: http://localhost:3000) BROWSERLESS_TOKEN API token (default: "") - BROWSERLESS_STRATEGY content | scrape | cdp (default: content) + BROWSERLESS_STRATEGY content|scrape|cdp|direct (default: direct) + 
BROWSERLESS_URL_STRATEGY Strategy for URL retrieval (default: content) BROWSERLESS_MAX_CONCURRENT Max simultaneous sessions (default: 5) + BROWSERLESS_TIMEOUT HTTP request timeout sec (default: 90) SCRAPER_WORKERS Chapter goroutines (default: NumCPU = %d) SCRAPER_STATIC_ROOT Output directory (default: ./static/books) SCRAPER_HTTP_ADDR HTTP listen address (default: :8080) + KOKORO_URL Kokoro-FastAPI base URL (default: "", TTS disabled) + KOKORO_VOICE Default TTS voice (default: af_bella) LOG_LEVEL debug|info|warn|error (default: info) `, runtime.NumCPU()) } diff --git a/scraper/internal/browser/content_scrape.go b/scraper/internal/browser/content_scrape.go index 35eab3c..3e2a38c 100644 --- a/scraper/internal/browser/content_scrape.go +++ b/scraper/internal/browser/content_scrape.go @@ -65,7 +65,7 @@ type contentClient struct { // NewContentClient returns a BrowserClient that uses POST /content. func NewContentClient(cfg Config) BrowserClient { if cfg.Timeout == 0 { - cfg.Timeout = 60 * time.Second + cfg.Timeout = 90 * time.Second } return &contentClient{ cfg: cfg, @@ -135,7 +135,7 @@ type scrapeClient struct { // NewScrapeClient returns a BrowserClient that uses POST /scrape. 
func NewScrapeClient(cfg Config) BrowserClient { if cfg.Timeout == 0 { - cfg.Timeout = 60 * time.Second + cfg.Timeout = 90 * time.Second } return &scrapeClient{ cfg: cfg, diff --git a/scraper/internal/browser/http.go b/scraper/internal/browser/http.go new file mode 100644 index 0000000..5b9c374 --- /dev/null +++ b/scraper/internal/browser/http.go @@ -0,0 +1,68 @@ +package browser + +import ( + "context" + "fmt" + "io" + "net/http" + "time" +) + +type httpClient struct { + cfg Config + http *http.Client + sem chan struct{} +} + +func NewDirectHTTPClient(cfg Config) BrowserClient { + if cfg.Timeout == 0 { + cfg.Timeout = 30 * time.Second + } + return &httpClient{ + cfg: cfg, + http: &http.Client{Timeout: cfg.Timeout}, + sem: makeSem(cfg.MaxConcurrent), + } +} + +func (c *httpClient) Strategy() Strategy { return StrategyDirect } + +func (c *httpClient) GetContent(ctx context.Context, req ContentRequest) (string, error) { + if err := acquire(ctx, c.sem); err != nil { + return "", fmt.Errorf("http: semaphore: %w", err) + } + defer release(c.sem) + + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, req.URL, nil) + if err != nil { + return "", fmt.Errorf("http: build request: %w", err) + } + httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") + httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + httpReq.Header.Set("Accept-Language", "en-US,en;q=0.5") + + resp, err := c.http.Do(httpReq) + if err != nil { + return "", fmt.Errorf("http: do request: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + b, _ := io.ReadAll(resp.Body) + return "", fmt.Errorf("http: unexpected status %d: %s", resp.StatusCode, b) + } + + raw, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("http: read body: %w", err) + } + return string(raw), nil +} + +func (c *httpClient) ScrapePage(_ 
context.Context, _ ScrapeRequest) (ScrapeResponse, error) { + return ScrapeResponse{}, fmt.Errorf("http client does not support ScrapePage; use browserless") +} + +func (c *httpClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error { + return fmt.Errorf("http client does not support CDP; use browserless") +} diff --git a/scraper/internal/browser/integration_test.go b/scraper/internal/browser/integration_test.go new file mode 100644 index 0000000..e008803 --- /dev/null +++ b/scraper/internal/browser/integration_test.go @@ -0,0 +1,152 @@ +//go:build integration + +// Integration tests for the Browserless /content API. +// +// These tests require a live Browserless instance and are gated behind the +// "integration" build tag so they never run in normal `go test ./...` passes. +// +// Run them with: +// +// BROWSERLESS_URL=http://localhost:3000 \ +// BROWSERLESS_TOKEN=your-token \ # omit if auth is disabled +// go test -v -tags integration -timeout 120s \ +// github.com/libnovel/scraper/internal/browser +package browser_test + +import ( + "context" + "os" + "strings" + "testing" + "time" + + "github.com/libnovel/scraper/internal/browser" +) + +// chapterURL is the novelfire chapter used in every integration sub-test. +const chapterURL = "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1" + +// newIntegrationClient reads BROWSERLESS_URL / BROWSERLESS_TOKEN from the +// environment and returns a configured contentClient. +// The test is skipped when BROWSERLESS_URL is not set. +func newIntegrationClient(t *testing.T) browser.BrowserClient { + t.Helper() + baseURL := os.Getenv("BROWSERLESS_URL") + if baseURL == "" { + t.Skip("BROWSERLESS_URL not set — skipping integration test") + } + return browser.NewContentClient(browser.Config{ + BaseURL: baseURL, + Token: os.Getenv("BROWSERLESS_TOKEN"), + // Use a generous per-request HTTP timeout so the wait-for-selector + // (75 s) doesn't get cut off by the transport layer. 
+ Timeout: 120 * time.Second, + MaxConcurrent: 1, + }) +} + +// TestIntegration_ChapterContent_ReturnsHTML verifies that a POST /content +// request with the production wait-for-selector settings succeeds and that the +// returned HTML contains the #content div expected on novelfire chapter pages. +func TestIntegration_ChapterContent_ReturnsHTML(t *testing.T) { + client := newIntegrationClient(t) + + ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second) + defer cancel() + + req := browser.ContentRequest{ + URL: chapterURL, + WaitFor: &browser.WaitForSelector{ + Selector: "#content", + Timeout: 5000, + }, + RejectResourceTypes: productionRejectTypes(), + } + + html, err := client.GetContent(ctx, req) + if err != nil { + t.Fatalf("GetContent failed: %v", err) + } + + // The #content div must not be empty; presence of
<p>
tags inside it is a + // reliable indicator that chapter paragraphs were rendered. + contentIdx := strings.Index(html, `id="content"`) + if contentIdx == -1 { + t.Fatalf("id=\"content\" not found in response (%d bytes)", len(html)) + } + + // Look for
<p>
tags after the #content marker — the chapter text lives there. + afterContent := html[contentIdx:] + if !strings.Contains(afterContent, " tags; JS rendering may have failed.\nSection preview:\n%s", + truncate(afterContent, 1000)) + } + + t.Logf("chapter content section starts at byte %d (total response: %d bytes)", contentIdx, len(html)) +} + +// TestIntegration_ChapterContent_TimeoutSurfacedCorrectly verifies that a +// deliberately too-short timeout returns an error containing "TimeoutError" (the +// Browserless error string seen in the failing log entry). This ensures our +// error-classification logic in retryGetContent matches real Browserless output. +func TestIntegration_ChapterContent_TimeoutSurfacedCorrectly(t *testing.T) { + client := newIntegrationClient(t) + + ctx, cancel := context.WithTimeout(context.Background(), 40*time.Second) + defer cancel() + + req := browser.ContentRequest{ + URL: chapterURL, + WaitFor: &browser.WaitForSelector{ + Selector: "#content", + Timeout: 500, // intentionally too short (500 ms) → Browserless will time out + }, + RejectResourceTypes: productionRejectTypes(), + } + + _, err := client.GetContent(ctx, req) + if err == nil { + t.Fatal("expected a timeout error from Browserless, but GetContent succeeded — " + + "the page may now load very fast; adjust the timeout threshold") + } + + t.Logf("got expected error: %v", err) + + // Browserless wraps navigation timeouts in a 500 response with + // "TimeoutError: Navigation timeout" in the body — this is the exact + // error that is triggering retries in production. + if !strings.Contains(err.Error(), "500") { + t.Errorf("expected HTTP 500 status in error, got: %v", err) + } +} + +// ── helpers ─────────────────────────────────────────────────────────────────── + +// productionRejectTypes returns the same resource-type block-list the +// novelfire scraper uses in production, so integration tests exercise the +// identical request shape. 
+func productionRejectTypes() []string { + return []string{ + "cspviolationreport", + "eventsource", + "fedcm", + "font", + "image", + "manifest", + "media", + "other", + "ping", + "signedexchange", + "stylesheet", + "texttrack", + "websocket", + } +} + +// truncate returns the first n bytes of s as a string. +func truncate(s string, n int) string { + if len(s) <= n { + return s + } + return s[:n] + "…" +} diff --git a/scraper/internal/browser/interface.go b/scraper/internal/browser/interface.go index aa78212..c2c8fd4 100644 --- a/scraper/internal/browser/interface.go +++ b/scraper/internal/browser/interface.go @@ -21,6 +21,10 @@ const ( // DevTools Protocol). Most powerful; required for complex interactions // (clicking, scrolling, waiting for network idle, etc.). StrategyCDP Strategy = "cdp" + + // StrategyDirect uses a plain HTTP client to fetch HTML directly. + // Suitable for sites that don't require JavaScript rendering. + StrategyDirect Strategy = "direct" ) // WaitForSelector describes the waitForSelector option sent to Browserless. @@ -29,12 +33,20 @@ type WaitForSelector struct { Timeout int `json:"timeout,omitempty"` // ms } +// GotoOptions controls page navigation behavior. +type GotoOptions struct { + Timeout int `json:"timeout,omitempty"` // ms + WaitUntil string `json:"waitUntil,omitempty"` // e.g., "networkidle2", "load" +} + // ContentRequest is the body sent to POST /content. type ContentRequest struct { URL string `json:"url"` WaitFor *WaitForSelector `json:"waitForSelector,omitempty"` WaitForTimeout int `json:"waitForTimeout,omitempty"` // ms RejectResourceTypes []string `json:"rejectResourceTypes,omitempty"` // e.g. ["image","stylesheet"] + GotoOptions *GotoOptions `json:"gotoOptions,omitempty"` + BestAttempt bool `json:"bestAttempt,omitempty"` // return partial content on timeout/error } // ScrapeElement is one element descriptor inside a ScrapeRequest. 
@@ -45,9 +57,10 @@ type ScrapeElement struct { // ScrapeRequest is the body sent to POST /scrape. type ScrapeRequest struct { - URL string `json:"url"` - Elements []ScrapeElement `json:"elements"` - WaitFor *WaitForSelector `json:"waitForSelector,omitempty"` + URL string `json:"url"` + Elements []ScrapeElement `json:"elements"` + WaitFor *WaitForSelector `json:"waitForSelector,omitempty"` + GotoOptions *GotoOptions `json:"gotoOptions,omitempty"` } // ScrapeResult is one entry in the response from POST /scrape. diff --git a/scraper/internal/novelfire/integration_test.go b/scraper/internal/novelfire/integration_test.go new file mode 100644 index 0000000..98af698 --- /dev/null +++ b/scraper/internal/novelfire/integration_test.go @@ -0,0 +1,344 @@ +//go:build integration + +// Integration tests for the novelfire.net Scraper against a live Browserless instance. +// +// These tests exercise the full scraping stack — Browserless → raw HTML → +// novelfire HTML parser — for the book: +// +// https://novelfire.net/book/a-dragon-against-the-whole-world +// +// They are gated behind the "integration" build tag so they never run in a +// normal `go test ./...` pass. +// +// Run with: +// +// BROWSERLESS_URL=http://localhost:3000 \ +// BROWSERLESS_TOKEN=your-token \ # omit if auth is disabled +// go test -v -tags integration -timeout 600s \ +// github.com/libnovel/scraper/internal/novelfire +package novelfire + +import ( + "context" + "fmt" + "os" + "strings" + "testing" + "time" + + "github.com/libnovel/scraper/internal/browser" + "github.com/libnovel/scraper/internal/scraper" +) + +const ( + integrationBookURL = "https://novelfire.net/book/a-dragon-against-the-whole-world" + integrationBookSlug = "a-dragon-against-the-whole-world" + integrationBookTitle = "A Dragon against the Whole World" +) + +// newIntegrationScraper reads BROWSERLESS_URL / BROWSERLESS_TOKEN from the +// environment, constructs a real contentClient, and returns a novelfire Scraper +// wired to it. 
The test is skipped when BROWSERLESS_URL is not set. +func newIntegrationScraper(t *testing.T) *Scraper { + t.Helper() + baseURL := os.Getenv("BROWSERLESS_URL") + if baseURL == "" { + t.Skip("BROWSERLESS_URL not set — skipping integration test") + } + client := browser.NewContentClient(browser.Config{ + BaseURL: baseURL, + Token: os.Getenv("BROWSERLESS_TOKEN"), + Timeout: 120 * time.Second, + MaxConcurrent: 1, + }) + return New(client, nil) +} + +// ── Metadata ────────────────────────────────────────────────────────────────── + +// TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle verifies that +// ScrapeMetadata fetches the book page and correctly parses at minimum +// the slug, title, and source URL. +func TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle(t *testing.T) { + s := newIntegrationScraper(t) + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + meta, err := s.ScrapeMetadata(ctx, integrationBookURL) + if err != nil { + t.Fatalf("ScrapeMetadata failed: %v", err) + } + + t.Logf("slug: %s", meta.Slug) + t.Logf("title: %s", meta.Title) + t.Logf("author: %s", meta.Author) + t.Logf("status: %s", meta.Status) + t.Logf("genres: %v", meta.Genres) + t.Logf("total_chapters: %d", meta.TotalChapters) + t.Logf("source_url: %s", meta.SourceURL) + + if meta.Slug != integrationBookSlug { + t.Errorf("slug = %q, want %q", meta.Slug, integrationBookSlug) + } + if meta.Title == "" { + t.Error("title is empty") + } + if !strings.EqualFold(meta.Title, integrationBookTitle) { + // Warn rather than hard-fail — the site may reword the title. + t.Logf("WARN: title = %q, expected something like %q", meta.Title, integrationBookTitle) + } + if meta.SourceURL != integrationBookURL { + t.Errorf("source_url = %q, want %q", meta.SourceURL, integrationBookURL) + } +} + +// TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields verifies that +// every optional field (author, status, genres, summary, total_chapters) is +// populated. 
A missing field is a warning, not a hard failure, because the +// site may change its HTML structure. +func TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields(t *testing.T) { + s := newIntegrationScraper(t) + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + meta, err := s.ScrapeMetadata(ctx, integrationBookURL) + if err != nil { + t.Fatalf("ScrapeMetadata failed: %v", err) + } + + type check struct { + field string + empty bool + } + checks := []check{ + {"author", meta.Author == ""}, + {"status", meta.Status == ""}, + {"summary", meta.Summary == ""}, + {"genres", len(meta.Genres) == 0}, + {"total_chapters", meta.TotalChapters == 0}, + } + for _, c := range checks { + if c.empty { + t.Errorf("field %q is empty — HTML selector may have broken", c.field) + } + } + + // total_chapters must be a positive integer. + if meta.TotalChapters < 1 { + t.Errorf("total_chapters = %d, want >= 1", meta.TotalChapters) + } +} + +// ── Chapter list ────────────────────────────────────────────────────────────── + +// TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs verifies that +// ScrapeChapterList returns a non-empty slice of chapter references with +// valid URLs and numbers parsed from those URLs (not list position). +func TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs(t *testing.T) { + s := newIntegrationScraper(t) + + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + refs, err := s.ScrapeChapterList(ctx, integrationBookURL) + if err != nil { + t.Fatalf("ScrapeChapterList failed: %v", err) + } + + t.Logf("total refs returned: %d", len(refs)) + + if len(refs) == 0 { + t.Fatal("ScrapeChapterList returned 0 refs") + } + + // Every ref must have a non-empty URL pointing at the correct book. 
+ for i, ref := range refs { + if ref.URL == "" { + t.Errorf("refs[%d].URL is empty", i) + } + if !strings.Contains(ref.URL, integrationBookSlug) { + t.Errorf("refs[%d].URL %q does not contain book slug", i, ref.URL) + } + if ref.Number <= 0 { + t.Errorf("refs[%d].Number = %d, want > 0 (URL: %s)", i, ref.Number, ref.URL) + } + if ref.Title == "" { + t.Errorf("refs[%d].Title is empty (URL: %s)", i, ref.URL) + } + } +} + +// TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs verifies the +// fix for the newest-first ordering bug: each ref's Number must equal the +// chapter number embedded in its URL, not its position in the list. +func TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs(t *testing.T) { + s := newIntegrationScraper(t) + + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + refs, err := s.ScrapeChapterList(ctx, integrationBookURL) + if err != nil { + t.Fatalf("ScrapeChapterList failed: %v", err) + } + if len(refs) == 0 { + t.Fatal("ScrapeChapterList returned 0 refs") + } + + mismatches := 0 + for i, ref := range refs { + wantNum := chapterNumberFromURL(ref.URL) + if wantNum <= 0 { + // URL has no parseable number — skip this entry. + continue + } + if ref.Number != wantNum { + t.Errorf("refs[%d]: Number=%d but URL %q implies number=%d (position-based bug?)", + i, ref.Number, ref.URL, wantNum) + mismatches++ + if mismatches >= 5 { + t.Log("… (further mismatches suppressed)") + break + } + } + } + + // Log the first few refs so failures are easy to diagnose. + limit := 5 + if len(refs) < limit { + limit = len(refs) + } + for i := 0; i < limit; i++ { + t.Logf("refs[%d]: Number=%d Title=%q URL=%s", i, refs[i].Number, refs[i].Title, refs[i].URL) + } +} + +// ── Chapters ────────────────────────────────────────────────────────────────── + +// TestIntegration_Novelfire_ScrapeFirst3Chapters scrapes chapters 1, 2, and 3 +// via ScrapeChapterText and verifies each returns non-empty markdown text. 
+// Chapters are run as sub-tests so a single failure does not abort the others. +func TestIntegration_Novelfire_ScrapeFirst3Chapters(t *testing.T) { + s := newIntegrationScraper(t) + + chapters := []scraper.ChapterRef{ + { + Number: 1, + Title: "Chapter 1", + URL: integrationBookURL + "/chapter-1", + }, + { + Number: 2, + Title: "Chapter 2", + URL: integrationBookURL + "/chapter-2", + }, + { + Number: 3, + Title: "Chapter 3", + URL: integrationBookURL + "/chapter-3", + }, + } + + for _, ref := range chapters { + ref := ref // capture + t.Run(fmt.Sprintf("chapter-%d", ref.Number), func(t *testing.T) { + // Sequential: each chapter needs its own generous timeout. + ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second) + defer cancel() + + ch, err := s.ScrapeChapterText(ctx, ref) + if err != nil { + t.Fatalf("ScrapeChapterText failed: %v", err) + } + + t.Logf("chapter %d: %d bytes of markdown", ref.Number, len(ch.Text)) + t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300)) + + // Ref fields must be echoed back unchanged. + if ch.Ref.Number != ref.Number { + t.Errorf("Ref.Number = %d, want %d", ch.Ref.Number, ref.Number) + } + if ch.Ref.URL != ref.URL { + t.Errorf("Ref.URL = %q, want %q", ch.Ref.URL, ref.URL) + } + + // Text must be non-trivially long. + if len(ch.Text) < 100 { + t.Errorf("Text too short (%d bytes) — likely empty or parsing failed:\n%s", + len(ch.Text), ch.Text) + } + + // Text must not contain raw HTML tags — NodeToMarkdown should have + // stripped them. + for _, tag := range []string{"Author Name author := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "author"}) - // - cover := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "img", Class: "cover", Attr: "src"}) + //

+ var cover string + if figureCover := htmlutil.FindFirst(root, scraper.Selector{Tag: "figure", Class: "cover"}); figureCover != nil { + cover = htmlutil.ExtractFirst(figureCover, scraper.Selector{Tag: "img", Attr: "src"}) + } // Ongoing status := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "status"}) @@ -245,32 +252,41 @@ func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.B func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scraper.ChapterRef, error) { var refs []scraper.ChapterRef - // Chapter list URL: {bookURL}/chapters - pageURL := strings.TrimRight(bookURL, "/") + "/chapters" + // Chapter list URL: {bookURL}/chapters?page=N + baseChapterURL := strings.TrimRight(bookURL, "/") + "/chapters" page := 1 - for pageURL != "" { + for { select { case <-ctx.Done(): return refs, ctx.Err() default: } + pageURL := fmt.Sprintf("%s?page=%d", baseChapterURL, page) s.log.Info("scraping chapter list", "page", page, "url", pageURL) s.log.Debug("chapter list fetch starting", "page", page, "payload_url", pageURL, "payload_wait_selector", ".chapter-list", - "payload_wait_selector_timeout_ms", 10000, - "payload_wait_for_timeout_ms", 10000, + "payload_wait_selector_timeout_ms", 15000, + "payload_wait_timeout_ms", 2000, + "strategy", s.urlClient.Strategy(), ) - raw, err := s.client.GetContent(ctx, browser.ContentRequest{ - URL: pageURL, - WaitFor: &browser.WaitForSelector{Selector: ".chapter-list", Timeout: 10000}, - WaitForTimeout: 10000, + raw, err := s.urlClient.GetContent(ctx, browser.ContentRequest{ + URL: pageURL, + // Wait up to 15 s for the chapter list container to appear in the DOM. + WaitFor: &browser.WaitForSelector{Selector: ".chapter-list", Timeout: 15000}, + // After the selector is found, wait an additional 2 s for any + // deferred JS rendering (lazy-loaded links, infinite-scroll hydration). 
+ WaitForTimeout: 2000, RejectResourceTypes: rejectResourceTypes, + GotoOptions: &browser.GotoOptions{Timeout: 60000}, + // Do NOT use BestAttempt — we want a complete page or a clear error, + // not silently partial HTML that looks like "no more chapters". + BestAttempt: false, }) if err != nil { s.log.Debug("chapter list fetch failed", @@ -293,10 +309,27 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra chapterList := htmlutil.FindFirst(root, scraper.Selector{Class: "chapter-list"}) if chapterList == nil { + // No chapter list container on this page — we've gone past the last page. + s.log.Debug("chapter list container not found, stopping pagination", "page", page) break } + // Each chapter row:
  • Title
  • items := htmlutil.FindAll(chapterList, scraper.Selector{Tag: "li"}) + + s.log.Debug("chapter list page parsed", + "page", page, + "url", pageURL, + "chapters_on_page", len(items), + "total_refs_so_far", len(refs), + ) + + // Zero items on this page means we've gone past the last page. + if len(items) == 0 { + s.log.Debug("no chapters on page, stopping pagination", "page", page) + break + } + for _, item := range items { linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a"}) if linkNode == nil { @@ -308,7 +341,15 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra continue } chURL := resolveURL(baseURL, href) - num := len(refs) + 1 + num := chapterNumberFromURL(chURL) + if num <= 0 { + // Fall back to position if the URL has no parseable number. + num = len(refs) + 1 + s.log.Warn("chapter number not parseable from URL, falling back to position", + "url", chURL, + "position", num, + ) + } refs = append(refs, scraper.ChapterRef{ Number: num, Title: strings.TrimSpace(chTitle), @@ -316,30 +357,134 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra }) } - s.log.Debug("chapter list page parsed", - "page", page, - "url", pageURL, - "chapters_on_page", len(items), - "total_refs_so_far", len(refs), - ) - - // Next page:
    + var cover string + if fig := htmlutil.FindFirst(item, scraper.Selector{Tag: "figure", Class: "cover"}); fig != nil { + cover = htmlutil.ExtractFirst(fig, scraper.Selector{Tag: "img", Attr: "data-src"}) + if cover != "" { + cover = baseURL + cover + } + } + + // Title and URL:

    Title

    + titleNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "h2", Class: "title"}) + var title, bookURL string + if titleNode != nil { + linkNode := htmlutil.FindFirst(titleNode, scraper.Selector{Tag: "a"}) + if linkNode != nil { + title = htmlutil.ExtractText(linkNode, scraper.Selector{}) + href := htmlutil.ExtractText(linkNode, scraper.Selector{Attr: "href"}) + bookURL = resolveURL(baseURL, href) + } + } + + // Status: Ongoing/Completed + status := htmlutil.ExtractFirst(item, scraper.Selector{Tag: "span", Class: "status"}) + + // Genres:
    Genre1Genre2...
    + var genres []string + categoriesNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "div", Class: "categories"}) + if categoriesNode != nil { + genres = htmlutil.ExtractAll(categoriesNode, scraper.Selector{Tag: "span", Multiple: true}) + } + + slug := slugFromURL(bookURL) + + meta := scraper.BookMeta{ + Slug: slug, + Title: title, + Cover: cover, + Status: strings.TrimSpace(status), + Genres: genres, + SourceURL: bookURL, + Ranking: rank, + } + rank++ + + select { + case <-ctx.Done(): + return + case entries <- meta: + } + } + + // Next page - ranking pages use different pagination, just get first page for now + break + } + }() + + return entries, errs +} + // ─── ChapterTextProvider ───────────────────────────────────────────────────── // retryGetContent calls client.GetContent up to maxAttempts times, backing off // exponentially between retries. Only errors that look like transient Browserless -// 5xx responses (navigation timeouts, etc.) are retried; context cancellation and +// failures (timeouts, 5xx responses) are retried; context cancellation and // permanent errors are returned immediately. func retryGetContent( ctx context.Context, @@ -363,11 +508,6 @@ func retryGetContent( return "", err } - // Only retry on Browserless 5xx responses. 
- if !strings.Contains(err.Error(), "unexpected status 5") { - return "", err - } - if attempt < maxAttempts { log.Warn("chapter fetch failed, retrying", "url", req.URL, @@ -393,15 +533,15 @@ func (s *Scraper) ScrapeChapterText(ctx context.Context, ref scraper.ChapterRef) "title", ref.Title, "payload_url", ref.URL, "payload_wait_selector", "#content", - "payload_wait_selector_timeout_ms", 75000, - "payload_wait_for_timeout_ms", 75000, + "payload_wait_selector_timeout_ms", 5000, ) raw, err := retryGetContent(ctx, s.log, s.client, browser.ContentRequest{ URL: ref.URL, - WaitFor: &browser.WaitForSelector{Selector: "#content", Timeout: 75000}, - WaitForTimeout: 75000, + WaitFor: &browser.WaitForSelector{Selector: "#content", Timeout: 5000}, RejectResourceTypes: rejectResourceTypes, + GotoOptions: &browser.GotoOptions{Timeout: 60000}, + BestAttempt: true, }, 9, 6*time.Second) if err != nil { s.log.Debug("chapter text fetch failed", @@ -411,6 +551,18 @@ func (s *Scraper) ScrapeChapterText(ctx context.Context, ref scraper.ChapterRef) ) return scraper.Chapter{}, fmt.Errorf("chapter %d fetch: %w", ref.Number, err) } + if len(raw) > 0 { + preview := raw + if len(preview) > 500 { + preview = preview[:500] + } + s.log.Debug("chapter text fetch partial content", + "chapter", ref.Number, + "url", ref.URL, + "response_bytes", len(raw), + "preview", preview, + ) + } s.log.Debug("chapter text fetch completed", "chapter", ref.Number, "url", ref.URL, @@ -484,3 +636,30 @@ func parseChapterCount(s string) int { n, _ := strconv.Atoi(fields[0]) return n } + +// chapterNumberFromURL extracts the chapter number from a novelfire chapter URL. +// +// URL pattern: https://novelfire.net/book/{book-slug}/chapter-{N} +// The last path segment is expected to be "chapter-{N}" or "{N}". +// Returns 0 if no number can be parsed. +func chapterNumberFromURL(chapterURL string) int { + u, err := url.Parse(chapterURL) + if err != nil { + return 0 + } + seg := path.Base(u.Path) // e.g. 
"chapter-42" or "42" + // Strip a "chapter-" prefix if present. + seg = strings.TrimPrefix(seg, "chapter-") + // Also handle "chap-", "ch-" variants used by some sites. + seg = strings.TrimPrefix(seg, "chap-") + seg = strings.TrimPrefix(seg, "ch-") + // Take only the leading digits (handles slugs like "42-title-text"). + digits := strings.FieldsFunc(seg, func(r rune) bool { + return r < '0' || r > '9' + }) + if len(digits) == 0 { + return 0 + } + n, _ := strconv.Atoi(digits[0]) + return n +} diff --git a/scraper/internal/novelfire/scraper_test.go b/scraper/internal/novelfire/scraper_test.go new file mode 100644 index 0000000..7d94e6b --- /dev/null +++ b/scraper/internal/novelfire/scraper_test.go @@ -0,0 +1,217 @@ +package novelfire + +import ( + "context" + "strings" + "testing" + + "github.com/libnovel/scraper/internal/browser" + "github.com/libnovel/scraper/internal/scraper" +) + +// ── stub browser client ─────────────────────────────────────────────────────── + +// stubClient is a BrowserClient that returns a fixed HTML string for every +// GetContent call. ScrapePage and CDPSession are not used by these tests. +type stubClient struct { + html string +} + +func (s *stubClient) Strategy() browser.Strategy { return browser.StrategyContent } + +func (s *stubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) { + return s.html, nil +} + +func (s *stubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) { + return browser.ScrapeResponse{}, nil +} + +func (s *stubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error { + return nil +} + +// pagedStubClient returns a different HTML response for each successive call. +// Once all pages are exhausted it returns an empty page (no chapter-list), +// simulating the paginated chapter-list endpoint terminating correctly. 
+type pagedStubClient struct { + pages []string + call int +} + +func (c *pagedStubClient) Strategy() browser.Strategy { return browser.StrategyContent } + +func (c *pagedStubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) { + if c.call < len(c.pages) { + html := c.pages[c.call] + c.call++ + return html, nil + } + // Past the last page — return a page with no chapter-list to stop pagination. + return `
    `, nil +} + +func (c *pagedStubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) { + return browser.ScrapeResponse{}, nil +} + +func (c *pagedStubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error { + return nil +} + +// ── helpers ─────────────────────────────────────────────────────────────────── + +func newScraper(html string) *Scraper { + return New(&stubClient{html: html}, nil, &stubClient{html: html}) +} + +func newPagedScraper(pages ...string) *Scraper { + urlClient := &pagedStubClient{pages: pages} + return New(&stubClient{}, nil, urlClient) +} + +// ── ScrapeChapterText ───────────────────────────────────────────────────────── + +func TestScrapeChapterText_ExtractsInnerText(t *testing.T) { + html := ` +
    +

    It was a dark and stormy night.

    +

    The hero stepped forward.

    +
    + ` + + s := newScraper(html) + ref := scraper.ChapterRef{Number: 1, Title: "Chapter 1", URL: "https://novelfire.net/book/test-novel/chapter-1"} + + ch, err := s.ScrapeChapterText(context.Background(), ref) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ch.Ref.Number != 1 { + t.Errorf("expected chapter number 1, got %d", ch.Ref.Number) + } + if !strings.Contains(ch.Text, "dark and stormy") { + t.Errorf("expected chapter text to contain 'dark and stormy', got: %q", ch.Text) + } + if !strings.Contains(ch.Text, "hero stepped forward") { + t.Errorf("expected chapter text to contain 'hero stepped forward', got: %q", ch.Text) + } +} + +func TestScrapeChapterText_MissingContainer(t *testing.T) { + html := `
    nothing here
    ` + + s := newScraper(html) + ref := scraper.ChapterRef{Number: 2, Title: "Chapter 2", URL: "https://novelfire.net/book/test-novel/chapter-2"} + + _, err := s.ScrapeChapterText(context.Background(), ref) + if err == nil { + t.Fatal("expected an error when #content container is missing, got nil") + } +} + +// ── chapterNumberFromURL ────────────────────────────────────────────────────── + +func TestChapterNumberFromURL(t *testing.T) { + cases := []struct { + url string + want int + }{ + // Standard novelfire pattern. + {"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1", 1}, + {"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-26", 26}, + {"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-58", 58}, + // Large chapter numbers. + {"https://novelfire.net/book/some-novel/chapter-1000", 1000}, + // Path segment with trailing slash. + {"https://novelfire.net/book/some-novel/chapter-5/", 5}, + // Slug with title appended after the number (hypothetical future format). + {"https://novelfire.net/book/some-novel/chapter-42-the-battle", 42}, + // Unparseable — should return 0 so the caller can fall back. + {"https://novelfire.net/book/some-novel/prologue", 0}, + {"https://novelfire.net/book/some-novel/", 0}, + {"not-a-url", 0}, + } + + for _, tc := range cases { + got := chapterNumberFromURL(tc.url) + if got != tc.want { + t.Errorf("chapterNumberFromURL(%q) = %d, want %d", tc.url, got, tc.want) + } + } +} + +// ── ScrapeChapterList (position vs URL numbering) ───────────────────────────── + +// TestScrapeChapterList_NumbersFromURL verifies that when the chapter list HTML +// is served newest-first (as novelfire.net does), chapter numbers are still +// assigned from the URL — not from list position — so that a re-run correctly +// identifies which chapters are already on disk. +func TestScrapeChapterList_NumbersFromURL(t *testing.T) { + // Simulate a newest-first chapter list with 5 chapters on a single page. 
+ // Positions 1..5 correspond to chapters 5,4,3,2,1 in the site HTML. + page1 := ` + + ` + + s := newPagedScraper(page1) + refs, err := s.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(refs) != 5 { + t.Fatalf("expected 5 refs, got %d", len(refs)) + } + + // With position-based numbering (the old bug), refs[0].Number would be 1 + // even though its URL is /chapter-5. With URL-based numbering it must be 5. + wantNumbers := []int{5, 4, 3, 2, 1} + for i, ref := range refs { + if ref.Number != wantNumbers[i] { + t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, wantNumbers[i], ref.URL) + } + } +} + +// TestScrapeChapterList_Pagination verifies that the scraper correctly follows +// ?page=N pagination and stops when a page returns no chapter items. +func TestScrapeChapterList_Pagination(t *testing.T) { + page1 := ` + + ` + + page2 := ` + + ` + + // page3 is omitted — pagedStubClient will return empty page to stop pagination. + s := newPagedScraper(page1, page2) + refs, err := s.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(refs) != 6 { + t.Fatalf("expected 6 refs (3 per page × 2 pages), got %d", len(refs)) + } + + wantNumbers := []int{3, 2, 1, 6, 5, 4} + for i, ref := range refs { + if ref.Number != wantNumbers[i] { + t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, wantNumbers[i], ref.URL) + } + } +} diff --git a/scraper/internal/scraper/interfaces.go b/scraper/internal/scraper/interfaces.go index cac8367..d98ec6a 100644 --- a/scraper/internal/scraper/interfaces.go +++ b/scraper/internal/scraper/interfaces.go @@ -27,6 +27,8 @@ type BookMeta struct { TotalChapters int `yaml:"total_chapters,omitempty"` // SourceURL is the canonical URL of the book's landing page. 
SourceURL string `yaml:"source_url"` + // Ranking is the rank number from ranking pages. + Ranking int `yaml:"ranking,omitempty"` } // CatalogueEntry is a lightweight reference returned by CatalogueProvider. @@ -108,6 +110,13 @@ type ChapterTextProvider interface { ScrapeChapterText(ctx context.Context, ref ChapterRef) (Chapter, error) } +// RankingProvider can enumerate novels from a ranking page. +type RankingProvider interface { + // ScrapeRanking pages through the ranking list, sending BookMeta values + // (with basic info like title, cover, genres, status, sourceURL) to the returned channel. + ScrapeRanking(ctx context.Context) (<-chan BookMeta, <-chan error) +} + // NovelScraper is the full interface that a concrete novel source must implement. // It composes all four provider interfaces. type NovelScraper interface { @@ -115,6 +124,7 @@ type NovelScraper interface { MetadataProvider ChapterListProvider ChapterTextProvider + RankingProvider // SourceName returns the human-readable name of this scraper, e.g. "novelfire.net". SourceName() string diff --git a/scraper/internal/server/server.go b/scraper/internal/server/server.go index 0ae7d26..68105ae 100644 --- a/scraper/internal/server/server.go +++ b/scraper/internal/server/server.go @@ -13,6 +13,7 @@ import ( "fmt" "log/slog" "net/http" + "strconv" "sync" "time" @@ -23,23 +24,27 @@ import ( // Server wraps an HTTP mux with the scraping endpoints. type Server struct { - addr string - oCfg orchestrator.Config - novel scraper.NovelScraper - log *slog.Logger - writer *writer.Writer - mu sync.Mutex - running bool + addr string + oCfg orchestrator.Config + novel scraper.NovelScraper + log *slog.Logger + writer *writer.Writer + mu sync.Mutex + running bool + kokoroURL string // Kokoro-FastAPI base URL, e.g. http://kokoro:8880 + kokoroVoice string // default voice, e.g. af_bella } // New creates a new Server. 
-func New(addr string, oCfg orchestrator.Config, novel scraper.NovelScraper, log *slog.Logger) *Server { +func New(addr string, oCfg orchestrator.Config, novel scraper.NovelScraper, log *slog.Logger, kokoroURL, kokoroVoice string) *Server { return &Server{ - addr: addr, - oCfg: oCfg, - novel: novel, - log: log, - writer: writer.New(oCfg.StaticRoot), + addr: addr, + oCfg: oCfg, + novel: novel, + log: log, + writer: writer.New(oCfg.StaticRoot), + kokoroURL: kokoroURL, + kokoroVoice: kokoroVoice, } } @@ -52,16 +57,21 @@ func (s *Server) ListenAndServe(ctx context.Context) error { mux.HandleFunc("POST /scrape/book", s.handleScrapeBook) // UI routes mux.HandleFunc("GET /", s.handleHome) + mux.HandleFunc("GET /ranking", s.handleRanking) + mux.HandleFunc("POST /ranking/refresh", s.handleRankingRefresh) + mux.HandleFunc("GET /ranking/view", s.handleRankingView) mux.HandleFunc("GET /books/{slug}", s.handleBook) mux.HandleFunc("GET /books/{slug}/chapters/{n}", s.handleChapter) mux.HandleFunc("POST /ui/scrape/book", s.handleUIScrapeBook) mux.HandleFunc("GET /ui/scrape/status", s.handleUIScrapeStatus) + // Plain-text chapter content for browser-side TTS + mux.HandleFunc("GET /ui/chapter-text/{slug}/{n}", s.handleChapterText) srv := &http.Server{ Addr: s.addr, Handler: mux, ReadTimeout: 15 * time.Second, - WriteTimeout: 15 * time.Second, + WriteTimeout: 60 * time.Second, IdleTimeout: 60 * time.Second, } @@ -85,6 +95,25 @@ func (s *Server) handleHealth(w http.ResponseWriter, _ *http.Request) { _ = json.NewEncoder(w).Encode(map[string]string{"status": "ok"}) } +// handleChapterText returns the plain text of a chapter (markdown stripped) +// for browser-side TTS. The browser POSTs this directly to Kokoro-FastAPI. 
+func (s *Server) handleChapterText(w http.ResponseWriter, r *http.Request) { + slug := r.PathValue("slug") + n, err := strconv.Atoi(r.PathValue("n")) + if err != nil || n < 1 { + http.NotFound(w, r) + return + } + raw, err := s.writer.ReadChapter(slug, n) + if err != nil { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + w.Header().Set("Cache-Control", "no-store") + fmt.Fprint(w, stripMarkdown(raw)) +} + func (s *Server) handleScrapeCatalogue(w http.ResponseWriter, r *http.Request) { cfg := s.oCfg cfg.SingleBookURL = "" // full catalogue diff --git a/scraper/internal/server/ui.go b/scraper/internal/server/ui.go new file mode 100644 index 0000000..b9565b0 --- /dev/null +++ b/scraper/internal/server/ui.go @@ -0,0 +1,1331 @@ +package server + +import ( + "bytes" + "context" + "fmt" + "html/template" + "net/http" + "regexp" + "strconv" + "strings" + "time" + + "github.com/libnovel/scraper/internal/orchestrator" + "github.com/libnovel/scraper/internal/writer" + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/extension" + goldhtml "github.com/yuin/goldmark/renderer/html" +) + +// md is the shared goldmark instance used for all markdown→HTML conversions. 
+var md = goldmark.New( + goldmark.WithExtensions(extension.Typographer, extension.Table), + goldmark.WithRendererOptions(goldhtml.WithUnsafe()), +) + +// kokoroVoices is the full list of voices shipped with Kokoro-FastAPI, +// grouped loosely by language prefix: +// +// af_ / am_ American English female / male +// bf_ / bm_ British English female / male +// ef_ / em_ Spanish female / male +// ff_ French female +// hf_ / hm_ Hindi female / male +// if_ / im_ Italian female / male +// jf_ / jm_ Japanese female / male +// pf_ / pm_ Portuguese female / male +// zf_ / zm_ Chinese female / male +var kokoroVoices = []string{ + // American English + "af_alloy", "af_aoede", "af_bella", "af_heart", "af_jadzia", + "af_jessica", "af_kore", "af_nicole", "af_nova", "af_river", + "af_sarah", "af_sky", + "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam", + "am_michael", "am_onyx", "am_puck", + // British English + "bf_alice", "bf_emma", "bf_lily", + "bm_daniel", "bm_fable", "bm_george", "bm_lewis", + // Spanish + "ef_dora", "em_alex", + // French + "ff_siwis", + // Hindi + "hf_alpha", "hf_beta", "hm_omega", "hm_psi", + // Italian + "if_sara", "im_nicola", + // Japanese + "jf_alpha", "jf_gongitsune", "jf_nezumi", "jf_tebukuro", "jm_kumo", + // Portuguese + "pf_dora", "pm_alex", + // Chinese + "zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi", + "zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang", +} + +// ─── shared layout ──────────────────────────────────────────────────────────── + +const layoutHead = ` + + + + + {{.Title}} — libnovel + + + + +` + +const layoutFoot = `` + +func renderPage(w http.ResponseWriter, title, body string) { + t := template.Must(template.New("layout").Parse(layoutHead + body + layoutFoot)) + w.Header().Set("Content-Type", "text/html; charset=utf-8") + _ = t.Execute(w, struct{ Title string }{Title: title}) +} + +func renderFragment(w http.ResponseWriter, body string) { + w.Header().Set("Content-Type", "text/html; charset=utf-8") + fmt.Fprint(w, 
body) +} + +func isHTMX(r *http.Request) bool { + return r.Header.Get("HX-Request") == "true" +} + +// respond writes either a full page or an HTMX fragment depending on the request. +func (s *Server) respond(w http.ResponseWriter, r *http.Request, title, fragment string) { + if isHTMX(r) { + renderFragment(w, fragment) + return + } + renderPage(w, title, + `
    `+fragment+`
    `) +} + +// ─── GET / — book catalogue ─────────────────────────────────────────────────── + +const homeTmpl = ` +
    +
    +

    libnovel

    + Browse Rankings +
    +

    {{len .Books}} book{{if ne (len .Books) 1}}s{{end}} on disk

    + + +
    +

    Scrape a new book

    +
    + + +
    +
    +
    + + + +
    ` + +func (s *Server) handleHome(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/" { + http.NotFound(w, r) + return + } + + books, err := s.writer.ListBooks() + if err != nil { + http.Error(w, "failed to list books: "+err.Error(), http.StatusInternalServerError) + return + } + + t := template.Must(template.New("home").Parse(homeTmpl)) + var buf bytes.Buffer + _ = t.Execute(&buf, struct{ Books interface{} }{Books: books}) + + s.respond(w, r, "Home", buf.String()) +} + +// ─── GET /ranking — ranking page ─────────────────────────────────────────────── + +const rankingTmpl = ` +
    + + ← All books + + +
    +
    +

    Novel Rankings

    +

    Top novels from novelfire.net

    + {{if .CachedAt}}

    Cached {{.CachedAt}}

    {{end}} +
    +
    + + View Markdown + + +
    +
    + + +
    + {{range .Books}} + {{if .Local}} + {{/* Book is in local library — wrap entire card in a clickable link */}} + +
    + {{if .Cover}} + cover + {{end}} +
    +
    + {{if .Rank}}#{{.Rank}}{{end}} +

    {{.Title}}

    + In library +
    + {{if .Author}}

    {{.Author}}

    {{end}} +
    + {{if .Status}}{{.Status}}{{end}} +
    + {{if .Genres}} +
    + {{range .Genres}}{{.}}{{end}} +
    + {{end}} +
    +
    +
    + {{else}} + {{/* Book not yet in local library */}} +
    +
    + {{if .Cover}} + cover + {{end}} +
    +
    + {{if .Rank}}#{{.Rank}}{{end}} +

    {{.Title}}

    +
    + {{if .Author}}

    {{.Author}}

    {{end}} +
    + {{if .Status}}{{.Status}}{{end}} +
    + {{if .Genres}} +
    + {{range .Genres}}{{.}}{{end}} +
    + {{end}} + {{if .SourceURL}} +
    +
    + + +
    +
    + {{end}} +
    +
    +
    + {{end}} + {{else}} +

    No ranking data available. Click "Refresh Rankings" to fetch from novelfire.net.

    + {{end}} +
    +
    ` + +// rankingViewItem enriches a RankingItem with whether it is present in the +// local book library, so the template can highlight it differently. +type rankingViewItem struct { + writer.RankingItem + Local bool +} + +// toRankingViewItems annotates items with Local=true for slugs found in localSlugs. +func toRankingViewItems(items []writer.RankingItem, localSlugs map[string]bool) []rankingViewItem { + out := make([]rankingViewItem, len(items)) + for i, it := range items { + out[i] = rankingViewItem{ + RankingItem: it, + Local: localSlugs[it.Slug], + } + } + return out +} + +// handleRanking serves the ranking page from the cached ranking.md file. +// It does NOT trigger a live scrape; use POST /ranking/refresh for that. +func (s *Server) handleRanking(w http.ResponseWriter, r *http.Request) { + rankingItems, err := s.writer.ReadRankingItems() + if err != nil { + s.log.Error("failed to read cached ranking", "err", err) + } + + cachedAt := "" + if info, statErr := s.writer.RankingFileInfo(); statErr == nil { + cachedAt = info.ModTime().Format("Jan 2, 2006 at 15:04") + } + + t := template.Must(template.New("ranking").Parse(rankingTmpl)) + var buf bytes.Buffer + _ = t.Execute(&buf, struct { + Books interface{} + CachedAt string + }{Books: toRankingViewItems(rankingItems, s.writer.LocalSlugs()), CachedAt: cachedAt}) + s.respond(w, r, "Rankings", buf.String()) +} + +// handleRankingRefresh triggers a live scrape of novelfire.net/ranking, +// persists the result to ranking.md, then re-renders the ranking page. 
+func (s *Server) handleRankingRefresh(w http.ResponseWriter, r *http.Request) { + ctx, cancel := context.WithTimeout(r.Context(), 60*time.Second) + defer cancel() + + rankingCh, errCh := s.novel.ScrapeRanking(ctx) + + var rankingItems []writer.RankingItem + for { + select { + case meta, ok := <-rankingCh: + if !ok { + rankingCh = nil + continue + } + rankingItems = append(rankingItems, writer.RankingItem{ + Rank: meta.Ranking, + Slug: meta.Slug, + Title: meta.Title, + Author: meta.Author, + Cover: meta.Cover, + Status: meta.Status, + Genres: meta.Genres, + SourceURL: meta.SourceURL, + }) + case err, ok := <-errCh: + if !ok { + errCh = nil + continue + } + if err != nil { + s.log.Error("ranking scrape error", "err", err) + } + } + + if rankingCh == nil && errCh == nil { + break + } + } + + if len(rankingItems) > 0 { + if err := s.writer.WriteRanking(rankingItems); err != nil { + s.log.Error("failed to save ranking", "err", err) + } + } + + cachedAt := "" + if info, statErr := s.writer.RankingFileInfo(); statErr == nil { + cachedAt = info.ModTime().Format("Jan 2, 2006 at 15:04") + } + + t := template.Must(template.New("ranking").Parse(rankingTmpl)) + var buf bytes.Buffer + _ = t.Execute(&buf, struct { + Books interface{} + CachedAt string + }{Books: toRankingViewItems(rankingItems, s.writer.LocalSlugs()), CachedAt: cachedAt}) + s.respond(w, r, "Rankings", buf.String()) +} + +// ─── GET /ranking/view — view ranking markdown ───────────────────────────────── + +const rankingViewTmpl = ` +
    + + ← Back to Rankings + + +

    Ranking Data

    + +
    + {{.HTML}} +
    +
    ` + +func (s *Server) handleRankingView(w http.ResponseWriter, r *http.Request) { + markdown, err := s.writer.ReadRanking() + if err != nil { + http.Error(w, "failed to read ranking: "+err.Error(), http.StatusInternalServerError) + return + } + if markdown == "" { + http.NotFound(w, r) + return + } + + var htmlBuf bytes.Buffer + if err := md.Convert([]byte(markdown), &htmlBuf); err != nil { + http.Error(w, "markdown render error: "+err.Error(), http.StatusInternalServerError) + return + } + + t := template.Must(template.New("rankingView").Parse(rankingViewTmpl)) + var buf bytes.Buffer + _ = t.Execute(&buf, struct{ HTML template.HTML }{HTML: template.HTML(htmlBuf.String())}) + + s.respond(w, r, "Ranking Data", buf.String()) +} + +// ─── GET /books/{slug} — chapter list ──────────────────────────────────────── + +const bookTmpl = ` +
    + + ← All books + + +
    + {{if .Meta.Cover}} + cover + {{end}} +
    +
    +

    {{.Meta.Title}}

    + {{if .Meta.SourceURL}} +
    + + +
    + {{end}} +
    + {{if .Meta.Author}}

    {{.Meta.Author}}

    {{end}} +
    + {{if .Meta.Status}}{{.Meta.Status}}{{end}} + {{if .Meta.TotalChapters}}{{.Meta.TotalChapters}} ch total{{end}} + {{len .Chapters}} downloaded +
    + {{if .Meta.Summary}} +

    {{.Meta.Summary}}

    + {{end}} +
    +
    + +

    Chapters

    + +
    ` + +func (s *Server) handleBook(w http.ResponseWriter, r *http.Request) { + slug := r.PathValue("slug") + + meta, ok, err := s.writer.ReadMetadata(slug) + if err != nil { + http.Error(w, "failed to read metadata: "+err.Error(), http.StatusInternalServerError) + return + } + if !ok { + http.NotFound(w, r) + return + } + + chapters, err := s.writer.ListChapters(slug) + if err != nil { + http.Error(w, "failed to list chapters: "+err.Error(), http.StatusInternalServerError) + return + } + + t := template.Must(template.New("book").Parse(bookTmpl)) + var buf bytes.Buffer + _ = t.Execute(&buf, struct { + Slug string + Meta interface{} + Chapters interface{} + }{Slug: slug, Meta: meta, Chapters: chapters}) + + s.respond(w, r, meta.Title, buf.String()) +} + +// ─── GET /books/{slug}/chapters/{n} — chapter reader ───────────────────────── + +const chapterTmpl = ` +
    +
    + + ← Chapter list + +
    + {{if .PrevN}} + + ← Prev + + {{end}} + {{if .NextN}} + + Next → + + {{end}} +
    +
    + + +
    +
    + + + + + + + + +
    + Speed + + 1.0× +
    + +
    + + + +
    + +
    + + +
    + {{.HTML}} +
    + +
    + {{if .PrevN}} + + ← Previous chapter + + {{else}}{{end}} + {{if .NextN}} + + Next chapter → + + {{else}}{{end}} +
    +
    + + + +` + +func (s *Server) handleChapter(w http.ResponseWriter, r *http.Request) { + slug := r.PathValue("slug") + n, err := strconv.Atoi(r.PathValue("n")) + if err != nil || n < 1 { + http.NotFound(w, r) + return + } + + raw, err := s.writer.ReadChapter(slug, n) + if err != nil { + http.NotFound(w, r) + return + } + + var htmlBuf bytes.Buffer + if err := md.Convert([]byte(raw), &htmlBuf); err != nil { + http.Error(w, "markdown render error: "+err.Error(), http.StatusInternalServerError) + return + } + + chapters, _ := s.writer.ListChapters(slug) + prevN, nextN := adjacentChapters(chapters, n) + + title := firstHeading(raw, fmt.Sprintf("Chapter %d", n)) + + t := template.Must(template.New("chapter").Parse(chapterTmpl)) + var buf bytes.Buffer + _ = t.Execute(&buf, struct { + Slug string + HTML template.HTML + PrevN int + NextN int + ChapterN int + KokoroURL string + Voices []string + DefaultVoice string + }{ + Slug: slug, + HTML: template.HTML(htmlBuf.String()), + PrevN: prevN, + NextN: nextN, + ChapterN: n, + KokoroURL: s.kokoroURL, + Voices: kokoroVoices, + DefaultVoice: s.kokoroVoice, + }) + + s.respond(w, r, title, buf.String()) +} + +// ─── helpers ────────────────────────────────────────────────────────────────── + +// stripMarkdown removes Markdown syntax and returns clean plain text. 
+func stripMarkdown(src string) string { + src = regexp.MustCompile(`(?m)^#{1,6}\s+`).ReplaceAllString(src, "") + src = regexp.MustCompile(`\*{1,3}|_{1,3}`).ReplaceAllString(src, "") + src = regexp.MustCompile("(?s)```.*?```").ReplaceAllString(src, "") + src = regexp.MustCompile("`[^`]*`").ReplaceAllString(src, "") + src = regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`).ReplaceAllString(src, "$1") + src = regexp.MustCompile(`!\[[^\]]*\]\([^)]+\)`).ReplaceAllString(src, "") + src = regexp.MustCompile(`(?m)^>\s?`).ReplaceAllString(src, "") + src = regexp.MustCompile(`(?m)^[-*_]{3,}\s*$`).ReplaceAllString(src, "") + src = regexp.MustCompile(`\n{3,}`).ReplaceAllString(src, "\n\n") + return strings.TrimSpace(src) +} + +// adjacentChapters returns the chapter numbers immediately before and after n +// in the sorted chapters list. 0 means "does not exist". +func adjacentChapters(chapters []writer.ChapterInfo, n int) (prev, next int) { + for i, ch := range chapters { + if ch.Number == n { + if i > 0 { + prev = chapters[i-1].Number + } + if i < len(chapters)-1 { + next = chapters[i+1].Number + } + return + } + } + return +} + +// firstHeading returns the text of the first non-empty line, stripping a +// leading "# " markdown heading marker. Falls back to fallback. 
+func firstHeading(md, fallback string) string { + for _, line := range strings.SplitN(md, "\n", 20) { + line = strings.TrimSpace(line) + if line == "" { + continue + } + return strings.TrimPrefix(line, "# ") + } + return fallback +} + +// ─── POST /ui/scrape/book — form submission ─────────────────────────────────── + +func (s *Server) handleUIScrapeBook(w http.ResponseWriter, r *http.Request) { + bookURL := strings.TrimSpace(r.FormValue("url")) + if bookURL == "" { + renderFragment(w, scrapeStatusHTML("error", "Please enter a book URL.")) + return + } + + s.mu.Lock() + already := s.running + if !already { + s.running = true + } + s.mu.Unlock() + + if already { + renderFragment(w, scrapeStatusHTML("busy", "A scrape job is already running. Please wait.")) + return + } + + cfg := s.oCfg + cfg.SingleBookURL = bookURL + + go func() { + defer func() { + s.mu.Lock() + s.running = false + s.mu.Unlock() + }() + + ctx, cancel := context.WithTimeout(context.Background(), 24*time.Hour) + defer cancel() + + o := orchestrator.New(cfg, s.novel, s.log) + if err := o.Run(ctx); err != nil { + s.log.Error("UI scrape job failed", "url", bookURL, "err", err) + } + }() + + // Return a status badge that polls until the job finishes. + renderFragment(w, scrapeStatusHTML("running", "Scraping "+bookURL+"…")) +} + +// ─── GET /ui/scrape/status — polling endpoint ───────────────────────────────── + +func (s *Server) handleUIScrapeStatus(w http.ResponseWriter, r *http.Request) { + s.mu.Lock() + running := s.running + s.mu.Unlock() + + if running { + // Keep polling every 3 s while the job is in progress. + renderFragment(w, scrapeStatusHTML("running", "Scraping in progress…")) + return + } + // Job finished — show a done badge and stop polling. + renderFragment(w, scrapeStatusHTML("done", "Done! Refresh the page to see new books.")) +} + +// scrapeStatusHTML returns a self-contained status badge fragment. +// state is one of: "running" | "done" | "busy" | "error". 
+func scrapeStatusHTML(state, msg string) string { + var colour, dot, poll string + switch state { + case "running": + colour = "text-amber-300 bg-amber-950 border-amber-800" + dot = `` + poll = `hx-get="/ui/scrape/status" hx-trigger="every 3s" hx-target="this" hx-swap="outerHTML"` + case "done": + colour = "text-green-300 bg-green-950 border-green-800" + dot = `` + case "busy": + colour = "text-yellow-300 bg-yellow-950 border-yellow-800" + dot = `` + default: // error + colour = "text-red-300 bg-red-950 border-red-800" + dot = `` + } + return fmt.Sprintf( + `
    %s%s
    `, + colour, poll, dot, template.HTMLEscapeString(msg), + ) +} diff --git a/scraper/internal/writer/writer.go b/scraper/internal/writer/writer.go index a3092c1..167e0c2 100644 --- a/scraper/internal/writer/writer.go +++ b/scraper/internal/writer/writer.go @@ -19,6 +19,7 @@ import ( "fmt" "os" "path/filepath" + "regexp" "sort" "strconv" "strings" @@ -144,10 +145,32 @@ func (w *Writer) ListBooks() ([]scraper.BookMeta, error) { return books, nil } +// LocalSlugs returns the set of book slugs that have a metadata.yaml on disk. +// It is cheaper than ListBooks because it only checks for file existence rather +// than fully parsing every YAML file. +func (w *Writer) LocalSlugs() map[string]bool { + entries, err := os.ReadDir(w.root) + if err != nil { + return map[string]bool{} + } + slugs := make(map[string]bool, len(entries)) + for _, e := range entries { + if !e.IsDir() { + continue + } + metaPath := filepath.Join(w.root, e.Name(), "metadata.yaml") + if _, err := os.Stat(metaPath); err == nil { + slugs[e.Name()] = true + } + } + return slugs +} + // ChapterInfo is a lightweight chapter descriptor derived from on-disk files. type ChapterInfo struct { Number int - Title string // first line of the markdown file (without the leading "# ") + Title string // chapter name, cleaned of number prefix and trailing date + Date string // relative date scraped alongside the title, e.g. "1 year ago" } // ListChapters returns all chapters on disk for slug, sorted by number. 
@@ -171,8 +194,8 @@ func (w *Writer) ListChapters(slug string) ([]ChapterInfo, error) { if err != nil { continue } - title := chapterTitle(f, n) - chapters = append(chapters, ChapterInfo{Number: n, Title: title}) + title, date := chapterTitle(f, n) + chapters = append(chapters, ChapterInfo{Number: n, Title: title, Date: date}) } } } @@ -184,19 +207,55 @@ func (w *Writer) ListChapters(slug string) ([]ChapterInfo, error) { // chapterTitle reads the first non-empty line of a markdown file and strips // the leading "# " heading marker. Falls back to "Chapter N". -func chapterTitle(path string, n int) string { +func chapterTitle(path string, n int) (title, date string) { data, err := os.ReadFile(path) if err != nil { - return fmt.Sprintf("Chapter %d", n) + return fmt.Sprintf("Chapter %d", n), "" } for _, line := range strings.SplitN(string(data), "\n", 10) { line = strings.TrimSpace(line) if line == "" { continue } - return strings.TrimPrefix(line, "# ") + line = strings.TrimPrefix(line, "# ") + return splitChapterTitle(line) } - return fmt.Sprintf("Chapter %d", n) + return fmt.Sprintf("Chapter %d", n), "" +} + +// splitChapterTitle separates the human-readable chapter name from the +// trailing relative-date string that novelfire.net appends to the heading. +// Examples of raw heading text (after stripping "# "): +// +// "1 Chapter 1 - 1: The Academy's Weakest1 year ago" +// "2 Chapter 2 - Enter the Storm3 months ago" +// +// The pattern is: optional leading number+whitespace, then the real title, +// then a date that matches /\d+\s+(second|minute|hour|day|week|month|year)s?\s+ago$/ +func splitChapterTitle(raw string) (title, date string) { + // Strip a leading chapter-number index that novelfire sometimes prepends. + // It looks like "1 " or "12 " at the very start. 
+ raw = strings.TrimSpace(raw) + if idx := strings.IndexFunc(raw, func(r rune) bool { return r == ' ' || r == '\t' }); idx > 0 { + prefix := raw[:idx] + allDigit := true + for _, c := range prefix { + if c < '0' || c > '9' { + allDigit = false + break + } + } + if allDigit { + raw = strings.TrimSpace(raw[idx:]) + } + } + + // Match a trailing relative date: " [s] ago" + dateRe := regexp.MustCompile(`\s*(\d+\s+(?:second|minute|hour|day|week|month|year)s?\s+ago)\s*$`) + if m := dateRe.FindStringSubmatchIndex(raw); m != nil { + return strings.TrimSpace(raw[:m[0]]), strings.TrimSpace(raw[m[2]:m[3]]) + } + return raw, "" } // ReadChapter returns the raw markdown content for chapter number n of slug. @@ -211,6 +270,139 @@ func (w *Writer) ReadChapter(slug string, n int) (string, error) { return string(data), nil } +// ─── Ranking ───────────────────────────────────────────────────────────────── + +// RankingItem represents a single entry in the ranking. +type RankingItem struct { + Rank int `yaml:"rank"` + Slug string `yaml:"slug"` + Title string `yaml:"title"` + Author string `yaml:"author,omitempty"` + Cover string `yaml:"cover,omitempty"` + Status string `yaml:"status,omitempty"` + Genres []string `yaml:"genres,omitempty"` + SourceURL string `yaml:"source_url,omitempty"` +} + +// WriteRanking saves the ranking items as markdown to static/ranking.md. 
+func (w *Writer) WriteRanking(items []RankingItem) error { + path := filepath.Clean(w.rankingPath()) + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 0o755); err != nil { + return fmt.Errorf("writer: mkdir %s: %w", dir, err) + } + + var sb strings.Builder + sb.WriteString("# Novel Rankings\n\n") + sb.WriteString("| Rank | Title | Cover | Status | Genres | URL |\n") + sb.WriteString("|------|-------|-------|--------|--------|-----|\n") + for _, item := range items { + genres := strings.Join(item.Genres, ", ") + if genres == "" { + genres = "-" + } + sb.WriteString(fmt.Sprintf("| %d | %s | %s | %s | %s | %s |\n", + item.Rank, item.Title, item.Cover, item.Status, genres, item.SourceURL)) + } + + if err := os.WriteFile(path, []byte(sb.String()), 0o644); err != nil { + return fmt.Errorf("writer: write ranking %s: %w", path, err) + } + return nil +} + +// ReadRanking reads the ranking.md file if it exists. +func (w *Writer) ReadRanking() (string, error) { + path := w.rankingPath() + data, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return "", nil + } + return "", fmt.Errorf("writer: read ranking: %w", err) + } + return string(data), nil +} + +// ReadRankingItems parses ranking.md back into a slice of RankingItem. +// Returns nil slice (not an error) when the file does not exist yet. +func (w *Writer) ReadRankingItems() ([]RankingItem, error) { + markdown, err := w.ReadRanking() + if err != nil || markdown == "" { + return nil, err + } + + var items []RankingItem + for _, line := range strings.Split(markdown, "\n") { + // Only process data rows: start and end with '|', not header/separator rows. + line = strings.TrimSpace(line) + if !strings.HasPrefix(line, "|") || !strings.HasSuffix(line, "|") { + continue + } + // Strip leading/trailing '|' and split on '|'. 
+ inner := strings.TrimPrefix(strings.TrimSuffix(line, "|"), "|") + cols := strings.Split(inner, "|") + if len(cols) < 6 { + continue + } + for i, c := range cols { + cols[i] = strings.TrimSpace(c) + } + // Skip header row and separator row. + if cols[0] == "Rank" || strings.HasPrefix(cols[0], "---") { + continue + } + rank, err := strconv.Atoi(cols[0]) + if err != nil { + continue + } + title := cols[1] + cover := cols[2] + status := cols[3] + genresStr := cols[4] + sourceURL := cols[5] + + var genres []string + if genresStr != "-" && genresStr != "" { + for _, g := range strings.Split(genresStr, ",") { + g = strings.TrimSpace(g) + if g != "" { + genres = append(genres, g) + } + } + } + + // Derive slug from source URL (last path segment). + slug := "" + if sourceURL != "" { + parts := strings.Split(strings.TrimRight(sourceURL, "/"), "/") + if len(parts) > 0 { + slug = parts[len(parts)-1] + } + } + + items = append(items, RankingItem{ + Rank: rank, + Slug: slug, + Title: title, + Cover: cover, + Status: status, + Genres: genres, + SourceURL: sourceURL, + }) + } + return items, nil +} + +// RankingFileInfo returns os.FileInfo for the ranking.md file, if it exists. +func (w *Writer) RankingFileInfo() (os.FileInfo, error) { + return os.Stat(w.rankingPath()) +} + +func (w *Writer) rankingPath() string { + return filepath.Join(w.root, "ranking.md") +} + // bookDir returns the root directory for a book slug. func (w *Writer) bookDir(slug string) string { return filepath.Join(w.root, slug)