feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text
- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes) with local-library annotation and one-click scrape buttons
- Add StrategyDirect (plain HTTP client) as a new browser strategy; the default strategy is now 'direct' for chapter fetching and 'content' for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)
- Fix chapter numbering bug: numbers are now derived from the URL path (/chapter-N) rather than list position, correcting newest-first ordering
- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved source_url without knowing the original URL
- Extend NovelScraper interface with RankingProvider (ScrapeRanking)
- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions timeout set to 60 s, content/scrape client defaults raised to 90 s
- Add cover extraction fix (figure.cover > img rather than bare img.cover)
- Add AGENTS.md and .aiignore for AI tooling context
- Add integration tests for browser client and novelfire scraper (build tag: integration) and unit tests for chapterNumberFromURL and pagination
This commit is contained in:
20
.aiignore
Normal file
20
.aiignore
Normal file
@@ -0,0 +1,20 @@
|
||||
# AI Indexing Ignore
|
||||
# These directories/files are excluded from AI context indexing for speed
|
||||
|
||||
# Generated/scraped content
|
||||
scraper/static/
|
||||
|
||||
# Build artifacts
|
||||
scraper/bin/
|
||||
*.exe
|
||||
|
||||
# Dependencies (if using Go modules, the AI doesn't need vendor/)
|
||||
# vendor/
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
22
.env.example
22
.env.example
@@ -13,11 +13,29 @@ BROWSERLESS_QUEUED=100
|
||||
# Per-session timeout in ms
|
||||
BROWSERLESS_TIMEOUT=60000
|
||||
|
||||
# Which Browserless strategy the scraper uses: content | scrape | cdp
|
||||
BROWSERLESS_STRATEGY=content
|
||||
# Optional webhook URL for Browserless error alerts (leave empty to disable)
|
||||
ERROR_ALERT_URL=
|
||||
|
||||
# Which Browserless strategy the scraper uses: content | scrape | cdp | direct
|
||||
BROWSERLESS_STRATEGY=direct
|
||||
|
||||
# Strategy for URL retrieval (chapter list). Uses browserless content strategy by default.
|
||||
# Set to direct to use plain HTTP, or content/scrape/cdp for browserless.
|
||||
BROWSERLESS_URL_STRATEGY=content
|
||||
|
||||
# Chapter worker goroutines (0 = NumCPU inside the container)
|
||||
SCRAPER_WORKERS=0
|
||||
|
||||
# Host path to mount as the static output directory
|
||||
STATIC_ROOT=./static/books
|
||||
|
||||
# ── Kokoro-FastAPI TTS ────────────────────────────────────────────────────────
|
||||
# Base URL for the Kokoro-FastAPI service. When running via docker-compose the
|
||||
# default (http://kokoro:8880) is wired in automatically; override here only if
|
||||
# you are pointing at an external or GPU instance.
|
||||
KOKORO_URL=http://kokoro:8880
|
||||
|
||||
# Default voice used for chapter narration.
|
||||
# Single voices: af_bella, af_sky, af_heart, am_adam, …
|
||||
# Mixed voices: af_bella+af_sky or af_bella(2)+af_sky(1) (weighted blend)
|
||||
KOKORO_VOICE=af_bella
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,7 +1,3 @@
|
||||
# ── Compiled binary ────────────────────────────────────────────────────────────
|
||||
/scraper
|
||||
/scraper-*
|
||||
|
||||
# ── Go toolchain ───────────────────────────────────────────────────────────────
|
||||
*.test
|
||||
*.out
|
||||
|
||||
89
AGENTS.md
Normal file
89
AGENTS.md
Normal file
@@ -0,0 +1,89 @@
|
||||
# libnovel Project
|
||||
|
||||
Go web scraper for novelfire.net with TTS support via Kokoro-FastAPI.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
scraper/
|
||||
├── cmd/scraper/main.go # Entry point: 'run' (one-shot) and 'serve' (HTTP server)
|
||||
├── internal/
|
||||
│ ├── orchestrator/orchestrator.go # Coordinates catalogue walk, metadata extraction, chapter scraping
|
||||
│ ├── browser/ # Browser client (content/scrape/cdp/direct strategies) via Browserless
|
||||
│ ├── novelfire/scraper.go # novelfire.net specific scraping logic
|
||||
│ ├── server/server.go # HTTP API (POST /scrape, POST /scrape/book)
|
||||
│ ├── writer/writer.go # File writer (metadata.yaml, chapter .md files)
|
||||
│ └── scraper/interfaces.go # NovelScraper interface definition
|
||||
└── static/books/ # Output directory for scraped content
|
||||
```
|
||||
|
||||
## Key Concepts
|
||||
|
||||
- **Orchestrator**: Manages concurrency - catalogue streaming → per-book metadata goroutines → chapter worker pool
|
||||
- **Browser Client**: 4 strategies — content/scrape/cdp via the Browserless Chrome container, plus direct (plain HTTP, no JS rendering)
|
||||
- **Writer**: Writes metadata.yaml and chapter markdown files to `static/books/{slug}/vol-0/1-50/`
|
||||
- **Server**: HTTP API with async scrape jobs, UI for browsing books/chapters, chapter-text endpoint for TTS
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
# Build
|
||||
cd scraper && go build -o bin/scraper ./cmd/scraper
|
||||
|
||||
# One-shot scrape (full catalogue)
|
||||
./bin/scraper run
|
||||
|
||||
# Single book
|
||||
./bin/scraper run --url https://novelfire.net/book/xxx
|
||||
|
||||
# HTTP server
|
||||
./bin/scraper serve
|
||||
|
||||
# Tests
|
||||
cd scraper && go test ./...
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
| Variable | Description | Default |
|
||||
|----------|-------------|---------|
|
||||
| BROWSERLESS_URL | Browserless Chrome endpoint | http://localhost:3000 |
|
||||
| BROWSERLESS_STRATEGY | content \| scrape \| cdp \| direct | direct |
|
||||
| SCRAPER_WORKERS | Chapter goroutines | NumCPU |
|
||||
| SCRAPER_STATIC_ROOT | Output directory | ./static/books |
|
||||
| SCRAPER_HTTP_ADDR | HTTP listen address | :8080 |
|
||||
| KOKORO_URL | Kokoro TTS endpoint | http://localhost:8880 |
|
||||
| KOKORO_VOICE | Default TTS voice | af_bella |
|
||||
| LOG_LEVEL | debug \| info \| warn \| error | info |
|
||||
|
||||
## Docker
|
||||
|
||||
```bash
|
||||
docker-compose up -d # Starts browserless, kokoro, scraper
|
||||
```
|
||||
|
||||
## Code Patterns
|
||||
|
||||
- Uses `log/slog` for structured logging
|
||||
- Context-based cancellation throughout
|
||||
- Worker pool pattern in orchestrator (channel + goroutines)
|
||||
- Mutex for single async job (409 on concurrent scrape requests)
|
||||
|
||||
## AI Context Tips
|
||||
|
||||
- Primary files to modify: `orchestrator.go`, `server.go`, `scraper.go`, `browser/*.go`
|
||||
- To add new source: implement `NovelScraper` interface from `internal/scraper/interfaces.go`
|
||||
- Skip `static/` directory - generated content, not source
|
||||
|
||||
## Speed Up AI Sessions (Optional)
|
||||
|
||||
For faster AI context loading, use **Context7** (free, local indexing):
|
||||
|
||||
```bash
|
||||
# Install and index once
|
||||
npx @context7/cli@latest index --path . --ignore .aiignore
|
||||
|
||||
# After first run, AI tools will query the index instead of re-scanning files
|
||||
```
|
||||
|
||||
VSCode extension: https://marketplace.visualstudio.com/items?itemName=context7.context7
|
||||
@@ -16,6 +16,8 @@ services:
|
||||
QUEUED: "${BROWSERLESS_QUEUED:-100}"
|
||||
# Per-session timeout in ms.
|
||||
TIMEOUT: "${BROWSERLESS_TIMEOUT:-60000}"
|
||||
# Optional webhook URL for Browserless error alerts.
|
||||
ERROR_ALERT_URL: "${ERROR_ALERT_URL:-}"
|
||||
ports:
|
||||
- "3000:3000"
|
||||
# Shared memory is required for Chrome.
|
||||
@@ -26,6 +28,21 @@ services:
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
# ─── Kokoro-FastAPI (TTS) ────────────────────────────────────────────────────
|
||||
# CPU image; swap for ghcr.io/remsky/kokoro-fastapi-gpu:latest on NVIDIA hosts.
|
||||
# Models are baked in — no volume mount required for the default voice set.
|
||||
kokoro:
|
||||
image: ghcr.io/remsky/kokoro-fastapi-cpu:latest
|
||||
container_name: libnovel-kokoro
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8880:8880"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8880/health"]
|
||||
interval: 15s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
# ─── Scraper ─────────────────────────────────────────────────────────────────
|
||||
scraper:
|
||||
build:
|
||||
@@ -34,17 +51,23 @@ services:
|
||||
container_name: libnovel-scraper
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
browserless:
|
||||
kokoro:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
BROWSERLESS_URL: "http://browserless:3000"
|
||||
BROWSERLESS_TOKEN: "${BROWSERLESS_TOKEN:-}"
|
||||
# content | scrape | cdp — swap to test different strategies.
|
||||
BROWSERLESS_STRATEGY: "${BROWSERLESS_STRATEGY:-content}"
|
||||
# content | scrape | cdp | direct — swap to test different strategies.
|
||||
BROWSERLESS_STRATEGY: "${BROWSERLESS_STRATEGY:-direct}"
|
||||
# Strategy for URL retrieval (chapter list). Default: content (browserless)
|
||||
BROWSERLESS_URL_STRATEGY: "${BROWSERLESS_URL_STRATEGY:-content}"
|
||||
# 0 → defaults to NumCPU inside the container.
|
||||
SCRAPER_WORKERS: "${SCRAPER_WORKERS:-0}"
|
||||
SCRAPER_STATIC_ROOT: "/app/static/books"
|
||||
SCRAPER_HTTP_ADDR: ":8080"
|
||||
LOG_LEVEL: "debug"
|
||||
# Kokoro-FastAPI TTS endpoint.
|
||||
KOKORO_URL: "${KOKORO_URL:-http://localhost:8880}"
|
||||
KOKORO_VOICE: "${KOKORO_VOICE:-af_bella}"
|
||||
ports:
|
||||
- "8080:8080"
|
||||
volumes:
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
// SCRAPER_WORKERS Chapter goroutine count (default: NumCPU)
|
||||
// SCRAPER_STATIC_ROOT Output directory (default: ./static/books)
|
||||
// SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
|
||||
// KOKORO_URL Kokoro-FastAPI base URL (default: "")
|
||||
// KOKORO_VOICE Default TTS voice (default: af_bella)
|
||||
// LOG_LEVEL debug | info | warn | error (default: info)
|
||||
package main
|
||||
|
||||
@@ -30,11 +32,13 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/libnovel/scraper/internal/browser"
|
||||
"github.com/libnovel/scraper/internal/novelfire"
|
||||
"github.com/libnovel/scraper/internal/orchestrator"
|
||||
"github.com/libnovel/scraper/internal/server"
|
||||
"github.com/libnovel/scraper/internal/writer"
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -73,11 +77,18 @@ func run(log *slog.Logger) error {
|
||||
browserCfg.MaxConcurrent = n
|
||||
}
|
||||
}
|
||||
if s := os.Getenv("BROWSERLESS_TIMEOUT"); s != "" {
|
||||
if n, err := strconv.Atoi(s); err == nil && n > 0 {
|
||||
browserCfg.Timeout = time.Duration(n) * time.Second
|
||||
}
|
||||
}
|
||||
|
||||
strategy := browser.Strategy(strings.ToLower(envOr("BROWSERLESS_STRATEGY", string(browser.StrategyContent))))
|
||||
strategy := browser.Strategy(strings.ToLower(envOr("BROWSERLESS_STRATEGY", string(browser.StrategyDirect))))
|
||||
urlStrategy := browser.Strategy(strings.ToLower(envOr("BROWSERLESS_URL_STRATEGY", string(browser.StrategyContent))))
|
||||
bc := newBrowserClient(strategy, browserCfg)
|
||||
urlClient := newBrowserClient(urlStrategy, browserCfg)
|
||||
|
||||
nf := novelfire.New(bc, log)
|
||||
nf := novelfire.New(bc, log, urlClient)
|
||||
|
||||
workers := 0
|
||||
if s := os.Getenv("SCRAPER_WORKERS"); s != "" {
|
||||
@@ -114,15 +125,44 @@ func run(log *slog.Logger) error {
|
||||
o := orchestrator.New(oCfg, nf, log)
|
||||
return o.Run(ctx)
|
||||
|
||||
case "refresh":
|
||||
// refresh <slug> - re-scrape a book from its saved source_url
|
||||
if len(args) < 2 {
|
||||
return fmt.Errorf("refresh command requires a book slug argument")
|
||||
}
|
||||
slug := args[1]
|
||||
w := writer.New(oCfg.StaticRoot)
|
||||
meta, ok, err := w.ReadMetadata(slug)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read metadata for %s: %w", slug, err)
|
||||
}
|
||||
if !ok {
|
||||
return fmt.Errorf("book %q not found in %s", slug, oCfg.StaticRoot)
|
||||
}
|
||||
if meta.SourceURL == "" {
|
||||
return fmt.Errorf("book %q has no source_url in metadata", slug)
|
||||
}
|
||||
oCfg.SingleBookURL = meta.SourceURL
|
||||
log.Info("refreshing book from source_url",
|
||||
"slug", slug,
|
||||
"source_url", meta.SourceURL,
|
||||
)
|
||||
o := orchestrator.New(oCfg, nf, log)
|
||||
return o.Run(ctx)
|
||||
|
||||
case "serve":
|
||||
addr := envOr("SCRAPER_HTTP_ADDR", ":8080")
|
||||
kokoroURL := envOr("KOKORO_URL", "")
|
||||
kokoroVoice := envOr("KOKORO_VOICE", "af_bella")
|
||||
log.Info("starting HTTP server",
|
||||
"addr", addr,
|
||||
"strategy", strategy,
|
||||
"workers", workers,
|
||||
"max_concurrent", browserCfg.MaxConcurrent,
|
||||
"kokoro_url", kokoroURL,
|
||||
"kokoro_voice", kokoroVoice,
|
||||
)
|
||||
srv := server.New(addr, oCfg, nf, log)
|
||||
srv := server.New(addr, oCfg, nf, log, kokoroURL, kokoroVoice)
|
||||
return srv.ListenAndServe(ctx)
|
||||
|
||||
default:
|
||||
@@ -136,6 +176,8 @@ func newBrowserClient(strategy browser.Strategy, cfg browser.Config) browser.Bro
|
||||
return browser.NewScrapeClient(cfg)
|
||||
case browser.StrategyCDP:
|
||||
return browser.NewCDPClient(cfg)
|
||||
case browser.StrategyDirect:
|
||||
return browser.NewDirectHTTPClient(cfg)
|
||||
default:
|
||||
return browser.NewContentClient(cfg)
|
||||
}
|
||||
@@ -153,16 +195,21 @@ func printUsage() {
|
||||
|
||||
Commands:
|
||||
run [--url <book-url>] One-shot: scrape full catalogue, or a single book
|
||||
refresh <slug> Re-scrape a book from its saved source_url
|
||||
serve Start HTTP server (POST /scrape, POST /scrape/book)
|
||||
|
||||
Environment variables:
|
||||
BROWSERLESS_URL Browserless base URL (default: http://localhost:3000)
|
||||
BROWSERLESS_TOKEN API token (default: "")
|
||||
BROWSERLESS_STRATEGY content | scrape | cdp (default: content)
|
||||
BROWSERLESS_STRATEGY content|scrape|cdp|direct (default: direct)
|
||||
BROWSERLESS_URL_STRATEGY Strategy for URL retrieval (default: content)
|
||||
BROWSERLESS_MAX_CONCURRENT Max simultaneous sessions (default: 5)
|
||||
BROWSERLESS_TIMEOUT HTTP request timeout sec (default: 90)
|
||||
SCRAPER_WORKERS Chapter goroutines (default: NumCPU = %d)
|
||||
SCRAPER_STATIC_ROOT Output directory (default: ./static/books)
|
||||
SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
|
||||
KOKORO_URL Kokoro-FastAPI base URL (default: "", TTS disabled)
|
||||
KOKORO_VOICE Default TTS voice (default: af_bella)
|
||||
LOG_LEVEL debug|info|warn|error (default: info)
|
||||
`, runtime.NumCPU())
|
||||
}
|
||||
|
||||
@@ -65,7 +65,7 @@ type contentClient struct {
|
||||
// NewContentClient returns a BrowserClient that uses POST /content.
|
||||
func NewContentClient(cfg Config) BrowserClient {
|
||||
if cfg.Timeout == 0 {
|
||||
cfg.Timeout = 60 * time.Second
|
||||
cfg.Timeout = 90 * time.Second
|
||||
}
|
||||
return &contentClient{
|
||||
cfg: cfg,
|
||||
@@ -135,7 +135,7 @@ type scrapeClient struct {
|
||||
// NewScrapeClient returns a BrowserClient that uses POST /scrape.
|
||||
func NewScrapeClient(cfg Config) BrowserClient {
|
||||
if cfg.Timeout == 0 {
|
||||
cfg.Timeout = 60 * time.Second
|
||||
cfg.Timeout = 90 * time.Second
|
||||
}
|
||||
return &scrapeClient{
|
||||
cfg: cfg,
|
||||
|
||||
68
scraper/internal/browser/http.go
Normal file
68
scraper/internal/browser/http.go
Normal file
@@ -0,0 +1,68 @@
|
||||
package browser
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
type httpClient struct {
|
||||
cfg Config
|
||||
http *http.Client
|
||||
sem chan struct{}
|
||||
}
|
||||
|
||||
func NewDirectHTTPClient(cfg Config) BrowserClient {
|
||||
if cfg.Timeout == 0 {
|
||||
cfg.Timeout = 30 * time.Second
|
||||
}
|
||||
return &httpClient{
|
||||
cfg: cfg,
|
||||
http: &http.Client{Timeout: cfg.Timeout},
|
||||
sem: makeSem(cfg.MaxConcurrent),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *httpClient) Strategy() Strategy { return StrategyDirect }
|
||||
|
||||
func (c *httpClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
|
||||
if err := acquire(ctx, c.sem); err != nil {
|
||||
return "", fmt.Errorf("http: semaphore: %w", err)
|
||||
}
|
||||
defer release(c.sem)
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, req.URL, nil)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("http: build request: %w", err)
|
||||
}
|
||||
httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
||||
httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||
httpReq.Header.Set("Accept-Language", "en-US,en;q=0.5")
|
||||
|
||||
resp, err := c.http.Do(httpReq)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("http: do request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
b, _ := io.ReadAll(resp.Body)
|
||||
return "", fmt.Errorf("http: unexpected status %d: %s", resp.StatusCode, b)
|
||||
}
|
||||
|
||||
raw, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("http: read body: %w", err)
|
||||
}
|
||||
return string(raw), nil
|
||||
}
|
||||
|
||||
func (c *httpClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeResponse, error) {
|
||||
return ScrapeResponse{}, fmt.Errorf("http client does not support ScrapePage; use browserless")
|
||||
}
|
||||
|
||||
func (c *httpClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
|
||||
return fmt.Errorf("http client does not support CDP; use browserless")
|
||||
}
|
||||
152
scraper/internal/browser/integration_test.go
Normal file
152
scraper/internal/browser/integration_test.go
Normal file
@@ -0,0 +1,152 @@
|
||||
//go:build integration
|
||||
|
||||
// Integration tests for the Browserless /content API.
|
||||
//
|
||||
// These tests require a live Browserless instance and are gated behind the
|
||||
// "integration" build tag so they never run in normal `go test ./...` passes.
|
||||
//
|
||||
// Run them with:
|
||||
//
|
||||
// BROWSERLESS_URL=http://localhost:3000 \
|
||||
// BROWSERLESS_TOKEN=your-token \ # omit if auth is disabled
|
||||
// go test -v -tags integration -timeout 120s \
|
||||
// github.com/libnovel/scraper/internal/browser
|
||||
package browser_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/libnovel/scraper/internal/browser"
|
||||
)
|
||||
|
||||
// chapterURL is the novelfire chapter used in every integration sub-test.
|
||||
const chapterURL = "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1"
|
||||
|
||||
// newIntegrationClient reads BROWSERLESS_URL / BROWSERLESS_TOKEN from the
|
||||
// environment and returns a configured contentClient.
|
||||
// The test is skipped when BROWSERLESS_URL is not set.
|
||||
func newIntegrationClient(t *testing.T) browser.BrowserClient {
|
||||
t.Helper()
|
||||
baseURL := os.Getenv("BROWSERLESS_URL")
|
||||
if baseURL == "" {
|
||||
t.Skip("BROWSERLESS_URL not set — skipping integration test")
|
||||
}
|
||||
return browser.NewContentClient(browser.Config{
|
||||
BaseURL: baseURL,
|
||||
Token: os.Getenv("BROWSERLESS_TOKEN"),
|
||||
// Use a generous per-request HTTP timeout so the wait-for-selector
|
||||
// (75 s) doesn't get cut off by the transport layer.
|
||||
Timeout: 120 * time.Second,
|
||||
MaxConcurrent: 1,
|
||||
})
|
||||
}
|
||||
|
||||
// TestIntegration_ChapterContent_ReturnsHTML verifies that a POST /content
|
||||
// request with the production wait-for-selector settings succeeds and that the
|
||||
// returned HTML contains the #content div expected on novelfire chapter pages.
|
||||
func TestIntegration_ChapterContent_ReturnsHTML(t *testing.T) {
|
||||
client := newIntegrationClient(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
|
||||
defer cancel()
|
||||
|
||||
req := browser.ContentRequest{
|
||||
URL: chapterURL,
|
||||
WaitFor: &browser.WaitForSelector{
|
||||
Selector: "#content",
|
||||
Timeout: 5000,
|
||||
},
|
||||
RejectResourceTypes: productionRejectTypes(),
|
||||
}
|
||||
|
||||
html, err := client.GetContent(ctx, req)
|
||||
if err != nil {
|
||||
t.Fatalf("GetContent failed: %v", err)
|
||||
}
|
||||
|
||||
// The #content div must not be empty; presence of <p> tags inside it is a
|
||||
// reliable indicator that chapter paragraphs were rendered.
|
||||
contentIdx := strings.Index(html, `id="content"`)
|
||||
if contentIdx == -1 {
|
||||
t.Fatalf("id=\"content\" not found in response (%d bytes)", len(html))
|
||||
}
|
||||
|
||||
// Look for <p> tags after the #content marker — the chapter text lives there.
|
||||
afterContent := html[contentIdx:]
|
||||
if !strings.Contains(afterContent, "<p") {
|
||||
t.Errorf("#content section contains no <p> tags; JS rendering may have failed.\nSection preview:\n%s",
|
||||
truncate(afterContent, 1000))
|
||||
}
|
||||
|
||||
t.Logf("chapter content section starts at byte %d (total response: %d bytes)", contentIdx, len(html))
|
||||
}
|
||||
|
||||
// TestIntegration_ChapterContent_TimeoutSurfacedCorrectly verifies that a
|
||||
// deliberately too-short timeout returns an error containing "TimeoutError" (the
|
||||
// Browserless error string seen in the failing log entry). This ensures our
|
||||
// error-classification logic in retryGetContent matches real Browserless output.
|
||||
func TestIntegration_ChapterContent_TimeoutSurfacedCorrectly(t *testing.T) {
|
||||
client := newIntegrationClient(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 40*time.Second)
|
||||
defer cancel()
|
||||
|
||||
req := browser.ContentRequest{
|
||||
URL: chapterURL,
|
||||
WaitFor: &browser.WaitForSelector{
|
||||
Selector: "#content",
|
||||
Timeout: 500, // intentionally too short (500 ms) → Browserless will time out
|
||||
},
|
||||
RejectResourceTypes: productionRejectTypes(),
|
||||
}
|
||||
|
||||
_, err := client.GetContent(ctx, req)
|
||||
if err == nil {
|
||||
t.Fatal("expected a timeout error from Browserless, but GetContent succeeded — " +
|
||||
"the page may now load very fast; adjust the timeout threshold")
|
||||
}
|
||||
|
||||
t.Logf("got expected error: %v", err)
|
||||
|
||||
// Browserless wraps navigation timeouts in a 500 response with
|
||||
// "TimeoutError: Navigation timeout" in the body — this is the exact
|
||||
// error that is triggering retries in production.
|
||||
if !strings.Contains(err.Error(), "500") {
|
||||
t.Errorf("expected HTTP 500 status in error, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
// productionRejectTypes returns the same resource-type block-list the
|
||||
// novelfire scraper uses in production, so integration tests exercise the
|
||||
// identical request shape.
|
||||
func productionRejectTypes() []string {
|
||||
return []string{
|
||||
"cspviolationreport",
|
||||
"eventsource",
|
||||
"fedcm",
|
||||
"font",
|
||||
"image",
|
||||
"manifest",
|
||||
"media",
|
||||
"other",
|
||||
"ping",
|
||||
"signedexchange",
|
||||
"stylesheet",
|
||||
"texttrack",
|
||||
"websocket",
|
||||
}
|
||||
}
|
||||
|
||||
// truncate returns the first n bytes of s as a string.
|
||||
func truncate(s string, n int) string {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
}
|
||||
return s[:n] + "…"
|
||||
}
|
||||
@@ -21,6 +21,10 @@ const (
|
||||
// DevTools Protocol). Most powerful; required for complex interactions
|
||||
// (clicking, scrolling, waiting for network idle, etc.).
|
||||
StrategyCDP Strategy = "cdp"
|
||||
|
||||
// StrategyDirect uses a plain HTTP client to fetch HTML directly.
|
||||
// Suitable for sites that don't require JavaScript rendering.
|
||||
StrategyDirect Strategy = "direct"
|
||||
)
|
||||
|
||||
// WaitForSelector describes the waitForSelector option sent to Browserless.
|
||||
@@ -29,12 +33,20 @@ type WaitForSelector struct {
|
||||
Timeout int `json:"timeout,omitempty"` // ms
|
||||
}
|
||||
|
||||
// GotoOptions controls page navigation behavior.
|
||||
type GotoOptions struct {
|
||||
Timeout int `json:"timeout,omitempty"` // ms
|
||||
WaitUntil string `json:"waitUntil,omitempty"` // e.g., "networkidle2", "load"
|
||||
}
|
||||
|
||||
// ContentRequest is the body sent to POST /content.
|
||||
type ContentRequest struct {
|
||||
URL string `json:"url"`
|
||||
WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`
|
||||
WaitForTimeout int `json:"waitForTimeout,omitempty"` // ms
|
||||
RejectResourceTypes []string `json:"rejectResourceTypes,omitempty"` // e.g. ["image","stylesheet"]
|
||||
GotoOptions *GotoOptions `json:"gotoOptions,omitempty"`
|
||||
BestAttempt bool `json:"bestAttempt,omitempty"` // return partial content on timeout/error
|
||||
}
|
||||
|
||||
// ScrapeElement is one element descriptor inside a ScrapeRequest.
|
||||
@@ -45,9 +57,10 @@ type ScrapeElement struct {
|
||||
|
||||
// ScrapeRequest is the body sent to POST /scrape.
|
||||
type ScrapeRequest struct {
|
||||
URL string `json:"url"`
|
||||
Elements []ScrapeElement `json:"elements"`
|
||||
WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`
|
||||
URL string `json:"url"`
|
||||
Elements []ScrapeElement `json:"elements"`
|
||||
WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`
|
||||
GotoOptions *GotoOptions `json:"gotoOptions,omitempty"`
|
||||
}
|
||||
|
||||
// ScrapeResult is one entry in the response from POST /scrape.
|
||||
|
||||
344
scraper/internal/novelfire/integration_test.go
Normal file
344
scraper/internal/novelfire/integration_test.go
Normal file
@@ -0,0 +1,344 @@
|
||||
//go:build integration
|
||||
|
||||
// Integration tests for the novelfire.net Scraper against a live Browserless instance.
|
||||
//
|
||||
// These tests exercise the full scraping stack — Browserless → raw HTML →
|
||||
// novelfire HTML parser — for the book:
|
||||
//
|
||||
// https://novelfire.net/book/a-dragon-against-the-whole-world
|
||||
//
|
||||
// They are gated behind the "integration" build tag so they never run in a
|
||||
// normal `go test ./...` pass.
|
||||
//
|
||||
// Run with:
|
||||
//
|
||||
// BROWSERLESS_URL=http://localhost:3000 \
|
||||
// BROWSERLESS_TOKEN=your-token \ # omit if auth is disabled
|
||||
// go test -v -tags integration -timeout 600s \
|
||||
// github.com/libnovel/scraper/internal/novelfire
|
||||
package novelfire
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/libnovel/scraper/internal/browser"
|
||||
"github.com/libnovel/scraper/internal/scraper"
|
||||
)
|
||||
|
||||
const (
|
||||
integrationBookURL = "https://novelfire.net/book/a-dragon-against-the-whole-world"
|
||||
integrationBookSlug = "a-dragon-against-the-whole-world"
|
||||
integrationBookTitle = "A Dragon against the Whole World"
|
||||
)
|
||||
|
||||
// newIntegrationScraper reads BROWSERLESS_URL / BROWSERLESS_TOKEN from the
|
||||
// environment, constructs a real contentClient, and returns a novelfire Scraper
|
||||
// wired to it. The test is skipped when BROWSERLESS_URL is not set.
|
||||
func newIntegrationScraper(t *testing.T) *Scraper {
|
||||
t.Helper()
|
||||
baseURL := os.Getenv("BROWSERLESS_URL")
|
||||
if baseURL == "" {
|
||||
t.Skip("BROWSERLESS_URL not set — skipping integration test")
|
||||
}
|
||||
client := browser.NewContentClient(browser.Config{
|
||||
BaseURL: baseURL,
|
||||
Token: os.Getenv("BROWSERLESS_TOKEN"),
|
||||
Timeout: 120 * time.Second,
|
||||
MaxConcurrent: 1,
|
||||
})
|
||||
return New(client, nil)
|
||||
}
|
||||
|
||||
// ── Metadata ──────────────────────────────────────────────────────────────────
|
||||
|
||||
// TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle verifies that
|
||||
// ScrapeMetadata fetches the book page and correctly parses at minimum
|
||||
// the slug, title, and source URL.
|
||||
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle(t *testing.T) {
|
||||
s := newIntegrationScraper(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
meta, err := s.ScrapeMetadata(ctx, integrationBookURL)
|
||||
if err != nil {
|
||||
t.Fatalf("ScrapeMetadata failed: %v", err)
|
||||
}
|
||||
|
||||
t.Logf("slug: %s", meta.Slug)
|
||||
t.Logf("title: %s", meta.Title)
|
||||
t.Logf("author: %s", meta.Author)
|
||||
t.Logf("status: %s", meta.Status)
|
||||
t.Logf("genres: %v", meta.Genres)
|
||||
t.Logf("total_chapters: %d", meta.TotalChapters)
|
||||
t.Logf("source_url: %s", meta.SourceURL)
|
||||
|
||||
if meta.Slug != integrationBookSlug {
|
||||
t.Errorf("slug = %q, want %q", meta.Slug, integrationBookSlug)
|
||||
}
|
||||
if meta.Title == "" {
|
||||
t.Error("title is empty")
|
||||
}
|
||||
if !strings.EqualFold(meta.Title, integrationBookTitle) {
|
||||
// Warn rather than hard-fail — the site may reword the title.
|
||||
t.Logf("WARN: title = %q, expected something like %q", meta.Title, integrationBookTitle)
|
||||
}
|
||||
if meta.SourceURL != integrationBookURL {
|
||||
t.Errorf("source_url = %q, want %q", meta.SourceURL, integrationBookURL)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields verifies that
|
||||
// every optional field (author, status, genres, summary, total_chapters) is
|
||||
// populated. A missing field is a warning, not a hard failure, because the
|
||||
// site may change its HTML structure.
|
||||
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields(t *testing.T) {
|
||||
s := newIntegrationScraper(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
meta, err := s.ScrapeMetadata(ctx, integrationBookURL)
|
||||
if err != nil {
|
||||
t.Fatalf("ScrapeMetadata failed: %v", err)
|
||||
}
|
||||
|
||||
type check struct {
|
||||
field string
|
||||
empty bool
|
||||
}
|
||||
checks := []check{
|
||||
{"author", meta.Author == ""},
|
||||
{"status", meta.Status == ""},
|
||||
{"summary", meta.Summary == ""},
|
||||
{"genres", len(meta.Genres) == 0},
|
||||
{"total_chapters", meta.TotalChapters == 0},
|
||||
}
|
||||
for _, c := range checks {
|
||||
if c.empty {
|
||||
t.Errorf("field %q is empty — HTML selector may have broken", c.field)
|
||||
}
|
||||
}
|
||||
|
||||
// total_chapters must be a positive integer.
|
||||
if meta.TotalChapters < 1 {
|
||||
t.Errorf("total_chapters = %d, want >= 1", meta.TotalChapters)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Chapter list ──────────────────────────────────────────────────────────────
|
||||
|
||||
// TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs verifies that
|
||||
// ScrapeChapterList returns a non-empty slice of chapter references with
|
||||
// valid URLs and numbers parsed from those URLs (not list position).
|
||||
func TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs(t *testing.T) {
|
||||
s := newIntegrationScraper(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
refs, err := s.ScrapeChapterList(ctx, integrationBookURL)
|
||||
if err != nil {
|
||||
t.Fatalf("ScrapeChapterList failed: %v", err)
|
||||
}
|
||||
|
||||
t.Logf("total refs returned: %d", len(refs))
|
||||
|
||||
if len(refs) == 0 {
|
||||
t.Fatal("ScrapeChapterList returned 0 refs")
|
||||
}
|
||||
|
||||
// Every ref must have a non-empty URL pointing at the correct book.
|
||||
for i, ref := range refs {
|
||||
if ref.URL == "" {
|
||||
t.Errorf("refs[%d].URL is empty", i)
|
||||
}
|
||||
if !strings.Contains(ref.URL, integrationBookSlug) {
|
||||
t.Errorf("refs[%d].URL %q does not contain book slug", i, ref.URL)
|
||||
}
|
||||
if ref.Number <= 0 {
|
||||
t.Errorf("refs[%d].Number = %d, want > 0 (URL: %s)", i, ref.Number, ref.URL)
|
||||
}
|
||||
if ref.Title == "" {
|
||||
t.Errorf("refs[%d].Title is empty (URL: %s)", i, ref.URL)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs verifies the
|
||||
// fix for the newest-first ordering bug: each ref's Number must equal the
|
||||
// chapter number embedded in its URL, not its position in the list.
|
||||
func TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs(t *testing.T) {
|
||||
s := newIntegrationScraper(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
refs, err := s.ScrapeChapterList(ctx, integrationBookURL)
|
||||
if err != nil {
|
||||
t.Fatalf("ScrapeChapterList failed: %v", err)
|
||||
}
|
||||
if len(refs) == 0 {
|
||||
t.Fatal("ScrapeChapterList returned 0 refs")
|
||||
}
|
||||
|
||||
mismatches := 0
|
||||
for i, ref := range refs {
|
||||
wantNum := chapterNumberFromURL(ref.URL)
|
||||
if wantNum <= 0 {
|
||||
// URL has no parseable number — skip this entry.
|
||||
continue
|
||||
}
|
||||
if ref.Number != wantNum {
|
||||
t.Errorf("refs[%d]: Number=%d but URL %q implies number=%d (position-based bug?)",
|
||||
i, ref.Number, ref.URL, wantNum)
|
||||
mismatches++
|
||||
if mismatches >= 5 {
|
||||
t.Log("… (further mismatches suppressed)")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Log the first few refs so failures are easy to diagnose.
|
||||
limit := 5
|
||||
if len(refs) < limit {
|
||||
limit = len(refs)
|
||||
}
|
||||
for i := 0; i < limit; i++ {
|
||||
t.Logf("refs[%d]: Number=%d Title=%q URL=%s", i, refs[i].Number, refs[i].Title, refs[i].URL)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Chapters ──────────────────────────────────────────────────────────────────
|
||||
|
||||
// TestIntegration_Novelfire_ScrapeFirst3Chapters scrapes chapters 1, 2, and 3
|
||||
// via ScrapeChapterText and verifies each returns non-empty markdown text.
|
||||
// Chapters are run as sub-tests so a single failure does not abort the others.
|
||||
func TestIntegration_Novelfire_ScrapeFirst3Chapters(t *testing.T) {
|
||||
s := newIntegrationScraper(t)
|
||||
|
||||
chapters := []scraper.ChapterRef{
|
||||
{
|
||||
Number: 1,
|
||||
Title: "Chapter 1",
|
||||
URL: integrationBookURL + "/chapter-1",
|
||||
},
|
||||
{
|
||||
Number: 2,
|
||||
Title: "Chapter 2",
|
||||
URL: integrationBookURL + "/chapter-2",
|
||||
},
|
||||
{
|
||||
Number: 3,
|
||||
Title: "Chapter 3",
|
||||
URL: integrationBookURL + "/chapter-3",
|
||||
},
|
||||
}
|
||||
|
||||
for _, ref := range chapters {
|
||||
ref := ref // capture
|
||||
t.Run(fmt.Sprintf("chapter-%d", ref.Number), func(t *testing.T) {
|
||||
// Sequential: each chapter needs its own generous timeout.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
|
||||
defer cancel()
|
||||
|
||||
ch, err := s.ScrapeChapterText(ctx, ref)
|
||||
if err != nil {
|
||||
t.Fatalf("ScrapeChapterText failed: %v", err)
|
||||
}
|
||||
|
||||
t.Logf("chapter %d: %d bytes of markdown", ref.Number, len(ch.Text))
|
||||
t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))
|
||||
|
||||
// Ref fields must be echoed back unchanged.
|
||||
if ch.Ref.Number != ref.Number {
|
||||
t.Errorf("Ref.Number = %d, want %d", ch.Ref.Number, ref.Number)
|
||||
}
|
||||
if ch.Ref.URL != ref.URL {
|
||||
t.Errorf("Ref.URL = %q, want %q", ch.Ref.URL, ref.URL)
|
||||
}
|
||||
|
||||
// Text must be non-trivially long.
|
||||
if len(ch.Text) < 100 {
|
||||
t.Errorf("Text too short (%d bytes) — likely empty or parsing failed:\n%s",
|
||||
len(ch.Text), ch.Text)
|
||||
}
|
||||
|
||||
// Text must not contain raw HTML tags — NodeToMarkdown should have
|
||||
// stripped them.
|
||||
for _, tag := range []string{"<div", "<span", "<script", "<style"} {
|
||||
if strings.Contains(ch.Text, tag) {
|
||||
t.Errorf("Text contains raw HTML tag %q — markdown conversion may be broken", tag)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList is the end-to-end
|
||||
// variant: it first calls ScrapeChapterList to get the real refs (with
|
||||
// URL-derived numbers), then scrapes chapters 1–3 using those refs.
|
||||
// This catches any discrepancy between the list and the chapter URLs.
|
||||
func TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList(t *testing.T) {
|
||||
s := newIntegrationScraper(t)
|
||||
|
||||
// Step 1: fetch the chapter list.
|
||||
listCtx, listCancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer listCancel()
|
||||
|
||||
refs, err := s.ScrapeChapterList(listCtx, integrationBookURL)
|
||||
if err != nil {
|
||||
t.Fatalf("ScrapeChapterList failed: %v", err)
|
||||
}
|
||||
if len(refs) == 0 {
|
||||
t.Fatal("ScrapeChapterList returned 0 refs")
|
||||
}
|
||||
|
||||
// Build a map number→ref for fast lookup.
|
||||
byNumber := make(map[int]scraper.ChapterRef, len(refs))
|
||||
for _, r := range refs {
|
||||
byNumber[r.Number] = r
|
||||
}
|
||||
|
||||
// Step 2: scrape chapters 1, 2, 3.
|
||||
for _, wantNum := range []int{1, 2, 3} {
|
||||
wantNum := wantNum
|
||||
ref, ok := byNumber[wantNum]
|
||||
if !ok {
|
||||
t.Errorf("chapter %d not found in chapter list (list has %d entries)", wantNum, len(refs))
|
||||
continue
|
||||
}
|
||||
|
||||
t.Run(fmt.Sprintf("chapter-%d", wantNum), func(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
|
||||
defer cancel()
|
||||
|
||||
ch, err := s.ScrapeChapterText(ctx, ref)
|
||||
if err != nil {
|
||||
t.Fatalf("ScrapeChapterText(chapter %d, %s) failed: %v", wantNum, ref.URL, err)
|
||||
}
|
||||
|
||||
t.Logf("chapter %d (%q): %d bytes", wantNum, ref.Title, len(ch.Text))
|
||||
t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))
|
||||
|
||||
if len(ch.Text) < 100 {
|
||||
t.Errorf("chapter %d text too short (%d bytes)", wantNum, len(ch.Text))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
func truncateStr(s string, n int) string {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
}
|
||||
return s[:n] + "…"
|
||||
}
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net/url"
|
||||
"path"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -25,6 +26,7 @@ import (
|
||||
const (
|
||||
baseURL = "https://novelfire.net"
|
||||
cataloguePath = "/genre-all/sort-new/status-all/all-novel"
|
||||
rankingPath = "/ranking"
|
||||
)
|
||||
|
||||
// rejectResourceTypes lists Browserless resource types to block on every request.
|
||||
@@ -40,8 +42,6 @@ var rejectResourceTypes = []string{
|
||||
"media",
|
||||
"other",
|
||||
"ping",
|
||||
"prefetch",
|
||||
"preflight",
|
||||
"signedexchange",
|
||||
"stylesheet",
|
||||
"texttrack",
|
||||
@@ -51,16 +51,22 @@ var rejectResourceTypes = []string{
|
||||
// Scraper is the novelfire.net implementation of scraper.NovelScraper.
|
||||
// It uses the /content strategy by default (rendered HTML via Browserless).
|
||||
type Scraper struct {
|
||||
client browser.BrowserClient
|
||||
log *slog.Logger
|
||||
client browser.BrowserClient
|
||||
urlClient browser.BrowserClient // separate client for URL retrieval (uses browserless content strategy)
|
||||
log *slog.Logger
|
||||
}
|
||||
|
||||
// New returns a new novelfire Scraper.
|
||||
func New(client browser.BrowserClient, log *slog.Logger) *Scraper {
|
||||
// client is used for content fetching, urlClient is used for URL retrieval (chapter list).
|
||||
// If urlClient is nil, client will be used for both.
|
||||
func New(client browser.BrowserClient, log *slog.Logger, urlClient browser.BrowserClient) *Scraper {
|
||||
if log == nil {
|
||||
log = slog.Default()
|
||||
}
|
||||
return &Scraper{client: client, log: log}
|
||||
if urlClient == nil {
|
||||
urlClient = client
|
||||
}
|
||||
return &Scraper{client: client, urlClient: urlClient, log: log}
|
||||
}
|
||||
|
||||
// SourceName implements NovelScraper.
|
||||
@@ -92,15 +98,14 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
"page", page,
|
||||
"payload_url", pageURL,
|
||||
"payload_wait_selector", ".novel-item",
|
||||
"payload_wait_selector_timeout_ms", 10000,
|
||||
"payload_wait_for_timeout_ms", 10000,
|
||||
"payload_wait_selector_timeout_ms", 5000,
|
||||
)
|
||||
|
||||
html, err := s.client.GetContent(ctx, browser.ContentRequest{
|
||||
URL: pageURL,
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".novel-item", Timeout: 10000},
|
||||
WaitForTimeout: 10000,
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".novel-item", Timeout: 5000},
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
GotoOptions: &browser.GotoOptions{Timeout: 60000},
|
||||
})
|
||||
if err != nil {
|
||||
s.log.Debug("catalogue page fetch failed",
|
||||
@@ -173,15 +178,14 @@ func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.B
|
||||
s.log.Debug("metadata fetch starting",
|
||||
"payload_url", bookURL,
|
||||
"payload_wait_selector", ".novel-title",
|
||||
"payload_wait_selector_timeout_ms", 10000,
|
||||
"payload_wait_for_timeout_ms", 10000,
|
||||
"payload_wait_selector_timeout_ms", 5000,
|
||||
)
|
||||
|
||||
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
|
||||
URL: bookURL,
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".novel-title", Timeout: 10000},
|
||||
WaitForTimeout: 10000,
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".novel-title", Timeout: 5000},
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
GotoOptions: &browser.GotoOptions{Timeout: 60000},
|
||||
})
|
||||
if err != nil {
|
||||
s.log.Debug("metadata fetch failed", "url", bookURL, "err", err)
|
||||
@@ -198,8 +202,11 @@ func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.B
|
||||
title := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "h1", Class: "novel-title"})
|
||||
// <span class="author"><a>Author Name</a></span>
|
||||
author := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "author"})
|
||||
// <img class="cover" src="...">
|
||||
cover := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "img", Class: "cover", Attr: "src"})
|
||||
// <figure class="cover"><img src="..."></figure>
|
||||
var cover string
|
||||
if figureCover := htmlutil.FindFirst(root, scraper.Selector{Tag: "figure", Class: "cover"}); figureCover != nil {
|
||||
cover = htmlutil.ExtractFirst(figureCover, scraper.Selector{Tag: "img", Attr: "src"})
|
||||
}
|
||||
// <span class="status">Ongoing</span>
|
||||
status := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "status"})
|
||||
|
||||
@@ -245,32 +252,41 @@ func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.B
|
||||
|
||||
func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scraper.ChapterRef, error) {
|
||||
var refs []scraper.ChapterRef
|
||||
// Chapter list URL: {bookURL}/chapters
|
||||
pageURL := strings.TrimRight(bookURL, "/") + "/chapters"
|
||||
// Chapter list URL: {bookURL}/chapters?page=N
|
||||
baseChapterURL := strings.TrimRight(bookURL, "/") + "/chapters"
|
||||
page := 1
|
||||
|
||||
for pageURL != "" {
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return refs, ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
pageURL := fmt.Sprintf("%s?page=%d", baseChapterURL, page)
|
||||
s.log.Info("scraping chapter list", "page", page, "url", pageURL)
|
||||
|
||||
s.log.Debug("chapter list fetch starting",
|
||||
"page", page,
|
||||
"payload_url", pageURL,
|
||||
"payload_wait_selector", ".chapter-list",
|
||||
"payload_wait_selector_timeout_ms", 10000,
|
||||
"payload_wait_for_timeout_ms", 10000,
|
||||
"payload_wait_selector_timeout_ms", 15000,
|
||||
"payload_wait_timeout_ms", 2000,
|
||||
"strategy", s.urlClient.Strategy(),
|
||||
)
|
||||
|
||||
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
|
||||
URL: pageURL,
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".chapter-list", Timeout: 10000},
|
||||
WaitForTimeout: 10000,
|
||||
raw, err := s.urlClient.GetContent(ctx, browser.ContentRequest{
|
||||
URL: pageURL,
|
||||
// Wait up to 15 s for the chapter list container to appear in the DOM.
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".chapter-list", Timeout: 15000},
|
||||
// After the selector is found, wait an additional 2 s for any
|
||||
// deferred JS rendering (lazy-loaded links, infinite-scroll hydration).
|
||||
WaitForTimeout: 2000,
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
GotoOptions: &browser.GotoOptions{Timeout: 60000},
|
||||
// Do NOT use BestAttempt — we want a complete page or a clear error,
|
||||
// not silently partial HTML that looks like "no more chapters".
|
||||
BestAttempt: false,
|
||||
})
|
||||
if err != nil {
|
||||
s.log.Debug("chapter list fetch failed",
|
||||
@@ -293,10 +309,27 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
|
||||
|
||||
chapterList := htmlutil.FindFirst(root, scraper.Selector{Class: "chapter-list"})
|
||||
if chapterList == nil {
|
||||
// No chapter list container on this page — we've gone past the last page.
|
||||
s.log.Debug("chapter list container not found, stopping pagination", "page", page)
|
||||
break
|
||||
}
|
||||
|
||||
// Each chapter row: <li class="chapter-item"><a href="...">Title</a></li>
|
||||
items := htmlutil.FindAll(chapterList, scraper.Selector{Tag: "li"})
|
||||
|
||||
s.log.Debug("chapter list page parsed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"chapters_on_page", len(items),
|
||||
"total_refs_so_far", len(refs),
|
||||
)
|
||||
|
||||
// Zero items on this page means we've gone past the last page.
|
||||
if len(items) == 0 {
|
||||
s.log.Debug("no chapters on page, stopping pagination", "page", page)
|
||||
break
|
||||
}
|
||||
|
||||
for _, item := range items {
|
||||
linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a"})
|
||||
if linkNode == nil {
|
||||
@@ -308,7 +341,15 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
|
||||
continue
|
||||
}
|
||||
chURL := resolveURL(baseURL, href)
|
||||
num := len(refs) + 1
|
||||
num := chapterNumberFromURL(chURL)
|
||||
if num <= 0 {
|
||||
// Fall back to position if the URL has no parseable number.
|
||||
num = len(refs) + 1
|
||||
s.log.Warn("chapter number not parseable from URL, falling back to position",
|
||||
"url", chURL,
|
||||
"position", num,
|
||||
)
|
||||
}
|
||||
refs = append(refs, scraper.ChapterRef{
|
||||
Number: num,
|
||||
Title: strings.TrimSpace(chTitle),
|
||||
@@ -316,30 +357,134 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
|
||||
})
|
||||
}
|
||||
|
||||
s.log.Debug("chapter list page parsed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"chapters_on_page", len(items),
|
||||
"total_refs_so_far", len(refs),
|
||||
)
|
||||
|
||||
// Next page: <a class="next" href="...">
|
||||
nextHref := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "a", Class: "next", Attr: "href"})
|
||||
if nextHref == "" {
|
||||
break
|
||||
}
|
||||
pageURL = resolveURL(baseURL, nextHref)
|
||||
page++
|
||||
}
|
||||
|
||||
return refs, nil
|
||||
}
|
||||
|
||||
// ─── RankingProvider ───────────────────────────────────────────────────────────
|
||||
|
||||
func (s *Scraper) ScrapeRanking(ctx context.Context) (<-chan scraper.BookMeta, <-chan error) {
|
||||
entries := make(chan scraper.BookMeta, 64)
|
||||
errs := make(chan error, 16)
|
||||
|
||||
go func() {
|
||||
defer close(entries)
|
||||
defer close(errs)
|
||||
|
||||
pageURL := baseURL + rankingPath
|
||||
rank := 1
|
||||
|
||||
for pageURL != "" {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
s.log.Info("scraping ranking page", "url", pageURL)
|
||||
|
||||
// Use WaitFor only for browser-based strategies
|
||||
var raw string
|
||||
var err error
|
||||
if s.client.Strategy() == browser.StrategyDirect {
|
||||
raw, err = s.client.GetContent(ctx, browser.ContentRequest{
|
||||
URL: pageURL,
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
})
|
||||
} else {
|
||||
raw, err = s.client.GetContent(ctx, browser.ContentRequest{
|
||||
URL: pageURL,
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".rank-novels", Timeout: 30000},
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
GotoOptions: &browser.GotoOptions{Timeout: 60000},
|
||||
BestAttempt: true,
|
||||
})
|
||||
}
|
||||
if err != nil {
|
||||
s.log.Debug("ranking page fetch failed", "url", pageURL, "err", err)
|
||||
errs <- fmt.Errorf("ranking page: %w", err)
|
||||
return
|
||||
}
|
||||
|
||||
root, err := htmlutil.ParseHTML(raw)
|
||||
if err != nil {
|
||||
errs <- fmt.Errorf("ranking page parse: %w", err)
|
||||
return
|
||||
}
|
||||
|
||||
rankList := htmlutil.FindFirst(root, scraper.Selector{Class: "rank-novels"})
|
||||
if rankList == nil {
|
||||
break
|
||||
}
|
||||
|
||||
items := htmlutil.FindAll(rankList, scraper.Selector{Tag: "li", Class: "novel-item"})
|
||||
for _, item := range items {
|
||||
// Cover: <figure class="cover"><a href="/book/slug"><img data-src="..."></a></figure>
|
||||
var cover string
|
||||
if fig := htmlutil.FindFirst(item, scraper.Selector{Tag: "figure", Class: "cover"}); fig != nil {
|
||||
cover = htmlutil.ExtractFirst(fig, scraper.Selector{Tag: "img", Attr: "data-src"})
|
||||
if cover != "" {
|
||||
cover = baseURL + cover
|
||||
}
|
||||
}
|
||||
|
||||
// Title and URL: <h2 class="title"><a href="/book/slug">Title</a></h2>
|
||||
titleNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "h2", Class: "title"})
|
||||
var title, bookURL string
|
||||
if titleNode != nil {
|
||||
linkNode := htmlutil.FindFirst(titleNode, scraper.Selector{Tag: "a"})
|
||||
if linkNode != nil {
|
||||
title = htmlutil.ExtractText(linkNode, scraper.Selector{})
|
||||
href := htmlutil.ExtractText(linkNode, scraper.Selector{Attr: "href"})
|
||||
bookURL = resolveURL(baseURL, href)
|
||||
}
|
||||
}
|
||||
|
||||
// Status: <span class="status"> Ongoing/Completed </span>
|
||||
status := htmlutil.ExtractFirst(item, scraper.Selector{Tag: "span", Class: "status"})
|
||||
|
||||
// Genres: <div class="categories"><div class="scroll"><span>Genre1</span><span>Genre2</span>...</div></div>
|
||||
var genres []string
|
||||
categoriesNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "div", Class: "categories"})
|
||||
if categoriesNode != nil {
|
||||
genres = htmlutil.ExtractAll(categoriesNode, scraper.Selector{Tag: "span", Multiple: true})
|
||||
}
|
||||
|
||||
slug := slugFromURL(bookURL)
|
||||
|
||||
meta := scraper.BookMeta{
|
||||
Slug: slug,
|
||||
Title: title,
|
||||
Cover: cover,
|
||||
Status: strings.TrimSpace(status),
|
||||
Genres: genres,
|
||||
SourceURL: bookURL,
|
||||
Ranking: rank,
|
||||
}
|
||||
rank++
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case entries <- meta:
|
||||
}
|
||||
}
|
||||
|
||||
// Next page - ranking pages use different pagination, just get first page for now
|
||||
break
|
||||
}
|
||||
}()
|
||||
|
||||
return entries, errs
|
||||
}
|
||||
|
||||
// ─── ChapterTextProvider ─────────────────────────────────────────────────────
|
||||
|
||||
// retryGetContent calls client.GetContent up to maxAttempts times, backing off
|
||||
// exponentially between retries. Only errors that look like transient Browserless
|
||||
// 5xx responses (navigation timeouts, etc.) are retried; context cancellation and
|
||||
// failures (timeouts, 5xx responses) are retried; context cancellation and
|
||||
// permanent errors are returned immediately.
|
||||
func retryGetContent(
|
||||
ctx context.Context,
|
||||
@@ -363,11 +508,6 @@ func retryGetContent(
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Only retry on Browserless 5xx responses.
|
||||
if !strings.Contains(err.Error(), "unexpected status 5") {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if attempt < maxAttempts {
|
||||
log.Warn("chapter fetch failed, retrying",
|
||||
"url", req.URL,
|
||||
@@ -393,15 +533,15 @@ func (s *Scraper) ScrapeChapterText(ctx context.Context, ref scraper.ChapterRef)
|
||||
"title", ref.Title,
|
||||
"payload_url", ref.URL,
|
||||
"payload_wait_selector", "#content",
|
||||
"payload_wait_selector_timeout_ms", 75000,
|
||||
"payload_wait_for_timeout_ms", 75000,
|
||||
"payload_wait_selector_timeout_ms", 5000,
|
||||
)
|
||||
|
||||
raw, err := retryGetContent(ctx, s.log, s.client, browser.ContentRequest{
|
||||
URL: ref.URL,
|
||||
WaitFor: &browser.WaitForSelector{Selector: "#content", Timeout: 75000},
|
||||
WaitForTimeout: 75000,
|
||||
WaitFor: &browser.WaitForSelector{Selector: "#content", Timeout: 5000},
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
GotoOptions: &browser.GotoOptions{Timeout: 60000},
|
||||
BestAttempt: true,
|
||||
}, 9, 6*time.Second)
|
||||
if err != nil {
|
||||
s.log.Debug("chapter text fetch failed",
|
||||
@@ -411,6 +551,18 @@ func (s *Scraper) ScrapeChapterText(ctx context.Context, ref scraper.ChapterRef)
|
||||
)
|
||||
return scraper.Chapter{}, fmt.Errorf("chapter %d fetch: %w", ref.Number, err)
|
||||
}
|
||||
if len(raw) > 0 {
|
||||
preview := raw
|
||||
if len(preview) > 500 {
|
||||
preview = preview[:500]
|
||||
}
|
||||
s.log.Debug("chapter text fetch partial content",
|
||||
"chapter", ref.Number,
|
||||
"url", ref.URL,
|
||||
"response_bytes", len(raw),
|
||||
"preview", preview,
|
||||
)
|
||||
}
|
||||
s.log.Debug("chapter text fetch completed",
|
||||
"chapter", ref.Number,
|
||||
"url", ref.URL,
|
||||
@@ -484,3 +636,30 @@ func parseChapterCount(s string) int {
|
||||
n, _ := strconv.Atoi(fields[0])
|
||||
return n
|
||||
}
|
||||
|
||||
// chapterNumberFromURL extracts the chapter number from a novelfire chapter URL.
|
||||
//
|
||||
// URL pattern: https://novelfire.net/book/{book-slug}/chapter-{N}
|
||||
// The last path segment is expected to be "chapter-{N}" or "{N}".
|
||||
// Returns 0 if no number can be parsed.
|
||||
func chapterNumberFromURL(chapterURL string) int {
|
||||
u, err := url.Parse(chapterURL)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
seg := path.Base(u.Path) // e.g. "chapter-42" or "42"
|
||||
// Strip a "chapter-" prefix if present.
|
||||
seg = strings.TrimPrefix(seg, "chapter-")
|
||||
// Also handle "chap-", "ch-" variants used by some sites.
|
||||
seg = strings.TrimPrefix(seg, "chap-")
|
||||
seg = strings.TrimPrefix(seg, "ch-")
|
||||
// Take only the leading digits (handles slugs like "42-title-text").
|
||||
digits := strings.FieldsFunc(seg, func(r rune) bool {
|
||||
return r < '0' || r > '9'
|
||||
})
|
||||
if len(digits) == 0 {
|
||||
return 0
|
||||
}
|
||||
n, _ := strconv.Atoi(digits[0])
|
||||
return n
|
||||
}
|
||||
|
||||
217
scraper/internal/novelfire/scraper_test.go
Normal file
217
scraper/internal/novelfire/scraper_test.go
Normal file
@@ -0,0 +1,217 @@
|
||||
package novelfire
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/libnovel/scraper/internal/browser"
|
||||
"github.com/libnovel/scraper/internal/scraper"
|
||||
)
|
||||
|
||||
// ── stub browser client ───────────────────────────────────────────────────────
|
||||
|
||||
// stubClient is a BrowserClient that returns a fixed HTML string for every
|
||||
// GetContent call. ScrapePage and CDPSession are not used by these tests.
|
||||
type stubClient struct {
|
||||
html string
|
||||
}
|
||||
|
||||
func (s *stubClient) Strategy() browser.Strategy { return browser.StrategyContent }
|
||||
|
||||
func (s *stubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) {
|
||||
return s.html, nil
|
||||
}
|
||||
|
||||
func (s *stubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) {
|
||||
return browser.ScrapeResponse{}, nil
|
||||
}
|
||||
|
||||
func (s *stubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// pagedStubClient returns a different HTML response for each successive call.
|
||||
// Once all pages are exhausted it returns an empty page (no chapter-list),
|
||||
// simulating the paginated chapter-list endpoint terminating correctly.
|
||||
type pagedStubClient struct {
|
||||
pages []string
|
||||
call int
|
||||
}
|
||||
|
||||
func (c *pagedStubClient) Strategy() browser.Strategy { return browser.StrategyContent }
|
||||
|
||||
func (c *pagedStubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) {
|
||||
if c.call < len(c.pages) {
|
||||
html := c.pages[c.call]
|
||||
c.call++
|
||||
return html, nil
|
||||
}
|
||||
// Past the last page — return a page with no chapter-list to stop pagination.
|
||||
return `<!DOCTYPE html><html><body><div class="no-content"></div></body></html>`, nil
|
||||
}
|
||||
|
||||
func (c *pagedStubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) {
|
||||
return browser.ScrapeResponse{}, nil
|
||||
}
|
||||
|
||||
func (c *pagedStubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
func newScraper(html string) *Scraper {
|
||||
return New(&stubClient{html: html}, nil, &stubClient{html: html})
|
||||
}
|
||||
|
||||
func newPagedScraper(pages ...string) *Scraper {
|
||||
urlClient := &pagedStubClient{pages: pages}
|
||||
return New(&stubClient{}, nil, urlClient)
|
||||
}
|
||||
|
||||
// ── ScrapeChapterText ─────────────────────────────────────────────────────────
|
||||
|
||||
func TestScrapeChapterText_ExtractsInnerText(t *testing.T) {
|
||||
html := `<!DOCTYPE html><html><body>
|
||||
<div id="content">
|
||||
<p>It was a dark and stormy night.</p>
|
||||
<p>The hero stepped forward.</p>
|
||||
</div>
|
||||
</body></html>`
|
||||
|
||||
s := newScraper(html)
|
||||
ref := scraper.ChapterRef{Number: 1, Title: "Chapter 1", URL: "https://novelfire.net/book/test-novel/chapter-1"}
|
||||
|
||||
ch, err := s.ScrapeChapterText(context.Background(), ref)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if ch.Ref.Number != 1 {
|
||||
t.Errorf("expected chapter number 1, got %d", ch.Ref.Number)
|
||||
}
|
||||
if !strings.Contains(ch.Text, "dark and stormy") {
|
||||
t.Errorf("expected chapter text to contain 'dark and stormy', got: %q", ch.Text)
|
||||
}
|
||||
if !strings.Contains(ch.Text, "hero stepped forward") {
|
||||
t.Errorf("expected chapter text to contain 'hero stepped forward', got: %q", ch.Text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScrapeChapterText_MissingContainer(t *testing.T) {
|
||||
html := `<!DOCTYPE html><html><body><div class="other">nothing here</div></body></html>`
|
||||
|
||||
s := newScraper(html)
|
||||
ref := scraper.ChapterRef{Number: 2, Title: "Chapter 2", URL: "https://novelfire.net/book/test-novel/chapter-2"}
|
||||
|
||||
_, err := s.ScrapeChapterText(context.Background(), ref)
|
||||
if err == nil {
|
||||
t.Fatal("expected an error when #content container is missing, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
// ── chapterNumberFromURL ──────────────────────────────────────────────────────
|
||||
|
||||
func TestChapterNumberFromURL(t *testing.T) {
|
||||
cases := []struct {
|
||||
url string
|
||||
want int
|
||||
}{
|
||||
// Standard novelfire pattern.
|
||||
{"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1", 1},
|
||||
{"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-26", 26},
|
||||
{"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-58", 58},
|
||||
// Large chapter numbers.
|
||||
{"https://novelfire.net/book/some-novel/chapter-1000", 1000},
|
||||
// Path segment with trailing slash.
|
||||
{"https://novelfire.net/book/some-novel/chapter-5/", 5},
|
||||
// Slug with title appended after the number (hypothetical future format).
|
||||
{"https://novelfire.net/book/some-novel/chapter-42-the-battle", 42},
|
||||
// Unparseable — should return 0 so the caller can fall back.
|
||||
{"https://novelfire.net/book/some-novel/prologue", 0},
|
||||
{"https://novelfire.net/book/some-novel/", 0},
|
||||
{"not-a-url", 0},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
got := chapterNumberFromURL(tc.url)
|
||||
if got != tc.want {
|
||||
t.Errorf("chapterNumberFromURL(%q) = %d, want %d", tc.url, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── ScrapeChapterList (position vs URL numbering) ─────────────────────────────
|
||||
|
||||
// TestScrapeChapterList_NumbersFromURL verifies that when the chapter list HTML
|
||||
// is served newest-first (as novelfire.net does), chapter numbers are still
|
||||
// assigned from the URL — not from list position — so that a re-run correctly
|
||||
// identifies which chapters are already on disk.
|
||||
func TestScrapeChapterList_NumbersFromURL(t *testing.T) {
|
||||
// Simulate a newest-first chapter list with 5 chapters on a single page.
|
||||
// Positions 1..5 correspond to chapters 5,4,3,2,1 in the site HTML.
|
||||
page1 := `<!DOCTYPE html><html><body>
|
||||
<ul class="chapter-list">
|
||||
<li class="chapter-item"><a href="/book/test/chapter-5">Chapter 5</a></li>
|
||||
<li class="chapter-item"><a href="/book/test/chapter-4">Chapter 4</a></li>
|
||||
<li class="chapter-item"><a href="/book/test/chapter-3">Chapter 3</a></li>
|
||||
<li class="chapter-item"><a href="/book/test/chapter-2">Chapter 2</a></li>
|
||||
<li class="chapter-item"><a href="/book/test/chapter-1">Chapter 1</a></li>
|
||||
</ul>
|
||||
</body></html>`
|
||||
|
||||
s := newPagedScraper(page1)
|
||||
refs, err := s.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(refs) != 5 {
|
||||
t.Fatalf("expected 5 refs, got %d", len(refs))
|
||||
}
|
||||
|
||||
// With position-based numbering (the old bug), refs[0].Number would be 1
|
||||
// even though its URL is /chapter-5. With URL-based numbering it must be 5.
|
||||
wantNumbers := []int{5, 4, 3, 2, 1}
|
||||
for i, ref := range refs {
|
||||
if ref.Number != wantNumbers[i] {
|
||||
t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, wantNumbers[i], ref.URL)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestScrapeChapterList_Pagination verifies that the scraper correctly follows
|
||||
// ?page=N pagination and stops when a page returns no chapter items.
|
||||
func TestScrapeChapterList_Pagination(t *testing.T) {
|
||||
page1 := `<!DOCTYPE html><html><body>
|
||||
<ul class="chapter-list">
|
||||
<li class="chapter-item"><a href="/book/test/chapter-3">Chapter 3</a></li>
|
||||
<li class="chapter-item"><a href="/book/test/chapter-2">Chapter 2</a></li>
|
||||
<li class="chapter-item"><a href="/book/test/chapter-1">Chapter 1</a></li>
|
||||
</ul>
|
||||
</body></html>`
|
||||
|
||||
page2 := `<!DOCTYPE html><html><body>
|
||||
<ul class="chapter-list">
|
||||
<li class="chapter-item"><a href="/book/test/chapter-6">Chapter 6</a></li>
|
||||
<li class="chapter-item"><a href="/book/test/chapter-5">Chapter 5</a></li>
|
||||
<li class="chapter-item"><a href="/book/test/chapter-4">Chapter 4</a></li>
|
||||
</ul>
|
||||
</body></html>`
|
||||
|
||||
// page3 is omitted — pagedStubClient will return empty page to stop pagination.
|
||||
s := newPagedScraper(page1, page2)
|
||||
refs, err := s.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(refs) != 6 {
|
||||
t.Fatalf("expected 6 refs (3 per page × 2 pages), got %d", len(refs))
|
||||
}
|
||||
|
||||
wantNumbers := []int{3, 2, 1, 6, 5, 4}
|
||||
for i, ref := range refs {
|
||||
if ref.Number != wantNumbers[i] {
|
||||
t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, wantNumbers[i], ref.URL)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -27,6 +27,8 @@ type BookMeta struct {
|
||||
TotalChapters int `yaml:"total_chapters,omitempty"`
|
||||
// SourceURL is the canonical URL of the book's landing page.
|
||||
SourceURL string `yaml:"source_url"`
|
||||
// Ranking is the rank number from ranking pages.
|
||||
Ranking int `yaml:"ranking,omitempty"`
|
||||
}
|
||||
|
||||
// CatalogueEntry is a lightweight reference returned by CatalogueProvider.
|
||||
@@ -108,6 +110,13 @@ type ChapterTextProvider interface {
|
||||
ScrapeChapterText(ctx context.Context, ref ChapterRef) (Chapter, error)
|
||||
}
|
||||
|
||||
// RankingProvider can enumerate novels from a ranking page.
type RankingProvider interface {
	// ScrapeRanking pages through the ranking list, sending BookMeta values
	// (with basic info like title, cover, genres, status, sourceURL) to the
	// returned channel. The second channel carries scrape errors.
	// NOTE(review): channel close/termination semantics (who closes, whether
	// an error ends the stream) are defined by the concrete scraper, which is
	// not visible here — confirm against the implementation.
	ScrapeRanking(ctx context.Context) (<-chan BookMeta, <-chan error)
}
|
||||
|
||||
// NovelScraper is the full interface that a concrete novel source must implement.
|
||||
// It composes all four provider interfaces.
|
||||
type NovelScraper interface {
|
||||
@@ -115,6 +124,7 @@ type NovelScraper interface {
|
||||
MetadataProvider
|
||||
ChapterListProvider
|
||||
ChapterTextProvider
|
||||
RankingProvider
|
||||
|
||||
// SourceName returns the human-readable name of this scraper, e.g. "novelfire.net".
|
||||
SourceName() string
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -23,23 +24,27 @@ import (
|
||||
|
||||
// Server wraps an HTTP mux with the scraping endpoints.
|
||||
type Server struct {
|
||||
addr string
|
||||
oCfg orchestrator.Config
|
||||
novel scraper.NovelScraper
|
||||
log *slog.Logger
|
||||
writer *writer.Writer
|
||||
mu sync.Mutex
|
||||
running bool
|
||||
addr string
|
||||
oCfg orchestrator.Config
|
||||
novel scraper.NovelScraper
|
||||
log *slog.Logger
|
||||
writer *writer.Writer
|
||||
mu sync.Mutex
|
||||
running bool
|
||||
kokoroURL string // Kokoro-FastAPI base URL, e.g. http://kokoro:8880
|
||||
kokoroVoice string // default voice, e.g. af_bella
|
||||
}
|
||||
|
||||
// New creates a new Server.
|
||||
func New(addr string, oCfg orchestrator.Config, novel scraper.NovelScraper, log *slog.Logger) *Server {
|
||||
func New(addr string, oCfg orchestrator.Config, novel scraper.NovelScraper, log *slog.Logger, kokoroURL, kokoroVoice string) *Server {
|
||||
return &Server{
|
||||
addr: addr,
|
||||
oCfg: oCfg,
|
||||
novel: novel,
|
||||
log: log,
|
||||
writer: writer.New(oCfg.StaticRoot),
|
||||
addr: addr,
|
||||
oCfg: oCfg,
|
||||
novel: novel,
|
||||
log: log,
|
||||
writer: writer.New(oCfg.StaticRoot),
|
||||
kokoroURL: kokoroURL,
|
||||
kokoroVoice: kokoroVoice,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,16 +57,21 @@ func (s *Server) ListenAndServe(ctx context.Context) error {
|
||||
mux.HandleFunc("POST /scrape/book", s.handleScrapeBook)
|
||||
// UI routes
|
||||
mux.HandleFunc("GET /", s.handleHome)
|
||||
mux.HandleFunc("GET /ranking", s.handleRanking)
|
||||
mux.HandleFunc("POST /ranking/refresh", s.handleRankingRefresh)
|
||||
mux.HandleFunc("GET /ranking/view", s.handleRankingView)
|
||||
mux.HandleFunc("GET /books/{slug}", s.handleBook)
|
||||
mux.HandleFunc("GET /books/{slug}/chapters/{n}", s.handleChapter)
|
||||
mux.HandleFunc("POST /ui/scrape/book", s.handleUIScrapeBook)
|
||||
mux.HandleFunc("GET /ui/scrape/status", s.handleUIScrapeStatus)
|
||||
// Plain-text chapter content for browser-side TTS
|
||||
mux.HandleFunc("GET /ui/chapter-text/{slug}/{n}", s.handleChapterText)
|
||||
|
||||
srv := &http.Server{
|
||||
Addr: s.addr,
|
||||
Handler: mux,
|
||||
ReadTimeout: 15 * time.Second,
|
||||
WriteTimeout: 15 * time.Second,
|
||||
WriteTimeout: 60 * time.Second,
|
||||
IdleTimeout: 60 * time.Second,
|
||||
}
|
||||
|
||||
@@ -85,6 +95,25 @@ func (s *Server) handleHealth(w http.ResponseWriter, _ *http.Request) {
|
||||
_ = json.NewEncoder(w).Encode(map[string]string{"status": "ok"})
|
||||
}
|
||||
|
||||
// handleChapterText returns the plain text of a chapter (markdown stripped)
|
||||
// for browser-side TTS. The browser POSTs this directly to Kokoro-FastAPI.
|
||||
func (s *Server) handleChapterText(w http.ResponseWriter, r *http.Request) {
|
||||
slug := r.PathValue("slug")
|
||||
n, err := strconv.Atoi(r.PathValue("n"))
|
||||
if err != nil || n < 1 {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
raw, err := s.writer.ReadChapter(slug, n)
|
||||
if err != nil {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
fmt.Fprint(w, stripMarkdown(raw))
|
||||
}
|
||||
|
||||
func (s *Server) handleScrapeCatalogue(w http.ResponseWriter, r *http.Request) {
|
||||
cfg := s.oCfg
|
||||
cfg.SingleBookURL = "" // full catalogue
|
||||
|
||||
1331
scraper/internal/server/ui.go
Normal file
1331
scraper/internal/server/ui.go
Normal file
File diff suppressed because it is too large
Load Diff
@@ -19,6 +19,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -144,10 +145,32 @@ func (w *Writer) ListBooks() ([]scraper.BookMeta, error) {
|
||||
return books, nil
|
||||
}
|
||||
|
||||
// LocalSlugs returns the set of book slugs that have a metadata.yaml on disk.
|
||||
// It is cheaper than ListBooks because it only checks for file existence rather
|
||||
// than fully parsing every YAML file.
|
||||
func (w *Writer) LocalSlugs() map[string]bool {
|
||||
entries, err := os.ReadDir(w.root)
|
||||
if err != nil {
|
||||
return map[string]bool{}
|
||||
}
|
||||
slugs := make(map[string]bool, len(entries))
|
||||
for _, e := range entries {
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
metaPath := filepath.Join(w.root, e.Name(), "metadata.yaml")
|
||||
if _, err := os.Stat(metaPath); err == nil {
|
||||
slugs[e.Name()] = true
|
||||
}
|
||||
}
|
||||
return slugs
|
||||
}
|
||||
|
||||
// ChapterInfo is a lightweight chapter descriptor derived from on-disk files.
|
||||
type ChapterInfo struct {
|
||||
Number int
|
||||
Title string // first line of the markdown file (without the leading "# ")
|
||||
Title string // chapter name, cleaned of number prefix and trailing date
|
||||
Date string // relative date scraped alongside the title, e.g. "1 year ago"
|
||||
}
|
||||
|
||||
// ListChapters returns all chapters on disk for slug, sorted by number.
|
||||
@@ -171,8 +194,8 @@ func (w *Writer) ListChapters(slug string) ([]ChapterInfo, error) {
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
title := chapterTitle(f, n)
|
||||
chapters = append(chapters, ChapterInfo{Number: n, Title: title})
|
||||
title, date := chapterTitle(f, n)
|
||||
chapters = append(chapters, ChapterInfo{Number: n, Title: title, Date: date})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -184,19 +207,55 @@ func (w *Writer) ListChapters(slug string) ([]ChapterInfo, error) {
|
||||
|
||||
// chapterTitle reads the first non-empty line of a markdown file and strips
|
||||
// the leading "# " heading marker. Falls back to "Chapter N".
|
||||
func chapterTitle(path string, n int) string {
|
||||
func chapterTitle(path string, n int) (title, date string) {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return fmt.Sprintf("Chapter %d", n)
|
||||
return fmt.Sprintf("Chapter %d", n), ""
|
||||
}
|
||||
for _, line := range strings.SplitN(string(data), "\n", 10) {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
return strings.TrimPrefix(line, "# ")
|
||||
line = strings.TrimPrefix(line, "# ")
|
||||
return splitChapterTitle(line)
|
||||
}
|
||||
return fmt.Sprintf("Chapter %d", n)
|
||||
return fmt.Sprintf("Chapter %d", n), ""
|
||||
}
|
||||
|
||||
// splitChapterTitle separates the human-readable chapter name from the
|
||||
// trailing relative-date string that novelfire.net appends to the heading.
|
||||
// Examples of raw heading text (after stripping "# "):
|
||||
//
|
||||
// "1 Chapter 1 - 1: The Academy's Weakest1 year ago"
|
||||
// "2 Chapter 2 - Enter the Storm3 months ago"
|
||||
//
|
||||
// The pattern is: optional leading number+whitespace, then the real title,
|
||||
// then a date that matches /\d+\s+(second|minute|hour|day|week|month|year)s?\s+ago$/
|
||||
func splitChapterTitle(raw string) (title, date string) {
|
||||
// Strip a leading chapter-number index that novelfire sometimes prepends.
|
||||
// It looks like "1 " or "12 " at the very start.
|
||||
raw = strings.TrimSpace(raw)
|
||||
if idx := strings.IndexFunc(raw, func(r rune) bool { return r == ' ' || r == '\t' }); idx > 0 {
|
||||
prefix := raw[:idx]
|
||||
allDigit := true
|
||||
for _, c := range prefix {
|
||||
if c < '0' || c > '9' {
|
||||
allDigit = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if allDigit {
|
||||
raw = strings.TrimSpace(raw[idx:])
|
||||
}
|
||||
}
|
||||
|
||||
// Match a trailing relative date: "<n> <unit>[s] ago"
|
||||
dateRe := regexp.MustCompile(`\s*(\d+\s+(?:second|minute|hour|day|week|month|year)s?\s+ago)\s*$`)
|
||||
if m := dateRe.FindStringSubmatchIndex(raw); m != nil {
|
||||
return strings.TrimSpace(raw[:m[0]]), strings.TrimSpace(raw[m[2]:m[3]])
|
||||
}
|
||||
return raw, ""
|
||||
}
|
||||
|
||||
// ReadChapter returns the raw markdown content for chapter number n of slug.
|
||||
@@ -211,6 +270,139 @@ func (w *Writer) ReadChapter(slug string, n int) (string, error) {
|
||||
return string(data), nil
|
||||
}
|
||||
|
||||
// ─── Ranking ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// RankingItem represents a single entry in the ranking.
|
||||
type RankingItem struct {
|
||||
Rank int `yaml:"rank"`
|
||||
Slug string `yaml:"slug"`
|
||||
Title string `yaml:"title"`
|
||||
Author string `yaml:"author,omitempty"`
|
||||
Cover string `yaml:"cover,omitempty"`
|
||||
Status string `yaml:"status,omitempty"`
|
||||
Genres []string `yaml:"genres,omitempty"`
|
||||
SourceURL string `yaml:"source_url,omitempty"`
|
||||
}
|
||||
|
||||
// WriteRanking saves the ranking items as markdown to static/ranking.md.
|
||||
func (w *Writer) WriteRanking(items []RankingItem) error {
|
||||
path := filepath.Clean(w.rankingPath())
|
||||
dir := filepath.Dir(path)
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
return fmt.Errorf("writer: mkdir %s: %w", dir, err)
|
||||
}
|
||||
|
||||
var sb strings.Builder
|
||||
sb.WriteString("# Novel Rankings\n\n")
|
||||
sb.WriteString("| Rank | Title | Cover | Status | Genres | URL |\n")
|
||||
sb.WriteString("|------|-------|-------|--------|--------|-----|\n")
|
||||
for _, item := range items {
|
||||
genres := strings.Join(item.Genres, ", ")
|
||||
if genres == "" {
|
||||
genres = "-"
|
||||
}
|
||||
sb.WriteString(fmt.Sprintf("| %d | %s | %s | %s | %s | %s |\n",
|
||||
item.Rank, item.Title, item.Cover, item.Status, genres, item.SourceURL))
|
||||
}
|
||||
|
||||
if err := os.WriteFile(path, []byte(sb.String()), 0o644); err != nil {
|
||||
return fmt.Errorf("writer: write ranking %s: %w", path, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ReadRanking reads the ranking.md file if it exists.
|
||||
func (w *Writer) ReadRanking() (string, error) {
|
||||
path := w.rankingPath()
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return "", nil
|
||||
}
|
||||
return "", fmt.Errorf("writer: read ranking: %w", err)
|
||||
}
|
||||
return string(data), nil
|
||||
}
|
||||
|
||||
// ReadRankingItems parses ranking.md back into a slice of RankingItem.
// Returns nil slice (not an error) when the file does not exist yet.
//
// NOTE(review): parsing is positional on '|' separators, so a Title (or any
// field) containing a literal '|' shifts every later column — confirm titles
// are sanitized before WriteRanking. Author is never round-tripped because
// WriteRanking does not persist it.
func (w *Writer) ReadRankingItems() ([]RankingItem, error) {
	markdown, err := w.ReadRanking()
	if err != nil || markdown == "" {
		return nil, err
	}

	var items []RankingItem
	for _, line := range strings.Split(markdown, "\n") {
		// Only process data rows: start and end with '|', not header/separator rows.
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, "|") || !strings.HasSuffix(line, "|") {
			continue
		}
		// Strip leading/trailing '|' and split on '|'.
		inner := strings.TrimPrefix(strings.TrimSuffix(line, "|"), "|")
		cols := strings.Split(inner, "|")
		if len(cols) < 6 {
			continue
		}
		for i, c := range cols {
			cols[i] = strings.TrimSpace(c)
		}
		// Skip header row and separator row.
		if cols[0] == "Rank" || strings.HasPrefix(cols[0], "---") {
			continue
		}
		// A non-numeric rank also filters any stray non-data row.
		rank, err := strconv.Atoi(cols[0])
		if err != nil {
			continue
		}
		// Column order must match WriteRanking: Rank|Title|Cover|Status|Genres|URL.
		title := cols[1]
		cover := cols[2]
		status := cols[3]
		genresStr := cols[4]
		sourceURL := cols[5]

		// "-" is the WriteRanking placeholder for "no genres".
		var genres []string
		if genresStr != "-" && genresStr != "" {
			for _, g := range strings.Split(genresStr, ",") {
				g = strings.TrimSpace(g)
				if g != "" {
					genres = append(genres, g)
				}
			}
		}

		// Derive slug from source URL (last path segment).
		slug := ""
		if sourceURL != "" {
			parts := strings.Split(strings.TrimRight(sourceURL, "/"), "/")
			if len(parts) > 0 {
				slug = parts[len(parts)-1]
			}
		}

		items = append(items, RankingItem{
			Rank:      rank,
			Slug:      slug,
			Title:     title,
			Cover:     cover,
			Status:    status,
			Genres:    genres,
			SourceURL: sourceURL,
		})
	}
	return items, nil
}
|
||||
|
||||
// RankingFileInfo returns os.FileInfo for the ranking.md file, if it exists.
// The error is os.Stat's; detect a missing file with os.IsNotExist / errors.Is.
func (w *Writer) RankingFileInfo() (os.FileInfo, error) {
	return os.Stat(w.rankingPath())
}
|
||||
|
||||
// rankingPath returns the on-disk location of the shared ranking file:
// <root>/ranking.md.
func (w *Writer) rankingPath() string {
	return filepath.Join(w.root, "ranking.md")
}
|
||||
|
||||
// bookDir returns the root directory for a book slug.
|
||||
func (w *Writer) bookDir(slug string) string {
|
||||
return filepath.Join(w.root, slug)
|
||||
|
||||
Reference in New Issue
Block a user