Files
libnovel/scraper/internal/novelfire/integration_test.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Add cover extraction fix (figure.cover > img rather than bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

345 lines
11 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build integration
// Integration tests for the novelfire.net Scraper against a live Browserless instance.
//
// These tests exercise the full scraping stack — Browserless → raw HTML →
// novelfire HTML parser — for the book:
//
// https://novelfire.net/book/a-dragon-against-the-whole-world
//
// They are gated behind the "integration" build tag so they never run in a
// normal `go test ./...` pass.
//
// Run with:
//
// BROWSERLESS_URL=http://localhost:3000 \
// BROWSERLESS_TOKEN=your-token \ # omit if auth is disabled
// go test -v -tags integration -timeout 600s \
// github.com/libnovel/scraper/internal/novelfire
package novelfire
import (
	"context"
	"fmt"
	"os"
	"strings"
	"testing"
	"time"
	"unicode/utf8"

	"github.com/libnovel/scraper/internal/browser"
	"github.com/libnovel/scraper/internal/scraper"
)
// Fixture: a real book on novelfire.net that every integration test in this
// file scrapes against.
const (
	// integrationBookURL is the live book page passed to the scraper.
	integrationBookURL = "https://novelfire.net/book/a-dragon-against-the-whole-world"
	// integrationBookSlug is the slug the scraper is expected to parse from the URL.
	integrationBookSlug = "a-dragon-against-the-whole-world"
	// integrationBookTitle is the expected title; compared case-insensitively,
	// and a mismatch is only logged (the site may reword it).
	integrationBookTitle = "A Dragon against the Whole World"
)
// newIntegrationScraper builds a novelfire Scraper wired to a real content
// client, configured from the BROWSERLESS_URL / BROWSERLESS_TOKEN environment
// variables. The calling test is skipped when BROWSERLESS_URL is unset.
func newIntegrationScraper(t *testing.T) *Scraper {
	t.Helper()

	base := os.Getenv("BROWSERLESS_URL")
	if base == "" {
		t.Skip("BROWSERLESS_URL not set — skipping integration test")
	}

	cfg := browser.Config{
		BaseURL:       base,
		Token:         os.Getenv("BROWSERLESS_TOKEN"),
		Timeout:       120 * time.Second,
		MaxConcurrent: 1,
	}
	return New(browser.NewContentClient(cfg), nil)
}
// ── Metadata ──────────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle verifies that
// ScrapeMetadata fetches the book page and correctly parses at minimum the
// slug, title, and source URL.
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	got, err := s.ScrapeMetadata(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeMetadata failed: %v", err)
	}

	// Dump every parsed field so failures are easy to diagnose.
	t.Logf("slug: %s", got.Slug)
	t.Logf("title: %s", got.Title)
	t.Logf("author: %s", got.Author)
	t.Logf("status: %s", got.Status)
	t.Logf("genres: %v", got.Genres)
	t.Logf("total_chapters: %d", got.TotalChapters)
	t.Logf("source_url: %s", got.SourceURL)

	if got.Slug != integrationBookSlug {
		t.Errorf("slug = %q, want %q", got.Slug, integrationBookSlug)
	}
	if got.Title == "" {
		t.Error("title is empty")
	}
	if !strings.EqualFold(got.Title, integrationBookTitle) {
		// The site may reword the title, so this is a warning, not a failure.
		t.Logf("WARN: title = %q, expected something like %q", got.Title, integrationBookTitle)
	}
	if got.SourceURL != integrationBookURL {
		t.Errorf("source_url = %q, want %q", got.SourceURL, integrationBookURL)
	}
}
// TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields verifies that
// every optional metadata field (author, status, genres, summary,
// total_chapters) is populated. An empty field is reported as a test error,
// since it most likely means an HTML selector broke after a site redesign.
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	meta, err := s.ScrapeMetadata(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeMetadata failed: %v", err)
	}

	// Table of field-name → "is it empty?" predicates, evaluated up front so
	// each missing field produces its own error line.
	checks := []struct {
		name  string
		empty bool
	}{
		{"author", meta.Author == ""},
		{"status", meta.Status == ""},
		{"summary", meta.Summary == ""},
		{"genres", len(meta.Genres) == 0},
		{"total_chapters", meta.TotalChapters == 0},
	}
	for _, c := range checks {
		if c.empty {
			t.Errorf("field %q is empty — HTML selector may have broken", c.name)
		}
	}

	// total_chapters must be a positive integer.
	if meta.TotalChapters < 1 {
		t.Errorf("total_chapters = %d, want >= 1", meta.TotalChapters)
	}
}
// ── Chapter list ──────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs verifies that
// ScrapeChapterList yields a non-empty slice of chapter references, each with
// a URL pointing at this book, a positive URL-derived number, and a title.
func TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	list, err := s.ScrapeChapterList(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeChapterList failed: %v", err)
	}
	t.Logf("total refs returned: %d", len(list))
	if len(list) == 0 {
		t.Fatal("ScrapeChapterList returned 0 refs")
	}

	// Validate every ref independently; the checks are deliberately not
	// short-circuited so one bad entry reports all of its problems.
	for i, r := range list {
		if r.URL == "" {
			t.Errorf("refs[%d].URL is empty", i)
		}
		if !strings.Contains(r.URL, integrationBookSlug) {
			t.Errorf("refs[%d].URL %q does not contain book slug", i, r.URL)
		}
		if r.Number <= 0 {
			t.Errorf("refs[%d].Number = %d, want > 0 (URL: %s)", i, r.Number, r.URL)
		}
		if r.Title == "" {
			t.Errorf("refs[%d].Title is empty (URL: %s)", i, r.URL)
		}
	}
}
// TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs guards the fix
// for the newest-first ordering bug: every ref's Number must equal the chapter
// number embedded in its URL rather than its position in the list.
func TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	list, err := s.ScrapeChapterList(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeChapterList failed: %v", err)
	}
	if len(list) == 0 {
		t.Fatal("ScrapeChapterList returned 0 refs")
	}

	// Cap the error spam at a handful of mismatches.
	const maxReports = 5
	reported := 0
	for i, r := range list {
		fromURL := chapterNumberFromURL(r.URL)
		if fromURL <= 0 {
			// URL has no parseable number — skip this entry.
			continue
		}
		if r.Number == fromURL {
			continue
		}
		t.Errorf("refs[%d]: Number=%d but URL %q implies number=%d (position-based bug?)",
			i, r.Number, r.URL, fromURL)
		reported++
		if reported >= maxReports {
			t.Log("… (further mismatches suppressed)")
			break
		}
	}

	// Log the first few refs so failures are easy to diagnose.
	for i := 0; i < len(list) && i < 5; i++ {
		t.Logf("refs[%d]: Number=%d Title=%q URL=%s", i, list[i].Number, list[i].Title, list[i].URL)
	}
}
// ── Chapters ──────────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeFirst3Chapters scrapes chapters 1, 2, and 3
// via ScrapeChapterText and verifies each returns non-empty markdown text.
// Chapters run as sub-tests so one failure does not abort the others.
func TestIntegration_Novelfire_ScrapeFirst3Chapters(t *testing.T) {
	s := newIntegrationScraper(t)

	for n := 1; n <= 3; n++ {
		// Construct the ref directly from the well-known URL scheme; the
		// list-driven variant below exercises ScrapeChapterList instead.
		ref := scraper.ChapterRef{
			Number: n,
			Title:  fmt.Sprintf("Chapter %d", n),
			URL:    fmt.Sprintf("%s/chapter-%d", integrationBookURL, n),
		}
		t.Run(fmt.Sprintf("chapter-%d", n), func(t *testing.T) {
			// Sequential: each chapter needs its own generous timeout.
			ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
			defer cancel()

			ch, err := s.ScrapeChapterText(ctx, ref)
			if err != nil {
				t.Fatalf("ScrapeChapterText failed: %v", err)
			}
			t.Logf("chapter %d: %d bytes of markdown", ref.Number, len(ch.Text))
			t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))

			// Ref fields must be echoed back unchanged.
			if ch.Ref.Number != ref.Number {
				t.Errorf("Ref.Number = %d, want %d", ch.Ref.Number, ref.Number)
			}
			if ch.Ref.URL != ref.URL {
				t.Errorf("Ref.URL = %q, want %q", ch.Ref.URL, ref.URL)
			}

			// Text must be non-trivially long.
			if len(ch.Text) < 100 {
				t.Errorf("Text too short (%d bytes) — likely empty or parsing failed:\n%s",
					len(ch.Text), ch.Text)
			}

			// Text must not contain raw HTML tags — NodeToMarkdown should have
			// stripped them.
			for _, tag := range []string{"<div", "<span", "<script", "<style"} {
				if strings.Contains(ch.Text, tag) {
					t.Errorf("Text contains raw HTML tag %q — markdown conversion may be broken", tag)
				}
			}
		})
	}
}
// TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList is the end-to-end
// variant: it first calls ScrapeChapterList to obtain the real refs (with
// URL-derived numbers), then scrapes chapters 1-3 using those refs. This
// catches any discrepancy between the list and the chapter URLs.
func TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList(t *testing.T) {
	s := newIntegrationScraper(t)

	// Step 1: fetch the chapter list.
	listCtx, listCancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer listCancel()
	refs, err := s.ScrapeChapterList(listCtx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeChapterList failed: %v", err)
	}
	if len(refs) == 0 {
		t.Fatal("ScrapeChapterList returned 0 refs")
	}

	// Index refs by chapter number for fast lookup.
	index := make(map[int]scraper.ChapterRef, len(refs))
	for _, r := range refs {
		index[r.Number] = r
	}

	// Step 2: scrape chapters 1, 2, 3.
	for n := 1; n <= 3; n++ {
		n := n // pre-Go 1.22 loop-variable capture
		ref, ok := index[n]
		if !ok {
			t.Errorf("chapter %d not found in chapter list (list has %d entries)", n, len(refs))
			continue
		}
		t.Run(fmt.Sprintf("chapter-%d", n), func(t *testing.T) {
			ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
			defer cancel()

			ch, err := s.ScrapeChapterText(ctx, ref)
			if err != nil {
				t.Fatalf("ScrapeChapterText(chapter %d, %s) failed: %v", n, ref.URL, err)
			}
			t.Logf("chapter %d (%q): %d bytes", n, ref.Title, len(ch.Text))
			t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))
			if len(ch.Text) < 100 {
				t.Errorf("chapter %d text too short (%d bytes)", n, len(ch.Text))
			}
		})
	}
}
// ── helpers ───────────────────────────────────────────────────────────────────

// truncateStr returns s unchanged when it is at most n bytes long; otherwise
// it returns a prefix of at most n bytes followed by "…".
//
// The cut position is backed up to a UTF-8 rune boundary so a multi-byte
// character is never split in half: scraped chapter text routinely contains
// non-ASCII punctuation ("—", "…"), and a naive s[:n] could emit invalid
// UTF-8 into the test log.
func truncateStr(s string, n int) string {
	if len(s) <= n {
		return s
	}
	// Walk back past UTF-8 continuation bytes so the prefix stays valid.
	cut := n
	for cut > 0 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	return s[:cut] + "…"
}