- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text
- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes) with local-library annotation and one-click scrape buttons
- Add StrategyDirect (plain HTTP client) as a new browser strategy; the default strategy is now 'direct' for chapter fetching and 'content' for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)
- Fix chapter numbering bug: numbers are now derived from the URL path (/chapter-N) rather than list position, correcting newest-first ordering
- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved source_url without knowing the original URL
- Extend NovelScraper interface with RankingProvider (ScrapeRanking)
- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions timeout set to 60 s, content/scrape client defaults raised to 90 s
- Fix cover extraction (figure.cover > img rather than bare img.cover)
- Add AGENTS.md and .aiignore for AI tooling context
- Add integration tests for browser client and novelfire scraper (build tag: integration) and unit tests for chapterNumberFromURL and pagination
345 lines · 11 KiB · Go
//go:build integration

// Integration tests for the novelfire.net Scraper against a live Browserless instance.
//
// These tests exercise the full scraping stack — Browserless → raw HTML →
// novelfire HTML parser — for the book:
//
//	https://novelfire.net/book/a-dragon-against-the-whole-world
//
// They are gated behind the "integration" build tag so they never run in a
// normal `go test ./...` pass.
//
// Run with:
//
//	BROWSERLESS_URL=http://localhost:3000 \
//	BROWSERLESS_TOKEN=your-token \      # omit if auth is disabled
//	go test -v -tags integration -timeout 600s \
//	  github.com/libnovel/scraper/internal/novelfire
||
package novelfire
|
||
|
||
import (
	"context"
	"fmt"
	"os"
	"strings"
	"testing"
	"time"
	"unicode/utf8"

	"github.com/libnovel/scraper/internal/browser"
	"github.com/libnovel/scraper/internal/scraper"
)
|
||
|
||
const (
|
||
integrationBookURL = "https://novelfire.net/book/a-dragon-against-the-whole-world"
|
||
integrationBookSlug = "a-dragon-against-the-whole-world"
|
||
integrationBookTitle = "A Dragon against the Whole World"
|
||
)
|
||
|
||
// newIntegrationScraper reads BROWSERLESS_URL / BROWSERLESS_TOKEN from the
|
||
// environment, constructs a real contentClient, and returns a novelfire Scraper
|
||
// wired to it. The test is skipped when BROWSERLESS_URL is not set.
|
||
func newIntegrationScraper(t *testing.T) *Scraper {
|
||
t.Helper()
|
||
baseURL := os.Getenv("BROWSERLESS_URL")
|
||
if baseURL == "" {
|
||
t.Skip("BROWSERLESS_URL not set — skipping integration test")
|
||
}
|
||
client := browser.NewContentClient(browser.Config{
|
||
BaseURL: baseURL,
|
||
Token: os.Getenv("BROWSERLESS_TOKEN"),
|
||
Timeout: 120 * time.Second,
|
||
MaxConcurrent: 1,
|
||
})
|
||
return New(client, nil)
|
||
}
|
||
|
||
// ── Metadata ──────────────────────────────────────────────────────────────────
|
||
|
||
// TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle verifies that
|
||
// ScrapeMetadata fetches the book page and correctly parses at minimum
|
||
// the slug, title, and source URL.
|
||
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle(t *testing.T) {
|
||
s := newIntegrationScraper(t)
|
||
|
||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||
defer cancel()
|
||
|
||
meta, err := s.ScrapeMetadata(ctx, integrationBookURL)
|
||
if err != nil {
|
||
t.Fatalf("ScrapeMetadata failed: %v", err)
|
||
}
|
||
|
||
t.Logf("slug: %s", meta.Slug)
|
||
t.Logf("title: %s", meta.Title)
|
||
t.Logf("author: %s", meta.Author)
|
||
t.Logf("status: %s", meta.Status)
|
||
t.Logf("genres: %v", meta.Genres)
|
||
t.Logf("total_chapters: %d", meta.TotalChapters)
|
||
t.Logf("source_url: %s", meta.SourceURL)
|
||
|
||
if meta.Slug != integrationBookSlug {
|
||
t.Errorf("slug = %q, want %q", meta.Slug, integrationBookSlug)
|
||
}
|
||
if meta.Title == "" {
|
||
t.Error("title is empty")
|
||
}
|
||
if !strings.EqualFold(meta.Title, integrationBookTitle) {
|
||
// Warn rather than hard-fail — the site may reword the title.
|
||
t.Logf("WARN: title = %q, expected something like %q", meta.Title, integrationBookTitle)
|
||
}
|
||
if meta.SourceURL != integrationBookURL {
|
||
t.Errorf("source_url = %q, want %q", meta.SourceURL, integrationBookURL)
|
||
}
|
||
}
|
||
|
||
// TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields verifies that
|
||
// every optional field (author, status, genres, summary, total_chapters) is
|
||
// populated. A missing field is a warning, not a hard failure, because the
|
||
// site may change its HTML structure.
|
||
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields(t *testing.T) {
|
||
s := newIntegrationScraper(t)
|
||
|
||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||
defer cancel()
|
||
|
||
meta, err := s.ScrapeMetadata(ctx, integrationBookURL)
|
||
if err != nil {
|
||
t.Fatalf("ScrapeMetadata failed: %v", err)
|
||
}
|
||
|
||
type check struct {
|
||
field string
|
||
empty bool
|
||
}
|
||
checks := []check{
|
||
{"author", meta.Author == ""},
|
||
{"status", meta.Status == ""},
|
||
{"summary", meta.Summary == ""},
|
||
{"genres", len(meta.Genres) == 0},
|
||
{"total_chapters", meta.TotalChapters == 0},
|
||
}
|
||
for _, c := range checks {
|
||
if c.empty {
|
||
t.Errorf("field %q is empty — HTML selector may have broken", c.field)
|
||
}
|
||
}
|
||
|
||
// total_chapters must be a positive integer.
|
||
if meta.TotalChapters < 1 {
|
||
t.Errorf("total_chapters = %d, want >= 1", meta.TotalChapters)
|
||
}
|
||
}
|
||
|
||
// ── Chapter list ──────────────────────────────────────────────────────────────
|
||
|
||
// TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs verifies that
|
||
// ScrapeChapterList returns a non-empty slice of chapter references with
|
||
// valid URLs and numbers parsed from those URLs (not list position).
|
||
func TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs(t *testing.T) {
|
||
s := newIntegrationScraper(t)
|
||
|
||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||
defer cancel()
|
||
|
||
refs, err := s.ScrapeChapterList(ctx, integrationBookURL)
|
||
if err != nil {
|
||
t.Fatalf("ScrapeChapterList failed: %v", err)
|
||
}
|
||
|
||
t.Logf("total refs returned: %d", len(refs))
|
||
|
||
if len(refs) == 0 {
|
||
t.Fatal("ScrapeChapterList returned 0 refs")
|
||
}
|
||
|
||
// Every ref must have a non-empty URL pointing at the correct book.
|
||
for i, ref := range refs {
|
||
if ref.URL == "" {
|
||
t.Errorf("refs[%d].URL is empty", i)
|
||
}
|
||
if !strings.Contains(ref.URL, integrationBookSlug) {
|
||
t.Errorf("refs[%d].URL %q does not contain book slug", i, ref.URL)
|
||
}
|
||
if ref.Number <= 0 {
|
||
t.Errorf("refs[%d].Number = %d, want > 0 (URL: %s)", i, ref.Number, ref.URL)
|
||
}
|
||
if ref.Title == "" {
|
||
t.Errorf("refs[%d].Title is empty (URL: %s)", i, ref.URL)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs verifies the
|
||
// fix for the newest-first ordering bug: each ref's Number must equal the
|
||
// chapter number embedded in its URL, not its position in the list.
|
||
func TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs(t *testing.T) {
|
||
s := newIntegrationScraper(t)
|
||
|
||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||
defer cancel()
|
||
|
||
refs, err := s.ScrapeChapterList(ctx, integrationBookURL)
|
||
if err != nil {
|
||
t.Fatalf("ScrapeChapterList failed: %v", err)
|
||
}
|
||
if len(refs) == 0 {
|
||
t.Fatal("ScrapeChapterList returned 0 refs")
|
||
}
|
||
|
||
mismatches := 0
|
||
for i, ref := range refs {
|
||
wantNum := chapterNumberFromURL(ref.URL)
|
||
if wantNum <= 0 {
|
||
// URL has no parseable number — skip this entry.
|
||
continue
|
||
}
|
||
if ref.Number != wantNum {
|
||
t.Errorf("refs[%d]: Number=%d but URL %q implies number=%d (position-based bug?)",
|
||
i, ref.Number, ref.URL, wantNum)
|
||
mismatches++
|
||
if mismatches >= 5 {
|
||
t.Log("… (further mismatches suppressed)")
|
||
break
|
||
}
|
||
}
|
||
}
|
||
|
||
// Log the first few refs so failures are easy to diagnose.
|
||
limit := 5
|
||
if len(refs) < limit {
|
||
limit = len(refs)
|
||
}
|
||
for i := 0; i < limit; i++ {
|
||
t.Logf("refs[%d]: Number=%d Title=%q URL=%s", i, refs[i].Number, refs[i].Title, refs[i].URL)
|
||
}
|
||
}
|
||
|
||
// ── Chapters ──────────────────────────────────────────────────────────────────
|
||
|
||
// TestIntegration_Novelfire_ScrapeFirst3Chapters scrapes chapters 1, 2, and 3
|
||
// via ScrapeChapterText and verifies each returns non-empty markdown text.
|
||
// Chapters are run as sub-tests so a single failure does not abort the others.
|
||
func TestIntegration_Novelfire_ScrapeFirst3Chapters(t *testing.T) {
|
||
s := newIntegrationScraper(t)
|
||
|
||
chapters := []scraper.ChapterRef{
|
||
{
|
||
Number: 1,
|
||
Title: "Chapter 1",
|
||
URL: integrationBookURL + "/chapter-1",
|
||
},
|
||
{
|
||
Number: 2,
|
||
Title: "Chapter 2",
|
||
URL: integrationBookURL + "/chapter-2",
|
||
},
|
||
{
|
||
Number: 3,
|
||
Title: "Chapter 3",
|
||
URL: integrationBookURL + "/chapter-3",
|
||
},
|
||
}
|
||
|
||
for _, ref := range chapters {
|
||
ref := ref // capture
|
||
t.Run(fmt.Sprintf("chapter-%d", ref.Number), func(t *testing.T) {
|
||
// Sequential: each chapter needs its own generous timeout.
|
||
ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
|
||
defer cancel()
|
||
|
||
ch, err := s.ScrapeChapterText(ctx, ref)
|
||
if err != nil {
|
||
t.Fatalf("ScrapeChapterText failed: %v", err)
|
||
}
|
||
|
||
t.Logf("chapter %d: %d bytes of markdown", ref.Number, len(ch.Text))
|
||
t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))
|
||
|
||
// Ref fields must be echoed back unchanged.
|
||
if ch.Ref.Number != ref.Number {
|
||
t.Errorf("Ref.Number = %d, want %d", ch.Ref.Number, ref.Number)
|
||
}
|
||
if ch.Ref.URL != ref.URL {
|
||
t.Errorf("Ref.URL = %q, want %q", ch.Ref.URL, ref.URL)
|
||
}
|
||
|
||
// Text must be non-trivially long.
|
||
if len(ch.Text) < 100 {
|
||
t.Errorf("Text too short (%d bytes) — likely empty or parsing failed:\n%s",
|
||
len(ch.Text), ch.Text)
|
||
}
|
||
|
||
// Text must not contain raw HTML tags — NodeToMarkdown should have
|
||
// stripped them.
|
||
for _, tag := range []string{"<div", "<span", "<script", "<style"} {
|
||
if strings.Contains(ch.Text, tag) {
|
||
t.Errorf("Text contains raw HTML tag %q — markdown conversion may be broken", tag)
|
||
}
|
||
}
|
||
})
|
||
}
|
||
}
|
||
|
||
// TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList is the end-to-end
|
||
// variant: it first calls ScrapeChapterList to get the real refs (with
|
||
// URL-derived numbers), then scrapes chapters 1–3 using those refs.
|
||
// This catches any discrepancy between the list and the chapter URLs.
|
||
func TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList(t *testing.T) {
|
||
s := newIntegrationScraper(t)
|
||
|
||
// Step 1: fetch the chapter list.
|
||
listCtx, listCancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||
defer listCancel()
|
||
|
||
refs, err := s.ScrapeChapterList(listCtx, integrationBookURL)
|
||
if err != nil {
|
||
t.Fatalf("ScrapeChapterList failed: %v", err)
|
||
}
|
||
if len(refs) == 0 {
|
||
t.Fatal("ScrapeChapterList returned 0 refs")
|
||
}
|
||
|
||
// Build a map number→ref for fast lookup.
|
||
byNumber := make(map[int]scraper.ChapterRef, len(refs))
|
||
for _, r := range refs {
|
||
byNumber[r.Number] = r
|
||
}
|
||
|
||
// Step 2: scrape chapters 1, 2, 3.
|
||
for _, wantNum := range []int{1, 2, 3} {
|
||
wantNum := wantNum
|
||
ref, ok := byNumber[wantNum]
|
||
if !ok {
|
||
t.Errorf("chapter %d not found in chapter list (list has %d entries)", wantNum, len(refs))
|
||
continue
|
||
}
|
||
|
||
t.Run(fmt.Sprintf("chapter-%d", wantNum), func(t *testing.T) {
|
||
ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
|
||
defer cancel()
|
||
|
||
ch, err := s.ScrapeChapterText(ctx, ref)
|
||
if err != nil {
|
||
t.Fatalf("ScrapeChapterText(chapter %d, %s) failed: %v", wantNum, ref.URL, err)
|
||
}
|
||
|
||
t.Logf("chapter %d (%q): %d bytes", wantNum, ref.Title, len(ch.Text))
|
||
t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))
|
||
|
||
if len(ch.Text) < 100 {
|
||
t.Errorf("chapter %d text too short (%d bytes)", wantNum, len(ch.Text))
|
||
}
|
||
})
|
||
}
|
||
}
|
||
|
||
// ── helpers ───────────────────────────────────────────────────────────────────

// truncateStr returns s unchanged when it fits within n bytes; otherwise it
// returns a prefix of at most n bytes followed by "…".
//
// The cut is backed off to the nearest rune boundary so a multi-byte UTF-8
// sequence is never split in half — the original s[:n] could emit invalid
// UTF-8 into t.Logf output when the limit landed inside a rune.
func truncateStr(s string, n int) string {
	if len(s) <= n {
		return s
	}
	// Walk backwards past UTF-8 continuation bytes to a rune start.
	for n > 0 && !utf8.RuneStart(s[n]) {
		n--
	}
	return s[:n] + "…"
}
|