Files
libnovel/scraper/internal/novelfire/integration_test.go
Admin e7b915c6aa chore: update Browserless port references from 3000 to 3030
Update all default URLs, port mappings, healthcheck endpoints, Dockerfile
ENV defaults, and integration test run instructions to use port 3030.
2026-03-01 14:51:28 +05:00

345 lines
11 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build integration
// Integration tests for the novelfire.net Scraper against a live Browserless instance.
//
// These tests exercise the full scraping stack — Browserless → raw HTML →
// novelfire HTML parser — for the book:
//
// https://novelfire.net/book/a-dragon-against-the-whole-world
//
// They are gated behind the "integration" build tag so they never run in a
// normal `go test ./...` pass.
//
// Run with:
//
// BROWSERLESS_URL=http://localhost:3030 \
// BROWSERLESS_TOKEN=your-token \ # omit if auth is disabled
// go test -v -tags integration -timeout 600s \
// github.com/libnovel/scraper/internal/novelfire
package novelfire
import (
	"context"
	"fmt"
	"os"
	"strings"
	"testing"
	"time"
	"unicode/utf8"

	"github.com/libnovel/scraper/internal/browser"
	"github.com/libnovel/scraper/internal/scraper"
)
// Fixed test fixture: a known, long-running book on novelfire.net that every
// integration test in this file scrapes. Slug and title are used to verify
// that metadata parsing picked out the right fields from the live page.
const (
	integrationBookURL = "https://novelfire.net/book/a-dragon-against-the-whole-world"
	integrationBookSlug = "a-dragon-against-the-whole-world"
	integrationBookTitle = "A Dragon against the Whole World"
)
// newIntegrationScraper builds a Scraper backed by a live Browserless
// instance. Connection details come from the BROWSERLESS_URL and
// BROWSERLESS_TOKEN environment variables; when BROWSERLESS_URL is unset
// the calling test is skipped, so plain `go test` runs stay unaffected.
func newIntegrationScraper(t *testing.T) *Scraper {
	t.Helper()

	base := os.Getenv("BROWSERLESS_URL")
	if base == "" {
		t.Skip("BROWSERLESS_URL not set — skipping integration test")
	}

	cfg := browser.Config{
		BaseURL:       base,
		Token:         os.Getenv("BROWSERLESS_TOKEN"),
		Timeout:       120 * time.Second,
		MaxConcurrent: 1,
	}
	return New(browser.NewContentClient(cfg), nil)
}
// ── Metadata ──────────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle checks the core
// metadata contract: fetching the live book page must yield the expected
// slug, a non-empty title, and the original source URL. The exact title is
// only soft-checked, since the site may reword it.
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	meta, err := s.ScrapeMetadata(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeMetadata failed: %v", err)
	}

	// Dump everything we parsed so failures are easy to diagnose.
	t.Logf("slug: %s", meta.Slug)
	t.Logf("title: %s", meta.Title)
	t.Logf("author: %s", meta.Author)
	t.Logf("status: %s", meta.Status)
	t.Logf("genres: %v", meta.Genres)
	t.Logf("total_chapters: %d", meta.TotalChapters)
	t.Logf("source_url: %s", meta.SourceURL)

	if got, want := meta.Slug, integrationBookSlug; got != want {
		t.Errorf("slug = %q, want %q", got, want)
	}
	if meta.Title == "" {
		t.Error("title is empty")
	}
	if !strings.EqualFold(meta.Title, integrationBookTitle) {
		// Soft check only — the site occasionally rewords titles.
		t.Logf("WARN: title = %q, expected something like %q", meta.Title, integrationBookTitle)
	}
	if got, want := meta.SourceURL, integrationBookURL; got != want {
		t.Errorf("source_url = %q, want %q", got, want)
	}
}
// TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields checks that
// every optional metadata field (author, status, summary, genres,
// total_chapters) is populated on the live page. An empty field fails the
// test, since it usually means one of the site's HTML selectors has broken.
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	meta, err := s.ScrapeMetadata(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeMetadata failed: %v", err)
	}

	// Parallel slices keep the failure order deterministic.
	names := []string{"author", "status", "summary", "genres", "total_chapters"}
	missing := []bool{
		meta.Author == "",
		meta.Status == "",
		meta.Summary == "",
		len(meta.Genres) == 0,
		meta.TotalChapters == 0,
	}
	for i, name := range names {
		if missing[i] {
			t.Errorf("field %q is empty — HTML selector may have broken", name)
		}
	}

	// The chapter count must additionally be a positive integer.
	if meta.TotalChapters < 1 {
		t.Errorf("total_chapters = %d, want >= 1", meta.TotalChapters)
	}
}
// ── Chapter list ──────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs checks that the
// chapter list is non-empty and that every reference carries a URL for the
// right book, a positive URL-derived number, and a non-empty title.
func TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	refs, err := s.ScrapeChapterList(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeChapterList failed: %v", err)
	}
	t.Logf("total refs returned: %d", len(refs))
	if len(refs) == 0 {
		t.Fatal("ScrapeChapterList returned 0 refs")
	}

	// Each check is independent so one bad ref reports every broken field.
	for idx, r := range refs {
		if r.URL == "" {
			t.Errorf("refs[%d].URL is empty", idx)
		}
		if !strings.Contains(r.URL, integrationBookSlug) {
			t.Errorf("refs[%d].URL %q does not contain book slug", idx, r.URL)
		}
		if r.Number <= 0 {
			t.Errorf("refs[%d].Number = %d, want > 0 (URL: %s)", idx, r.Number, r.URL)
		}
		if r.Title == "" {
			t.Errorf("refs[%d].Title is empty (URL: %s)", idx, r.URL)
		}
	}
}
// TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs guards the
// fix for the newest-first ordering bug: each ref's Number must equal the
// chapter number embedded in its URL, not its position in the list.
func TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	refs, err := s.ScrapeChapterList(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeChapterList failed: %v", err)
	}
	if len(refs) == 0 {
		t.Fatal("ScrapeChapterList returned 0 refs")
	}

	// Report at most a handful of mismatches to keep the output readable.
	const maxReported = 5
	reported := 0
	for idx, r := range refs {
		fromURL := chapterNumberFromURL(r.URL)
		if fromURL <= 0 {
			// URL carries no parseable chapter number — nothing to compare.
			continue
		}
		if r.Number == fromURL {
			continue
		}
		t.Errorf("refs[%d]: Number=%d but URL %q implies number=%d (position-based bug?)",
			idx, r.Number, r.URL, fromURL)
		reported++
		if reported >= maxReported {
			t.Log("… (further mismatches suppressed)")
			break
		}
	}

	// Log the first few refs so failures are easy to diagnose.
	for idx := 0; idx < len(refs) && idx < 5; idx++ {
		t.Logf("refs[%d]: Number=%d Title=%q URL=%s", idx, refs[idx].Number, refs[idx].Title, refs[idx].URL)
	}
}
// ── Chapters ──────────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeFirst3Chapters fetches chapters 1-3 via
// ScrapeChapterText and checks each returns non-trivial markdown with no
// leftover raw HTML. Each chapter runs as its own sub-test so a single
// failure does not abort the others.
func TestIntegration_Novelfire_ScrapeFirst3Chapters(t *testing.T) {
	s := newIntegrationScraper(t)

	for n := 1; n <= 3; n++ {
		// Declared inside the loop body, so each closure gets its own ref.
		ref := scraper.ChapterRef{
			Number: n,
			Title:  fmt.Sprintf("Chapter %d", n),
			URL:    fmt.Sprintf("%s/chapter-%d", integrationBookURL, n),
		}
		t.Run(fmt.Sprintf("chapter-%d", ref.Number), func(t *testing.T) {
			// Chapters scrape sequentially; each gets its own generous timeout.
			ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
			defer cancel()

			ch, err := s.ScrapeChapterText(ctx, ref)
			if err != nil {
				t.Fatalf("ScrapeChapterText failed: %v", err)
			}
			t.Logf("chapter %d: %d bytes of markdown", ref.Number, len(ch.Text))
			t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))

			// The scraper must echo the ref fields back unchanged.
			if ch.Ref.Number != ref.Number {
				t.Errorf("Ref.Number = %d, want %d", ch.Ref.Number, ref.Number)
			}
			if ch.Ref.URL != ref.URL {
				t.Errorf("Ref.URL = %q, want %q", ch.Ref.URL, ref.URL)
			}

			// A real chapter is never this short.
			if len(ch.Text) < 100 {
				t.Errorf("Text too short (%d bytes) — likely empty or parsing failed:\n%s",
					len(ch.Text), ch.Text)
			}

			// NodeToMarkdown should have stripped every raw HTML tag.
			for _, tag := range []string{"<div", "<span", "<script", "<style"} {
				if strings.Contains(ch.Text, tag) {
					t.Errorf("Text contains raw HTML tag %q — markdown conversion may be broken", tag)
				}
			}
		})
	}
}
// TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList is the end-to-end
// variant: it first obtains the real refs (with URL-derived numbers) from
// ScrapeChapterList, then scrapes chapters 1-3 using those refs. This
// catches any discrepancy between the list and the chapter URLs.
func TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList(t *testing.T) {
	s := newIntegrationScraper(t)

	// Step 1: fetch the chapter list.
	listCtx, listCancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer listCancel()

	refs, err := s.ScrapeChapterList(listCtx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeChapterList failed: %v", err)
	}
	if len(refs) == 0 {
		t.Fatal("ScrapeChapterList returned 0 refs")
	}

	// Index the refs by chapter number for O(1) lookup.
	index := make(map[int]scraper.ChapterRef, len(refs))
	for _, r := range refs {
		index[r.Number] = r
	}

	// Step 2: scrape chapters 1, 2, 3 via the refs from the list.
	for want := 1; want <= 3; want++ {
		ref, ok := index[want]
		if !ok {
			t.Errorf("chapter %d not found in chapter list (list has %d entries)", want, len(refs))
			continue
		}
		want := want // shadow for the sub-test closure
		t.Run(fmt.Sprintf("chapter-%d", want), func(t *testing.T) {
			ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
			defer cancel()

			ch, err := s.ScrapeChapterText(ctx, ref)
			if err != nil {
				t.Fatalf("ScrapeChapterText(chapter %d, %s) failed: %v", want, ref.URL, err)
			}
			t.Logf("chapter %d (%q): %d bytes", want, ref.Title, len(ch.Text))
			t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))
			if len(ch.Text) < 100 {
				t.Errorf("chapter %d text too short (%d bytes)", want, len(ch.Text))
			}
		})
	}
}
// ── helpers ───────────────────────────────────────────────────────────────────

// truncateStr shortens s to at most n bytes for log output, appending an
// ellipsis when anything was cut. The cut point is backed up to a rune
// boundary so a multi-byte UTF-8 sequence is never split in half — chapter
// text is prose and frequently contains non-ASCII characters, and a raw
// s[:n] slice could otherwise emit an invalid byte sequence into the logs.
func truncateStr(s string, n int) string {
	if len(s) <= n {
		return s
	}
	// Walk back from n until we land on the first byte of a rune.
	cut := n
	for cut > 0 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	return s[:cut] + "…"
}