Files
libnovel/scraper/internal/novelfire/integration_test.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Add cover extraction fix (figure.cover > img rather than bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

345 lines
11 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build integration
// Integration tests for the novelfire.net Scraper against a live Browserless instance.
//
// These tests exercise the full scraping stack — Browserless → raw HTML →
// novelfire HTML parser — for the book:
//
// https://novelfire.net/book/a-dragon-against-the-whole-world
//
// They are gated behind the "integration" build tag so they never run in a
// normal `go test ./...` pass.
//
// Run with:
//
// BROWSERLESS_URL=http://localhost:3000 \
// BROWSERLESS_TOKEN=your-token \ # omit if auth is disabled
// go test -v -tags integration -timeout 600s \
// github.com/libnovel/scraper/internal/novelfire
package novelfire
import (
	"context"
	"fmt"
	"os"
	"strings"
	"testing"
	"time"
	"unicode/utf8"

	"github.com/libnovel/scraper/internal/browser"
	"github.com/libnovel/scraper/internal/scraper"
)
// Fixture: a real book on novelfire.net that every integration test in this
// file scrapes against.
const (
	// integrationBookURL is the live book page passed to the scraper.
	integrationBookURL = "https://novelfire.net/book/a-dragon-against-the-whole-world"
	// integrationBookSlug is the slug the scraper is expected to parse from the URL.
	integrationBookSlug = "a-dragon-against-the-whole-world"
	// integrationBookTitle is the expected title; compared case-insensitively,
	// and a mismatch is only logged (the site may reword it).
	integrationBookTitle = "A Dragon against the Whole World"
)
// newIntegrationScraper builds a novelfire Scraper wired to a real content
// client, configured from the BROWSERLESS_URL / BROWSERLESS_TOKEN environment
// variables. The calling test is skipped when BROWSERLESS_URL is unset.
func newIntegrationScraper(t *testing.T) *Scraper {
	t.Helper()

	base := os.Getenv("BROWSERLESS_URL")
	if base == "" {
		t.Skip("BROWSERLESS_URL not set — skipping integration test")
	}

	cfg := browser.Config{
		BaseURL:       base,
		Token:         os.Getenv("BROWSERLESS_TOKEN"),
		Timeout:       120 * time.Second,
		MaxConcurrent: 1,
	}
	return New(browser.NewContentClient(cfg), nil)
}
// ── Metadata ──────────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle verifies that
// ScrapeMetadata fetches the book page and correctly parses at minimum the
// slug, title, and source URL.
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	got, err := s.ScrapeMetadata(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeMetadata failed: %v", err)
	}

	// Dump every parsed field so failures are easy to diagnose.
	t.Logf("slug: %s", got.Slug)
	t.Logf("title: %s", got.Title)
	t.Logf("author: %s", got.Author)
	t.Logf("status: %s", got.Status)
	t.Logf("genres: %v", got.Genres)
	t.Logf("total_chapters: %d", got.TotalChapters)
	t.Logf("source_url: %s", got.SourceURL)

	if got.Slug != integrationBookSlug {
		t.Errorf("slug = %q, want %q", got.Slug, integrationBookSlug)
	}
	if got.Title == "" {
		t.Error("title is empty")
	}
	if !strings.EqualFold(got.Title, integrationBookTitle) {
		// The site may reword the title, so this is a warning, not a failure.
		t.Logf("WARN: title = %q, expected something like %q", got.Title, integrationBookTitle)
	}
	if got.SourceURL != integrationBookURL {
		t.Errorf("source_url = %q, want %q", got.SourceURL, integrationBookURL)
	}
}
// TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields verifies that
// every optional metadata field (author, status, genres, summary,
// total_chapters) is populated. An empty field is reported as a test error,
// since it most likely means an HTML selector broke after a site redesign.
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	meta, err := s.ScrapeMetadata(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeMetadata failed: %v", err)
	}

	// Table of field-name → "is it empty?" predicates, evaluated up front so
	// each missing field produces its own error line.
	checks := []struct {
		name  string
		empty bool
	}{
		{"author", meta.Author == ""},
		{"status", meta.Status == ""},
		{"summary", meta.Summary == ""},
		{"genres", len(meta.Genres) == 0},
		{"total_chapters", meta.TotalChapters == 0},
	}
	for _, c := range checks {
		if c.empty {
			t.Errorf("field %q is empty — HTML selector may have broken", c.name)
		}
	}

	// total_chapters must be a positive integer.
	if meta.TotalChapters < 1 {
		t.Errorf("total_chapters = %d, want >= 1", meta.TotalChapters)
	}
}
// ── Chapter list ──────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs verifies that
// ScrapeChapterList yields a non-empty slice of chapter references, each with
// a URL pointing at this book, a positive URL-derived number, and a title.
func TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	list, err := s.ScrapeChapterList(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeChapterList failed: %v", err)
	}
	t.Logf("total refs returned: %d", len(list))
	if len(list) == 0 {
		t.Fatal("ScrapeChapterList returned 0 refs")
	}

	// Validate every ref independently; the checks are deliberately not
	// short-circuited so one bad entry reports all of its problems.
	for i, r := range list {
		if r.URL == "" {
			t.Errorf("refs[%d].URL is empty", i)
		}
		if !strings.Contains(r.URL, integrationBookSlug) {
			t.Errorf("refs[%d].URL %q does not contain book slug", i, r.URL)
		}
		if r.Number <= 0 {
			t.Errorf("refs[%d].Number = %d, want > 0 (URL: %s)", i, r.Number, r.URL)
		}
		if r.Title == "" {
			t.Errorf("refs[%d].Title is empty (URL: %s)", i, r.URL)
		}
	}
}
// TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs guards the fix
// for the newest-first ordering bug: every ref's Number must equal the chapter
// number embedded in its URL rather than its position in the list.
func TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs(t *testing.T) {
	s := newIntegrationScraper(t)

	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	list, err := s.ScrapeChapterList(ctx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeChapterList failed: %v", err)
	}
	if len(list) == 0 {
		t.Fatal("ScrapeChapterList returned 0 refs")
	}

	// Cap the error spam at a handful of mismatches.
	const maxReports = 5
	reported := 0
	for i, r := range list {
		fromURL := chapterNumberFromURL(r.URL)
		if fromURL <= 0 {
			// URL has no parseable number — skip this entry.
			continue
		}
		if r.Number == fromURL {
			continue
		}
		t.Errorf("refs[%d]: Number=%d but URL %q implies number=%d (position-based bug?)",
			i, r.Number, r.URL, fromURL)
		reported++
		if reported >= maxReports {
			t.Log("… (further mismatches suppressed)")
			break
		}
	}

	// Log the first few refs so failures are easy to diagnose.
	for i := 0; i < len(list) && i < 5; i++ {
		t.Logf("refs[%d]: Number=%d Title=%q URL=%s", i, list[i].Number, list[i].Title, list[i].URL)
	}
}
// ── Chapters ──────────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeFirst3Chapters scrapes chapters 1, 2, and 3
// via ScrapeChapterText and verifies each returns non-empty markdown text.
// Chapters run as sub-tests so one failure does not abort the others.
func TestIntegration_Novelfire_ScrapeFirst3Chapters(t *testing.T) {
	s := newIntegrationScraper(t)

	for n := 1; n <= 3; n++ {
		// Construct the ref directly from the well-known URL scheme; the
		// list-driven variant below exercises ScrapeChapterList instead.
		ref := scraper.ChapterRef{
			Number: n,
			Title:  fmt.Sprintf("Chapter %d", n),
			URL:    fmt.Sprintf("%s/chapter-%d", integrationBookURL, n),
		}
		t.Run(fmt.Sprintf("chapter-%d", n), func(t *testing.T) {
			// Sequential: each chapter needs its own generous timeout.
			ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
			defer cancel()

			ch, err := s.ScrapeChapterText(ctx, ref)
			if err != nil {
				t.Fatalf("ScrapeChapterText failed: %v", err)
			}
			t.Logf("chapter %d: %d bytes of markdown", ref.Number, len(ch.Text))
			t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))

			// Ref fields must be echoed back unchanged.
			if ch.Ref.Number != ref.Number {
				t.Errorf("Ref.Number = %d, want %d", ch.Ref.Number, ref.Number)
			}
			if ch.Ref.URL != ref.URL {
				t.Errorf("Ref.URL = %q, want %q", ch.Ref.URL, ref.URL)
			}

			// Text must be non-trivially long.
			if len(ch.Text) < 100 {
				t.Errorf("Text too short (%d bytes) — likely empty or parsing failed:\n%s",
					len(ch.Text), ch.Text)
			}

			// Text must not contain raw HTML tags — NodeToMarkdown should have
			// stripped them.
			for _, tag := range []string{"<div", "<span", "<script", "<style"} {
				if strings.Contains(ch.Text, tag) {
					t.Errorf("Text contains raw HTML tag %q — markdown conversion may be broken", tag)
				}
			}
		})
	}
}
// TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList is the end-to-end
// variant: it first calls ScrapeChapterList to obtain the real refs (with
// URL-derived numbers), then scrapes chapters 1-3 using those refs. This
// catches any discrepancy between the list and the chapter URLs.
func TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList(t *testing.T) {
	s := newIntegrationScraper(t)

	// Step 1: fetch the chapter list.
	listCtx, listCancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer listCancel()
	refs, err := s.ScrapeChapterList(listCtx, integrationBookURL)
	if err != nil {
		t.Fatalf("ScrapeChapterList failed: %v", err)
	}
	if len(refs) == 0 {
		t.Fatal("ScrapeChapterList returned 0 refs")
	}

	// Index refs by chapter number for fast lookup.
	index := make(map[int]scraper.ChapterRef, len(refs))
	for _, r := range refs {
		index[r.Number] = r
	}

	// Step 2: scrape chapters 1, 2, 3.
	for n := 1; n <= 3; n++ {
		n := n // pre-Go 1.22 loop-variable capture
		ref, ok := index[n]
		if !ok {
			t.Errorf("chapter %d not found in chapter list (list has %d entries)", n, len(refs))
			continue
		}
		t.Run(fmt.Sprintf("chapter-%d", n), func(t *testing.T) {
			ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
			defer cancel()

			ch, err := s.ScrapeChapterText(ctx, ref)
			if err != nil {
				t.Fatalf("ScrapeChapterText(chapter %d, %s) failed: %v", n, ref.URL, err)
			}
			t.Logf("chapter %d (%q): %d bytes", n, ref.Title, len(ch.Text))
			t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))
			if len(ch.Text) < 100 {
				t.Errorf("chapter %d text too short (%d bytes)", n, len(ch.Text))
			}
		})
	}
}
// ── helpers ───────────────────────────────────────────────────────────────────

// truncateStr returns s unchanged when it is at most n bytes long; otherwise
// it returns a prefix of at most n bytes followed by "…".
//
// The cut position is backed up to a UTF-8 rune boundary so a multi-byte
// character is never split in half: scraped chapter text routinely contains
// non-ASCII punctuation ("—", "…"), and a naive s[:n] could emit invalid
// UTF-8 into the test log.
func truncateStr(s string, n int) string {
	if len(s) <= n {
		return s
	}
	// Walk back past UTF-8 continuation bytes so the prefix stays valid.
	cut := n
	for cut > 0 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	return s[:cut] + "…"
}