Files
libnovel/scraper/internal/novelfire/scraper_test.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Add cover extraction fix (figure.cover > img rather than bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

218 lines
8.2 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package novelfire
import (
"context"
"strings"
"testing"
"github.com/libnovel/scraper/internal/browser"
"github.com/libnovel/scraper/internal/scraper"
)
// ── stub browser client ───────────────────────────────────────────────────────

// stubClient implements BrowserClient by answering every GetContent call with
// one fixed HTML document. ScrapePage and CDPSession are inert no-ops because
// the tests in this file never exercise them.
type stubClient struct {
	html string
}

// Strategy reports the content strategy; the tests only need a valid value.
func (sc *stubClient) Strategy() browser.Strategy { return browser.StrategyContent }

// GetContent ignores the request and hands back the canned document.
func (sc *stubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) {
	return sc.html, nil
}

// ScrapePage is unused by these tests and returns the zero response.
func (sc *stubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) {
	return browser.ScrapeResponse{}, nil
}

// CDPSession is unused by these tests and reports success without doing work.
func (sc *stubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error {
	return nil
}
// pagedStubClient serves one canned HTML document per GetContent call, in the
// order supplied. Once the pages are exhausted it answers with a document that
// contains no chapter-list, which is how the real site signals the last page
// and lets pagination terminate.
type pagedStubClient struct {
	pages []string
	call  int
}

// Strategy reports the content strategy; the tests only need a valid value.
func (p *pagedStubClient) Strategy() browser.Strategy { return browser.StrategyContent }

// GetContent returns the next unserved page, or a chapter-list-free document
// once all pages have been consumed.
func (p *pagedStubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) {
	if p.call >= len(p.pages) {
		// Past the last page — return a page with no chapter-list to stop pagination.
		return `<!DOCTYPE html><html><body><div class="no-content"></div></body></html>`, nil
	}
	page := p.pages[p.call]
	p.call++
	return page, nil
}

// ScrapePage is unused by these tests and returns the zero response.
func (p *pagedStubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) {
	return browser.ScrapeResponse{}, nil
}

// CDPSession is unused by these tests and reports success without doing work.
func (p *pagedStubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error {
	return nil
}
// ── helpers ───────────────────────────────────────────────────────────────────

// newScraper builds a Scraper whose chapter-content client and chapter-list
// URL client both serve the same fixed HTML document.
func newScraper(html string) *Scraper {
	contentClient := &stubClient{html: html}
	urlClient := &stubClient{html: html}
	return New(contentClient, nil, urlClient)
}
func newPagedScraper(pages ...string) *Scraper {
urlClient := &pagedStubClient{pages: pages}
return New(&stubClient{}, nil, urlClient)
}
// ── ScrapeChapterText ─────────────────────────────────────────────────────────

// TestScrapeChapterText_ExtractsInnerText verifies that the text of every
// paragraph inside the #content container ends up in the scraped chapter and
// that the chapter number from the ref is carried through unchanged.
func TestScrapeChapterText_ExtractsInnerText(t *testing.T) {
	const doc = `<!DOCTYPE html><html><body>
<div id="content">
<p>It was a dark and stormy night.</p>
<p>The hero stepped forward.</p>
</div>
</body></html>`
	scr := newScraper(doc)
	chRef := scraper.ChapterRef{
		Number: 1,
		Title:  "Chapter 1",
		URL:    "https://novelfire.net/book/test-novel/chapter-1",
	}

	got, err := scr.ScrapeChapterText(context.Background(), chRef)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if got.Ref.Number != 1 {
		t.Errorf("expected chapter number 1, got %d", got.Ref.Number)
	}
	if !strings.Contains(got.Text, "dark and stormy") {
		t.Errorf("expected chapter text to contain 'dark and stormy', got: %q", got.Text)
	}
	if !strings.Contains(got.Text, "hero stepped forward") {
		t.Errorf("expected chapter text to contain 'hero stepped forward', got: %q", got.Text)
	}
}
// TestScrapeChapterText_MissingContainer verifies that scraping a page with no
// #content container surfaces an error rather than returning an empty chapter.
func TestScrapeChapterText_MissingContainer(t *testing.T) {
	const doc = `<!DOCTYPE html><html><body><div class="other">nothing here</div></body></html>`
	scr := newScraper(doc)
	chRef := scraper.ChapterRef{
		Number: 2,
		Title:  "Chapter 2",
		URL:    "https://novelfire.net/book/test-novel/chapter-2",
	}
	if _, err := scr.ScrapeChapterText(context.Background(), chRef); err == nil {
		t.Fatal("expected an error when #content container is missing, got nil")
	}
}
// ── chapterNumberFromURL ──────────────────────────────────────────────────────

// TestChapterNumberFromURL covers the supported URL shapes plus the
// unparseable inputs, which must yield 0 as the fallback sentinel.
func TestChapterNumberFromURL(t *testing.T) {
	cases := []struct {
		url  string
		want int
	}{
		// Standard novelfire pattern.
		{url: "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1", want: 1},
		{url: "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-26", want: 26},
		{url: "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-58", want: 58},
		// Large chapter numbers.
		{url: "https://novelfire.net/book/some-novel/chapter-1000", want: 1000},
		// Path segment with trailing slash.
		{url: "https://novelfire.net/book/some-novel/chapter-5/", want: 5},
		// Slug with title appended after the number (hypothetical future format).
		{url: "https://novelfire.net/book/some-novel/chapter-42-the-battle", want: 42},
		// Unparseable — should return 0 so the caller can fall back.
		{url: "https://novelfire.net/book/some-novel/prologue", want: 0},
		{url: "https://novelfire.net/book/some-novel/", want: 0},
		{url: "not-a-url", want: 0},
	}

	for _, c := range cases {
		if got := chapterNumberFromURL(c.url); got != c.want {
			t.Errorf("chapterNumberFromURL(%q) = %d, want %d", c.url, got, c.want)
		}
	}
}
// ── ScrapeChapterList (position vs URL numbering) ─────────────────────────────

// TestScrapeChapterList_NumbersFromURL verifies that when the chapter list HTML
// is served newest-first (as novelfire.net does), chapter numbers are still
// assigned from the URL — not from list position — so that a re-run correctly
// identifies which chapters are already on disk.
func TestScrapeChapterList_NumbersFromURL(t *testing.T) {
	// Simulate a newest-first chapter list with 5 chapters on a single page:
	// list positions 1..5 carry chapters 5,4,3,2,1.
	const listPage = `<!DOCTYPE html><html><body>
<ul class="chapter-list">
<li class="chapter-item"><a href="/book/test/chapter-5">Chapter 5</a></li>
<li class="chapter-item"><a href="/book/test/chapter-4">Chapter 4</a></li>
<li class="chapter-item"><a href="/book/test/chapter-3">Chapter 3</a></li>
<li class="chapter-item"><a href="/book/test/chapter-2">Chapter 2</a></li>
<li class="chapter-item"><a href="/book/test/chapter-1">Chapter 1</a></li>
</ul>
</body></html>`

	scr := newPagedScraper(listPage)
	refs, err := scr.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(refs) != 5 {
		t.Fatalf("expected 5 refs, got %d", len(refs))
	}

	// With position-based numbering (the old bug), refs[0].Number would be 1
	// even though its URL is /chapter-5. With URL-based numbering it must be 5.
	expected := []int{5, 4, 3, 2, 1}
	for i, ref := range refs {
		if ref.Number != expected[i] {
			t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, expected[i], ref.URL)
		}
	}
}
// TestScrapeChapterList_Pagination verifies that the scraper correctly follows
// ?page=N pagination, appending each page's refs in order, and stops when a
// page returns no chapter items.
func TestScrapeChapterList_Pagination(t *testing.T) {
	const firstPage = `<!DOCTYPE html><html><body>
<ul class="chapter-list">
<li class="chapter-item"><a href="/book/test/chapter-3">Chapter 3</a></li>
<li class="chapter-item"><a href="/book/test/chapter-2">Chapter 2</a></li>
<li class="chapter-item"><a href="/book/test/chapter-1">Chapter 1</a></li>
</ul>
</body></html>`
	const secondPage = `<!DOCTYPE html><html><body>
<ul class="chapter-list">
<li class="chapter-item"><a href="/book/test/chapter-6">Chapter 6</a></li>
<li class="chapter-item"><a href="/book/test/chapter-5">Chapter 5</a></li>
<li class="chapter-item"><a href="/book/test/chapter-4">Chapter 4</a></li>
</ul>
</body></html>`

	// No third page is supplied — pagedStubClient then serves an empty page,
	// which is what terminates pagination.
	scr := newPagedScraper(firstPage, secondPage)
	refs, err := scr.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(refs) != 6 {
		t.Fatalf("expected 6 refs (3 per page × 2 pages), got %d", len(refs))
	}

	expected := []int{3, 2, 1, 6, 5, 4}
	for i, ref := range refs {
		if ref.Number != expected[i] {
			t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, expected[i], ref.URL)
		}
	}
}