Files
libnovel/scraper/internal/novelfire/scraper_test.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Add cover extraction fix (figure.cover > img rather than bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

218 lines
8.2 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package novelfire
import (
"context"
"strings"
"testing"
"github.com/libnovel/scraper/internal/browser"
"github.com/libnovel/scraper/internal/scraper"
)
// ── stub browser client ───────────────────────────────────────────────────────

// stubClient implements BrowserClient by answering every GetContent call with
// one fixed HTML document. ScrapePage and CDPSession are inert no-ops because
// the tests in this file never exercise them.
type stubClient struct {
	html string
}

// Strategy reports the content strategy; the tests only need a valid value.
func (sc *stubClient) Strategy() browser.Strategy { return browser.StrategyContent }

// GetContent ignores the request and hands back the canned document.
func (sc *stubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) {
	return sc.html, nil
}

// ScrapePage is unused by these tests and returns the zero response.
func (sc *stubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) {
	return browser.ScrapeResponse{}, nil
}

// CDPSession is unused by these tests and reports success without doing work.
func (sc *stubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error {
	return nil
}
// pagedStubClient serves one canned HTML document per GetContent call, in the
// order supplied. Once the pages are exhausted it answers with a document that
// contains no chapter-list, which is how the real site signals the last page
// and lets pagination terminate.
type pagedStubClient struct {
	pages []string
	call  int
}

// Strategy reports the content strategy; the tests only need a valid value.
func (p *pagedStubClient) Strategy() browser.Strategy { return browser.StrategyContent }

// GetContent returns the next unserved page, or a chapter-list-free document
// once all pages have been consumed.
func (p *pagedStubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) {
	if p.call >= len(p.pages) {
		// Past the last page — return a page with no chapter-list to stop pagination.
		return `<!DOCTYPE html><html><body><div class="no-content"></div></body></html>`, nil
	}
	page := p.pages[p.call]
	p.call++
	return page, nil
}

// ScrapePage is unused by these tests and returns the zero response.
func (p *pagedStubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) {
	return browser.ScrapeResponse{}, nil
}

// CDPSession is unused by these tests and reports success without doing work.
func (p *pagedStubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error {
	return nil
}
// ── helpers ───────────────────────────────────────────────────────────────────

// newScraper builds a Scraper whose chapter-content client and chapter-list
// URL client both serve the same fixed HTML document.
func newScraper(html string) *Scraper {
	contentClient := &stubClient{html: html}
	urlClient := &stubClient{html: html}
	return New(contentClient, nil, urlClient)
}
func newPagedScraper(pages ...string) *Scraper {
urlClient := &pagedStubClient{pages: pages}
return New(&stubClient{}, nil, urlClient)
}
// ── ScrapeChapterText ─────────────────────────────────────────────────────────

// TestScrapeChapterText_ExtractsInnerText verifies that the text of every
// paragraph inside the #content container ends up in the scraped chapter and
// that the chapter number from the ref is carried through unchanged.
func TestScrapeChapterText_ExtractsInnerText(t *testing.T) {
	const doc = `<!DOCTYPE html><html><body>
<div id="content">
<p>It was a dark and stormy night.</p>
<p>The hero stepped forward.</p>
</div>
</body></html>`
	scr := newScraper(doc)
	chRef := scraper.ChapterRef{
		Number: 1,
		Title:  "Chapter 1",
		URL:    "https://novelfire.net/book/test-novel/chapter-1",
	}

	got, err := scr.ScrapeChapterText(context.Background(), chRef)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if got.Ref.Number != 1 {
		t.Errorf("expected chapter number 1, got %d", got.Ref.Number)
	}
	if !strings.Contains(got.Text, "dark and stormy") {
		t.Errorf("expected chapter text to contain 'dark and stormy', got: %q", got.Text)
	}
	if !strings.Contains(got.Text, "hero stepped forward") {
		t.Errorf("expected chapter text to contain 'hero stepped forward', got: %q", got.Text)
	}
}
// TestScrapeChapterText_MissingContainer verifies that scraping a page with no
// #content container surfaces an error rather than returning an empty chapter.
func TestScrapeChapterText_MissingContainer(t *testing.T) {
	const doc = `<!DOCTYPE html><html><body><div class="other">nothing here</div></body></html>`
	scr := newScraper(doc)
	chRef := scraper.ChapterRef{
		Number: 2,
		Title:  "Chapter 2",
		URL:    "https://novelfire.net/book/test-novel/chapter-2",
	}
	if _, err := scr.ScrapeChapterText(context.Background(), chRef); err == nil {
		t.Fatal("expected an error when #content container is missing, got nil")
	}
}
// ── chapterNumberFromURL ──────────────────────────────────────────────────────

// TestChapterNumberFromURL covers the supported URL shapes plus the
// unparseable inputs, which must yield 0 as the fallback sentinel.
func TestChapterNumberFromURL(t *testing.T) {
	cases := []struct {
		url  string
		want int
	}{
		// Standard novelfire pattern.
		{url: "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1", want: 1},
		{url: "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-26", want: 26},
		{url: "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-58", want: 58},
		// Large chapter numbers.
		{url: "https://novelfire.net/book/some-novel/chapter-1000", want: 1000},
		// Path segment with trailing slash.
		{url: "https://novelfire.net/book/some-novel/chapter-5/", want: 5},
		// Slug with title appended after the number (hypothetical future format).
		{url: "https://novelfire.net/book/some-novel/chapter-42-the-battle", want: 42},
		// Unparseable — should return 0 so the caller can fall back.
		{url: "https://novelfire.net/book/some-novel/prologue", want: 0},
		{url: "https://novelfire.net/book/some-novel/", want: 0},
		{url: "not-a-url", want: 0},
	}

	for _, c := range cases {
		if got := chapterNumberFromURL(c.url); got != c.want {
			t.Errorf("chapterNumberFromURL(%q) = %d, want %d", c.url, got, c.want)
		}
	}
}
// ── ScrapeChapterList (position vs URL numbering) ─────────────────────────────

// TestScrapeChapterList_NumbersFromURL verifies that when the chapter list HTML
// is served newest-first (as novelfire.net does), chapter numbers are still
// assigned from the URL — not from list position — so that a re-run correctly
// identifies which chapters are already on disk.
func TestScrapeChapterList_NumbersFromURL(t *testing.T) {
	// Simulate a newest-first chapter list with 5 chapters on a single page:
	// list positions 1..5 carry chapters 5,4,3,2,1.
	const listPage = `<!DOCTYPE html><html><body>
<ul class="chapter-list">
<li class="chapter-item"><a href="/book/test/chapter-5">Chapter 5</a></li>
<li class="chapter-item"><a href="/book/test/chapter-4">Chapter 4</a></li>
<li class="chapter-item"><a href="/book/test/chapter-3">Chapter 3</a></li>
<li class="chapter-item"><a href="/book/test/chapter-2">Chapter 2</a></li>
<li class="chapter-item"><a href="/book/test/chapter-1">Chapter 1</a></li>
</ul>
</body></html>`

	scr := newPagedScraper(listPage)
	refs, err := scr.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(refs) != 5 {
		t.Fatalf("expected 5 refs, got %d", len(refs))
	}

	// With position-based numbering (the old bug), refs[0].Number would be 1
	// even though its URL is /chapter-5. With URL-based numbering it must be 5.
	expected := []int{5, 4, 3, 2, 1}
	for i, ref := range refs {
		if ref.Number != expected[i] {
			t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, expected[i], ref.URL)
		}
	}
}
// TestScrapeChapterList_Pagination verifies that the scraper correctly follows
// ?page=N pagination, appending each page's refs in order, and stops when a
// page returns no chapter items.
func TestScrapeChapterList_Pagination(t *testing.T) {
	const firstPage = `<!DOCTYPE html><html><body>
<ul class="chapter-list">
<li class="chapter-item"><a href="/book/test/chapter-3">Chapter 3</a></li>
<li class="chapter-item"><a href="/book/test/chapter-2">Chapter 2</a></li>
<li class="chapter-item"><a href="/book/test/chapter-1">Chapter 1</a></li>
</ul>
</body></html>`
	const secondPage = `<!DOCTYPE html><html><body>
<ul class="chapter-list">
<li class="chapter-item"><a href="/book/test/chapter-6">Chapter 6</a></li>
<li class="chapter-item"><a href="/book/test/chapter-5">Chapter 5</a></li>
<li class="chapter-item"><a href="/book/test/chapter-4">Chapter 4</a></li>
</ul>
</body></html>`

	// No third page is supplied — pagedStubClient then serves an empty page,
	// which is what terminates pagination.
	scr := newPagedScraper(firstPage, secondPage)
	refs, err := scr.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(refs) != 6 {
		t.Fatalf("expected 6 refs (3 per page × 2 pages), got %d", len(refs))
	}

	expected := []int{3, 2, 1, 6, 5, 4}
	for i, ref := range refs {
		if ref.Number != expected[i] {
			t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, expected[i], ref.URL)
		}
	}
}