- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text
- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes) with local-library annotation and one-click scrape buttons
- Add StrategyDirect (plain HTTP client) as a new browser strategy; the default strategy is now 'direct' for chapter fetching and 'content' for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)
- Fix chapter numbering bug: numbers are now derived from the URL path (/chapter-N) rather than list position, correcting newest-first ordering
- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved source_url without knowing the original URL
- Extend NovelScraper interface with RankingProvider (ScrapeRanking)
- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions timeout set to 60 s, content/scrape client defaults raised to 90 s
- Add cover extraction fix (figure.cover > img rather than bare img.cover)
- Add AGENTS.md and .aiignore for AI tooling context
- Add integration tests for browser client and novelfire scraper (build tag: integration) and unit tests for chapterNumberFromURL and pagination
218 lines
8.2 KiB
Go
218 lines
8.2 KiB
Go
package novelfire
|
||
|
||
import (
|
||
"context"
|
||
"strings"
|
||
"testing"
|
||
|
||
"github.com/libnovel/scraper/internal/browser"
|
||
"github.com/libnovel/scraper/internal/scraper"
|
||
)
|
||
|
||
// ── stub browser client ───────────────────────────────────────────────────────
|
||
|
||
// stubClient is a canned-response browser client for tests: every GetContent
// call yields the same HTML string. Its ScrapePage and CDPSession methods are
// inert because the tests in this file never exercise them.
type stubClient struct {
	html string // page body handed back by GetContent
}
|
||
|
||
// Strategy reports browser.StrategyContent for every stubClient.
func (s *stubClient) Strategy() browser.Strategy {
	return browser.StrategyContent
}
|
||
|
||
func (s *stubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) {
|
||
return s.html, nil
|
||
}
|
||
|
||
func (s *stubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) {
|
||
return browser.ScrapeResponse{}, nil
|
||
}
|
||
|
||
func (s *stubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error {
|
||
return nil
|
||
}
|
||
|
||
// pagedStubClient serves a scripted sequence of HTML pages, one per
// GetContent call. After the sequence runs out it serves a page without any
// chapter-list markup, which mimics the paginated chapter-list endpoint
// reaching its end.
type pagedStubClient struct {
	pages []string // responses, in the order they are served
	call  int      // how many scripted pages have been served so far
}
|
||
|
||
// Strategy reports browser.StrategyContent for every pagedStubClient.
func (c *pagedStubClient) Strategy() browser.Strategy {
	return browser.StrategyContent
}
|
||
|
||
func (c *pagedStubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) {
|
||
if c.call < len(c.pages) {
|
||
html := c.pages[c.call]
|
||
c.call++
|
||
return html, nil
|
||
}
|
||
// Past the last page — return a page with no chapter-list to stop pagination.
|
||
return `<!DOCTYPE html><html><body><div class="no-content"></div></body></html>`, nil
|
||
}
|
||
|
||
func (c *pagedStubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) {
|
||
return browser.ScrapeResponse{}, nil
|
||
}
|
||
|
||
func (c *pagedStubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error {
|
||
return nil
|
||
}
|
||
|
||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||
|
||
func newScraper(html string) *Scraper {
|
||
return New(&stubClient{html: html}, nil, &stubClient{html: html})
|
||
}
|
||
|
||
func newPagedScraper(pages ...string) *Scraper {
|
||
urlClient := &pagedStubClient{pages: pages}
|
||
return New(&stubClient{}, nil, urlClient)
|
||
}
|
||
|
||
// ── ScrapeChapterText ─────────────────────────────────────────────────────────
|
||
|
||
func TestScrapeChapterText_ExtractsInnerText(t *testing.T) {
|
||
html := `<!DOCTYPE html><html><body>
|
||
<div id="content">
|
||
<p>It was a dark and stormy night.</p>
|
||
<p>The hero stepped forward.</p>
|
||
</div>
|
||
</body></html>`
|
||
|
||
s := newScraper(html)
|
||
ref := scraper.ChapterRef{Number: 1, Title: "Chapter 1", URL: "https://novelfire.net/book/test-novel/chapter-1"}
|
||
|
||
ch, err := s.ScrapeChapterText(context.Background(), ref)
|
||
if err != nil {
|
||
t.Fatalf("unexpected error: %v", err)
|
||
}
|
||
if ch.Ref.Number != 1 {
|
||
t.Errorf("expected chapter number 1, got %d", ch.Ref.Number)
|
||
}
|
||
if !strings.Contains(ch.Text, "dark and stormy") {
|
||
t.Errorf("expected chapter text to contain 'dark and stormy', got: %q", ch.Text)
|
||
}
|
||
if !strings.Contains(ch.Text, "hero stepped forward") {
|
||
t.Errorf("expected chapter text to contain 'hero stepped forward', got: %q", ch.Text)
|
||
}
|
||
}
|
||
|
||
func TestScrapeChapterText_MissingContainer(t *testing.T) {
|
||
html := `<!DOCTYPE html><html><body><div class="other">nothing here</div></body></html>`
|
||
|
||
s := newScraper(html)
|
||
ref := scraper.ChapterRef{Number: 2, Title: "Chapter 2", URL: "https://novelfire.net/book/test-novel/chapter-2"}
|
||
|
||
_, err := s.ScrapeChapterText(context.Background(), ref)
|
||
if err == nil {
|
||
t.Fatal("expected an error when #content container is missing, got nil")
|
||
}
|
||
}
|
||
|
||
// ── chapterNumberFromURL ──────────────────────────────────────────────────────
|
||
|
||
func TestChapterNumberFromURL(t *testing.T) {
|
||
cases := []struct {
|
||
url string
|
||
want int
|
||
}{
|
||
// Standard novelfire pattern.
|
||
{"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1", 1},
|
||
{"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-26", 26},
|
||
{"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-58", 58},
|
||
// Large chapter numbers.
|
||
{"https://novelfire.net/book/some-novel/chapter-1000", 1000},
|
||
// Path segment with trailing slash.
|
||
{"https://novelfire.net/book/some-novel/chapter-5/", 5},
|
||
// Slug with title appended after the number (hypothetical future format).
|
||
{"https://novelfire.net/book/some-novel/chapter-42-the-battle", 42},
|
||
// Unparseable — should return 0 so the caller can fall back.
|
||
{"https://novelfire.net/book/some-novel/prologue", 0},
|
||
{"https://novelfire.net/book/some-novel/", 0},
|
||
{"not-a-url", 0},
|
||
}
|
||
|
||
for _, tc := range cases {
|
||
got := chapterNumberFromURL(tc.url)
|
||
if got != tc.want {
|
||
t.Errorf("chapterNumberFromURL(%q) = %d, want %d", tc.url, got, tc.want)
|
||
}
|
||
}
|
||
}
|
||
|
||
// ── ScrapeChapterList (position vs URL numbering) ─────────────────────────────
|
||
|
||
// TestScrapeChapterList_NumbersFromURL verifies that when the chapter list HTML
|
||
// is served newest-first (as novelfire.net does), chapter numbers are still
|
||
// assigned from the URL — not from list position — so that a re-run correctly
|
||
// identifies which chapters are already on disk.
|
||
func TestScrapeChapterList_NumbersFromURL(t *testing.T) {
|
||
// Simulate a newest-first chapter list with 5 chapters on a single page.
|
||
// Positions 1..5 correspond to chapters 5,4,3,2,1 in the site HTML.
|
||
page1 := `<!DOCTYPE html><html><body>
|
||
<ul class="chapter-list">
|
||
<li class="chapter-item"><a href="/book/test/chapter-5">Chapter 5</a></li>
|
||
<li class="chapter-item"><a href="/book/test/chapter-4">Chapter 4</a></li>
|
||
<li class="chapter-item"><a href="/book/test/chapter-3">Chapter 3</a></li>
|
||
<li class="chapter-item"><a href="/book/test/chapter-2">Chapter 2</a></li>
|
||
<li class="chapter-item"><a href="/book/test/chapter-1">Chapter 1</a></li>
|
||
</ul>
|
||
</body></html>`
|
||
|
||
s := newPagedScraper(page1)
|
||
refs, err := s.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test")
|
||
if err != nil {
|
||
t.Fatalf("unexpected error: %v", err)
|
||
}
|
||
if len(refs) != 5 {
|
||
t.Fatalf("expected 5 refs, got %d", len(refs))
|
||
}
|
||
|
||
// With position-based numbering (the old bug), refs[0].Number would be 1
|
||
// even though its URL is /chapter-5. With URL-based numbering it must be 5.
|
||
wantNumbers := []int{5, 4, 3, 2, 1}
|
||
for i, ref := range refs {
|
||
if ref.Number != wantNumbers[i] {
|
||
t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, wantNumbers[i], ref.URL)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestScrapeChapterList_Pagination verifies that the scraper correctly follows
|
||
// ?page=N pagination and stops when a page returns no chapter items.
|
||
func TestScrapeChapterList_Pagination(t *testing.T) {
|
||
page1 := `<!DOCTYPE html><html><body>
|
||
<ul class="chapter-list">
|
||
<li class="chapter-item"><a href="/book/test/chapter-3">Chapter 3</a></li>
|
||
<li class="chapter-item"><a href="/book/test/chapter-2">Chapter 2</a></li>
|
||
<li class="chapter-item"><a href="/book/test/chapter-1">Chapter 1</a></li>
|
||
</ul>
|
||
</body></html>`
|
||
|
||
page2 := `<!DOCTYPE html><html><body>
|
||
<ul class="chapter-list">
|
||
<li class="chapter-item"><a href="/book/test/chapter-6">Chapter 6</a></li>
|
||
<li class="chapter-item"><a href="/book/test/chapter-5">Chapter 5</a></li>
|
||
<li class="chapter-item"><a href="/book/test/chapter-4">Chapter 4</a></li>
|
||
</ul>
|
||
</body></html>`
|
||
|
||
// page3 is omitted — pagedStubClient will return empty page to stop pagination.
|
||
s := newPagedScraper(page1, page2)
|
||
refs, err := s.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test")
|
||
if err != nil {
|
||
t.Fatalf("unexpected error: %v", err)
|
||
}
|
||
if len(refs) != 6 {
|
||
t.Fatalf("expected 6 refs (3 per page × 2 pages), got %d", len(refs))
|
||
}
|
||
|
||
wantNumbers := []int{3, 2, 1, 6, 5, 4}
|
||
for i, ref := range refs {
|
||
if ref.Number != wantNumbers[i] {
|
||
t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, wantNumbers[i], ref.URL)
|
||
}
|
||
}
|
||
}
|