Files
libnovel/scraper/internal/novelfire/scraper.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Fix cover extraction (select figure.cover > img rather than a bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

666 lines
20 KiB
Go

// Package novelfire provides a NovelScraper implementation for novelfire.net.
//
// Site structure (as of 2025):
//
// Catalogue : https://novelfire.net/genre-all/sort-new/status-all/all-novel?page=N
// Book page : https://novelfire.net/book/{slug}
// Chapters  : https://novelfire.net/book/{slug}/chapters?page=N
// Chapter   : https://novelfire.net/book/{slug}/{chapter-slug}
package novelfire

import (
	"context"
	"fmt"
	"log/slog"
	"net/url"
	"path"
	"strconv"
	"strings"
	"time"

	"github.com/libnovel/scraper/internal/browser"
	"github.com/libnovel/scraper/internal/scraper"
	"github.com/libnovel/scraper/internal/scraper/htmlutil"
)

const (
	baseURL       = "https://novelfire.net"
	cataloguePath = "/genre-all/sort-new/status-all/all-novel"
	rankingPath   = "/ranking"
)

// rejectResourceTypes lists Browserless resource types to block on every request.
// We keep: document (the page), script (JS renders the DOM), fetch/xhr (JS data calls).
// Everything else is safe to drop for HTML-only scraping.
var rejectResourceTypes = []string{
	"cspviolationreport",
	"eventsource",
	"fedcm",
	"font",
	"image",
	"manifest",
	"media",
	"other",
	"ping",
	"signedexchange",
	"stylesheet",
	"texttrack",
	"websocket",
}

// Scraper is the novelfire.net implementation of scraper.NovelScraper.
// It uses the /content strategy by default (rendered HTML via Browserless).
type Scraper struct {
	client    browser.BrowserClient
	urlClient browser.BrowserClient // separate client for URL retrieval (uses browserless content strategy)
	log       *slog.Logger
}

// New returns a new novelfire Scraper.
// client is used for content fetching, urlClient is used for URL retrieval (chapter list).
// If urlClient is nil, client will be used for both.
func New(client browser.BrowserClient, log *slog.Logger, urlClient browser.BrowserClient) *Scraper {
	if log == nil {
		log = slog.Default()
	}
	if urlClient == nil {
		urlClient = client
	}
	return &Scraper{client: client, urlClient: urlClient, log: log}
}
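
// Example wiring (illustrative sketch; NewClient and StrategyContent are
// assumptions, not part of this file — only New's signature above and
// browser.StrategyDirect appear in this code):
//
//	direct := browser.NewClient(browser.StrategyDirect)   // hypothetical constructor
//	content := browser.NewClient(browser.StrategyContent) // hypothetical constant + constructor
//	s := novelfire.New(direct, slog.Default(), content)   // direct fetches chapters, content fetches chapter-list pages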

// SourceName implements NovelScraper.
func (s *Scraper) SourceName() string { return "novelfire.net" }

// ─── CatalogueProvider ───────────────────────────────────────────────────────

// ScrapeCatalogue streams all CatalogueEntry values across all pages.
func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.CatalogueEntry, <-chan error) {
	entries := make(chan scraper.CatalogueEntry, 64)
	errs := make(chan error, 16)

	go func() {
		defer close(entries)
		defer close(errs)

		pageURL := baseURL + cataloguePath
		page := 1
		for pageURL != "" {
			select {
			case <-ctx.Done():
				return
			default:
			}

			s.log.Info("scraping catalogue page", "page", page, "url", pageURL)
			s.log.Debug("catalogue page fetch starting",
				"page", page,
				"payload_url", pageURL,
				"payload_wait_selector", ".novel-item",
				"payload_wait_selector_timeout_ms", 5000,
			)
			html, err := s.client.GetContent(ctx, browser.ContentRequest{
				URL:                 pageURL,
				WaitFor:             &browser.WaitForSelector{Selector: ".novel-item", Timeout: 5000},
				RejectResourceTypes: rejectResourceTypes,
				GotoOptions:         &browser.GotoOptions{Timeout: 60000},
			})
			if err != nil {
				s.log.Debug("catalogue page fetch failed",
					"page", page,
					"url", pageURL,
					"err", err,
				)
				errs <- fmt.Errorf("catalogue page %d: %w", page, err)
				return
			}
			s.log.Debug("catalogue page fetch completed",
				"page", page,
				"url", pageURL,
				"response_bytes", len(html),
			)

			root, err := htmlutil.ParseHTML(html)
			if err != nil {
				errs <- fmt.Errorf("catalogue page %d parse: %w", page, err)
				return
			}

			// Extract novel cards: <div class="novel-item">
			cards := htmlutil.FindAll(root, scraper.Selector{Tag: "div", Class: "novel-item", Multiple: true})
			if len(cards) == 0 {
				s.log.Warn("no novel cards found, stopping pagination", "page", page)
				return
			}

			for _, card := range cards {
				// Title: <h3 class="novel-title"><a href="/book/slug">Title</a>
				titleNode := htmlutil.FindFirst(card, scraper.Selector{Tag: "h3", Class: "novel-title"})
				var title, href string
				if titleNode != nil {
					linkNode := htmlutil.FindFirst(titleNode, scraper.Selector{Tag: "a", Attr: "href"})
					if linkNode != nil {
						title = htmlutil.ExtractText(linkNode, scraper.Selector{})
						href = htmlutil.ExtractText(linkNode, scraper.Selector{Tag: "a", Attr: "href"})
					}
				}
				if href == "" || title == "" {
					continue
				}
				bookURL := resolveURL(baseURL, href)
				select {
				case <-ctx.Done():
					return
				case entries <- scraper.CatalogueEntry{Title: title, URL: bookURL}:
				}
			}

			// Find next page link: <a class="next" href="...">
			nextHref := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "a", Class: "next", Attr: "href"})
			if nextHref == "" {
				break
			}
			pageURL = resolveURL(baseURL, nextHref)
			page++
		}
	}()
	return entries, errs
}
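
// Consuming the stream (illustrative sketch; both channels are closed by the
// worker goroutine, so draining entries first and then reading errs is safe):
//
//	entries, errs := s.ScrapeCatalogue(ctx)
//	for e := range entries {
//		fmt.Println(e.Title, e.URL)
//	}
//	if err := <-errs; err != nil { // nil when the scrape finished cleanly
//		return err
//	}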

// ─── MetadataProvider ────────────────────────────────────────────────────────

func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.BookMeta, error) {
s.log.Debug("metadata fetch starting",
"payload_url", bookURL,
"payload_wait_selector", ".novel-title",
"payload_wait_selector_timeout_ms", 5000,
)
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
URL: bookURL,
WaitFor: &browser.WaitForSelector{Selector: ".novel-title", Timeout: 5000},
RejectResourceTypes: rejectResourceTypes,
GotoOptions: &browser.GotoOptions{Timeout: 60000},
})
if err != nil {
s.log.Debug("metadata fetch failed", "url", bookURL, "err", err)
return scraper.BookMeta{}, fmt.Errorf("metadata fetch %s: %w", bookURL, err)
}
s.log.Debug("metadata fetch completed", "url", bookURL, "response_bytes", len(raw))
root, err := htmlutil.ParseHTML(raw)
if err != nil {
return scraper.BookMeta{}, fmt.Errorf("metadata parse %s: %w", bookURL, err)
}
// <h1 class="novel-title">Title</h1>
title := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "h1", Class: "novel-title"})
// <span class="author"><a>Author Name</a></span>
author := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "author"})
// <figure class="cover"><img src="..."></figure>
var cover string
if figureCover := htmlutil.FindFirst(root, scraper.Selector{Tag: "figure", Class: "cover"}); figureCover != nil {
cover = htmlutil.ExtractFirst(figureCover, scraper.Selector{Tag: "img", Attr: "src"})
}
// <span class="status">Ongoing</span>
status := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "status"})
// Genres: all <a> tags inside <div class="genres">
genresNode := htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "genres"})
var genres []string
if genresNode != nil {
genres = htmlutil.ExtractAll(genresNode, scraper.Selector{Tag: "a", Multiple: true})
}
// <div class="summary"><p>...</p></div>
summary := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "div", Class: "summary"})
// <span class="chapter-count">123 Chapters</span>
totalStr := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "chapter-count"})
totalChapters := parseChapterCount(totalStr)
slug := slugFromURL(bookURL)
meta := scraper.BookMeta{
Slug: slug,
Title: title,
Author: author,
Cover: cover,
Status: status,
Genres: genres,
Summary: summary,
TotalChapters: totalChapters,
SourceURL: bookURL,
}
s.log.Debug("metadata parsed",
"url", bookURL,
"slug", meta.Slug,
"title", meta.Title,
"author", meta.Author,
"status", meta.Status,
"genres", meta.Genres,
"total_chapters", meta.TotalChapters,
)
return meta, nil
}

// ─── ChapterListProvider ─────────────────────────────────────────────────────

func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scraper.ChapterRef, error) {
	var refs []scraper.ChapterRef

	// Chapter list URL: {bookURL}/chapters?page=N
	baseChapterURL := strings.TrimRight(bookURL, "/") + "/chapters"
	page := 1
	for {
		select {
		case <-ctx.Done():
			return refs, ctx.Err()
		default:
		}

		pageURL := fmt.Sprintf("%s?page=%d", baseChapterURL, page)
		s.log.Info("scraping chapter list", "page", page, "url", pageURL)
		s.log.Debug("chapter list fetch starting",
			"page", page,
			"payload_url", pageURL,
			"payload_wait_selector", ".chapter-list",
			"payload_wait_selector_timeout_ms", 15000,
			"payload_wait_timeout_ms", 2000,
			"strategy", s.urlClient.Strategy(),
		)
		raw, err := s.urlClient.GetContent(ctx, browser.ContentRequest{
			URL: pageURL,
			// Wait up to 15 s for the chapter list container to appear in the DOM.
			WaitFor: &browser.WaitForSelector{Selector: ".chapter-list", Timeout: 15000},
			// After the selector is found, wait an additional 2 s for any
			// deferred JS rendering (lazy-loaded links, infinite-scroll hydration).
			WaitForTimeout:      2000,
			RejectResourceTypes: rejectResourceTypes,
			GotoOptions:         &browser.GotoOptions{Timeout: 60000},
			// Do NOT use BestAttempt — we want a complete page or a clear error,
			// not silently partial HTML that looks like "no more chapters".
			BestAttempt: false,
		})
		if err != nil {
			s.log.Debug("chapter list fetch failed",
				"page", page,
				"url", pageURL,
				"err", err,
			)
			return refs, fmt.Errorf("chapter list page %d: %w", page, err)
		}
		s.log.Debug("chapter list fetch completed",
			"page", page,
			"url", pageURL,
			"response_bytes", len(raw),
		)

		root, err := htmlutil.ParseHTML(raw)
		if err != nil {
			return refs, fmt.Errorf("chapter list page %d parse: %w", page, err)
		}

		chapterList := htmlutil.FindFirst(root, scraper.Selector{Class: "chapter-list"})
		if chapterList == nil {
			// No chapter list container on this page — we've gone past the last page.
			s.log.Debug("chapter list container not found, stopping pagination", "page", page)
			break
		}

		// Each chapter row: <li class="chapter-item"><a href="...">Title</a></li>
		items := htmlutil.FindAll(chapterList, scraper.Selector{Tag: "li"})
		s.log.Debug("chapter list page parsed",
			"page", page,
			"url", pageURL,
			"chapters_on_page", len(items),
			"total_refs_so_far", len(refs),
		)
		// Zero items on this page means we've gone past the last page.
		if len(items) == 0 {
			s.log.Debug("no chapters on page, stopping pagination", "page", page)
			break
		}

		for _, item := range items {
			linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a"})
			if linkNode == nil {
				continue
			}
			href := htmlutil.ExtractText(linkNode, scraper.Selector{Attr: "href"})
			chTitle := htmlutil.ExtractText(linkNode, scraper.Selector{})
			if href == "" {
				continue
			}
			chURL := resolveURL(baseURL, href)
			num := chapterNumberFromURL(chURL)
			if num <= 0 {
				// Fall back to position if the URL has no parseable number.
				num = len(refs) + 1
				s.log.Warn("chapter number not parseable from URL, falling back to position",
					"url", chURL,
					"position", num,
				)
			}
			refs = append(refs, scraper.ChapterRef{
				Number: num,
				Title:  strings.TrimSpace(chTitle),
				URL:    chURL,
			})
		}
		page++
	}
	return refs, nil
}

// ─── RankingProvider ─────────────────────────────────────────────────────────

func (s *Scraper) ScrapeRanking(ctx context.Context) (<-chan scraper.BookMeta, <-chan error) {
	entries := make(chan scraper.BookMeta, 64)
	errs := make(chan error, 16)

	go func() {
		defer close(entries)
		defer close(errs)

		pageURL := baseURL + rankingPath
		rank := 1
		for pageURL != "" {
			select {
			case <-ctx.Done():
				return
			default:
			}

			s.log.Info("scraping ranking page", "url", pageURL)

			// Use WaitFor only for browser-based strategies
			var raw string
			var err error
			if s.client.Strategy() == browser.StrategyDirect {
				raw, err = s.client.GetContent(ctx, browser.ContentRequest{
					URL:                 pageURL,
					RejectResourceTypes: rejectResourceTypes,
				})
			} else {
				raw, err = s.client.GetContent(ctx, browser.ContentRequest{
					URL:                 pageURL,
					WaitFor:             &browser.WaitForSelector{Selector: ".rank-novels", Timeout: 30000},
					RejectResourceTypes: rejectResourceTypes,
					GotoOptions:         &browser.GotoOptions{Timeout: 60000},
					BestAttempt:         true,
				})
			}
			if err != nil {
				s.log.Debug("ranking page fetch failed", "url", pageURL, "err", err)
				errs <- fmt.Errorf("ranking page: %w", err)
				return
			}

			root, err := htmlutil.ParseHTML(raw)
			if err != nil {
				errs <- fmt.Errorf("ranking page parse: %w", err)
				return
			}

			rankList := htmlutil.FindFirst(root, scraper.Selector{Class: "rank-novels"})
			if rankList == nil {
				break
			}
			items := htmlutil.FindAll(rankList, scraper.Selector{Tag: "li", Class: "novel-item"})
			for _, item := range items {
				// Cover: <figure class="cover"><a href="/book/slug"><img data-src="..."></a></figure>
				var cover string
				if fig := htmlutil.FindFirst(item, scraper.Selector{Tag: "figure", Class: "cover"}); fig != nil {
					cover = htmlutil.ExtractFirst(fig, scraper.Selector{Tag: "img", Attr: "data-src"})
					if cover != "" {
						// data-src may be relative or absolute; resolve it rather than
						// blindly prefixing baseURL (which would mangle absolute URLs).
						cover = resolveURL(baseURL, cover)
					}
				}
				// Title and URL: <h2 class="title"><a href="/book/slug">Title</a></h2>
				titleNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "h2", Class: "title"})
				var title, bookURL string
				if titleNode != nil {
					linkNode := htmlutil.FindFirst(titleNode, scraper.Selector{Tag: "a"})
					if linkNode != nil {
						title = htmlutil.ExtractText(linkNode, scraper.Selector{})
						href := htmlutil.ExtractText(linkNode, scraper.Selector{Attr: "href"})
						bookURL = resolveURL(baseURL, href)
					}
				}

				// Status: <span class="status"> Ongoing/Completed </span>
				status := htmlutil.ExtractFirst(item, scraper.Selector{Tag: "span", Class: "status"})

				// Genres: <div class="categories"><div class="scroll"><span>Genre1</span><span>Genre2</span>...</div></div>
				var genres []string
				categoriesNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "div", Class: "categories"})
				if categoriesNode != nil {
					genres = htmlutil.ExtractAll(categoriesNode, scraper.Selector{Tag: "span", Multiple: true})
				}

				slug := slugFromURL(bookURL)
				meta := scraper.BookMeta{
					Slug:      slug,
					Title:     title,
					Cover:     cover,
					Status:    strings.TrimSpace(status),
					Genres:    genres,
					SourceURL: bookURL,
					Ranking:   rank,
				}
				rank++
				select {
				case <-ctx.Done():
					return
				case entries <- meta:
				}
			}

			// Ranking pages use different pagination; only the first page is
			// scraped for now.
			break
		}
	}()
	return entries, errs
}

// ─── ChapterTextProvider ─────────────────────────────────────────────────────

// retryGetContent calls client.GetContent up to maxAttempts times, backing off
// exponentially between retries (the delay doubles after each failed attempt).
// Context cancellation aborts the loop immediately; any other error is retried
// until the attempt budget is exhausted, then the last error is returned.
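//
// Worked example: with the values ScrapeChapterText passes below
// (maxAttempts = 9, baseDelay = 6s), the sleeps between attempts are
// 6s, 12s, 24s, 48s, 96s, 192s and 384s, then 768s — at most
// 6s × (2⁸−1) = 1530s (25.5 minutes) of backoff before giving up.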
func retryGetContent(
	ctx context.Context,
	log *slog.Logger,
	client browser.BrowserClient,
	req browser.ContentRequest,
	maxAttempts int,
	baseDelay time.Duration,
) (string, error) {
	var lastErr error
	delay := baseDelay
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		html, err := client.GetContent(ctx, req)
		if err == nil {
			return html, nil
		}
		lastErr = err

		// Stop immediately on context cancellation.
		if ctx.Err() != nil {
			return "", err
		}
		if attempt < maxAttempts {
			log.Warn("chapter fetch failed, retrying",
				"url", req.URL,
				"attempt", attempt,
				"max_attempts", maxAttempts,
				"retry_in", delay,
				"err", err,
			)
			select {
			case <-ctx.Done():
				return "", ctx.Err()
			case <-time.After(delay):
			}
			delay *= 2
		}
	}
	return "", lastErr
}

func (s *Scraper) ScrapeChapterText(ctx context.Context, ref scraper.ChapterRef) (scraper.Chapter, error) {
s.log.Debug("chapter text fetch starting",
"chapter", ref.Number,
"title", ref.Title,
"payload_url", ref.URL,
"payload_wait_selector", "#content",
"payload_wait_selector_timeout_ms", 5000,
)
raw, err := retryGetContent(ctx, s.log, s.client, browser.ContentRequest{
URL: ref.URL,
WaitFor: &browser.WaitForSelector{Selector: "#content", Timeout: 5000},
RejectResourceTypes: rejectResourceTypes,
GotoOptions: &browser.GotoOptions{Timeout: 60000},
BestAttempt: true,
}, 9, 6*time.Second)
if err != nil {
s.log.Debug("chapter text fetch failed",
"chapter", ref.Number,
"url", ref.URL,
"err", err,
)
return scraper.Chapter{}, fmt.Errorf("chapter %d fetch: %w", ref.Number, err)
}
if len(raw) > 0 {
preview := raw
if len(preview) > 500 {
preview = preview[:500]
}
s.log.Debug("chapter text fetch partial content",
"chapter", ref.Number,
"url", ref.URL,
"response_bytes", len(raw),
"preview", preview,
)
}
s.log.Debug("chapter text fetch completed",
"chapter", ref.Number,
"url", ref.URL,
"response_bytes", len(raw),
)
root, err := htmlutil.ParseHTML(raw)
if err != nil {
return scraper.Chapter{}, fmt.Errorf("chapter %d parse: %w", ref.Number, err)
}
// <div id="content">…</div>
container := htmlutil.FindFirst(root, scraper.Selector{ID: "content"})
if container == nil {
return scraper.Chapter{}, fmt.Errorf("chapter %d: #content container not found in %s", ref.Number, ref.URL)
}
text := htmlutil.NodeToMarkdown(container)
s.log.Debug("chapter text parsed",
"chapter", ref.Number,
"url", ref.URL,
"text_bytes", len(text),
)
return scraper.Chapter{
Ref: ref,
Text: text,
}, nil
}
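
// Typical pipeline (illustrative sketch using only the methods defined above;
// store is a hypothetical callback, not part of this package):
//
//	refs, err := s.ScrapeChapterList(ctx, bookURL)
//	if err != nil {
//		return err
//	}
//	for _, ref := range refs {
//		ch, err := s.ScrapeChapterText(ctx, ref)
//		if err != nil {
//			return err
//		}
//		store(ch)
//	}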

// ─── helpers ─────────────────────────────────────────────────────────────────

// resolveURL resolves href against base. Absolute URLs are returned unchanged;
// if either part fails to parse, it falls back to naive concatenation.
func resolveURL(base, href string) string {
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}
	b, err := url.Parse(base)
	if err != nil {
		return base + href
	}
	ref, err := url.Parse(href)
	if err != nil {
		return base + href
	}
	return b.ResolveReference(ref).String()
}

// slugFromURL extracts the book slug from a novelfire URL, e.g.
// https://novelfire.net/book/my-novel → "my-novel". When the path does not
// start with /book/, it falls back to the last path segment.
func slugFromURL(bookURL string) string {
	u, err := url.Parse(bookURL)
	if err != nil {
		return bookURL
	}
	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
	if len(parts) >= 2 && parts[0] == "book" {
		return parts[1]
	}
	if len(parts) > 0 {
		return parts[len(parts)-1]
	}
	return ""
}

// parseChapterCount parses the leading integer from a chapter-count label,
// returning 0 when no number is present.
func parseChapterCount(s string) int {
	// Formats: "123 Chapters", "1,234 Chapters", "123"
	s = strings.ReplaceAll(s, ",", "")
	fields := strings.Fields(s)
	if len(fields) == 0 {
		return 0
	}
	n, _ := strconv.Atoi(fields[0])
	return n
}

// chapterNumberFromURL extracts the chapter number from a novelfire chapter URL.
//
// URL pattern: https://novelfire.net/book/{book-slug}/chapter-{N}
// The last path segment is expected to be "chapter-{N}" or "{N}".
// Returns 0 if no number can be parsed.
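//
// Examples (derived from the parsing rules below):
//
//	/book/my-novel/chapter-42    → 42
//	/book/my-novel/42-epilogue   → 42
//	/book/my-novel/afterword     → 0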
func chapterNumberFromURL(chapterURL string) int {
	u, err := url.Parse(chapterURL)
	if err != nil {
		return 0
	}
	seg := path.Base(u.Path) // e.g. "chapter-42" or "42"

	// Strip a "chapter-" prefix if present.
	seg = strings.TrimPrefix(seg, "chapter-")
	// Also handle "chap-", "ch-" variants used by some sites.
	seg = strings.TrimPrefix(seg, "chap-")
	seg = strings.TrimPrefix(seg, "ch-")

	// Take only the leading digits (handles slugs like "42-title-text").
	digits := strings.FieldsFunc(seg, func(r rune) bool {
		return r < '0' || r > '9'
	})
	if len(digits) == 0 {
		return 0
	}
	n, _ := strconv.Atoi(digits[0])
	return n
}