- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text
- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes) with local-library annotation and one-click scrape buttons
- Add StrategyDirect (plain HTTP client) as a new browser strategy; the default strategy is now 'direct' for chapter fetching and 'content' for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)
- Fix chapter numbering bug: numbers are now derived from the URL path (/chapter-N) rather than list position, correcting newest-first ordering (see the test sketch below)
- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved source_url without needing the original URL
- Extend the NovelScraper interface with RankingProvider (ScrapeRanking)
- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions timeout set to 60 s, content/scrape client defaults raised to 90 s
- Fix cover extraction (figure.cover > img rather than bare img.cover)
- Add AGENTS.md and .aiignore for AI tooling context
- Add integration tests for browser client and novelfire scraper (build tag: integration) and unit tests for chapterNumberFromURL and pagination
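The numbering fix hinges on parsing the trailing /chapter-N path segment (see chapterNumberFromURL below). A minimal table-driven sketch of the contract the new unit test pins down; the cases and layout here are illustrative, not the shipped test file:

```go
package novelfire

import "testing"

func TestChapterNumberFromURL(t *testing.T) {
	cases := map[string]int{
		"https://novelfire.net/book/some-novel/chapter-42":    42, // canonical pattern
		"https://novelfire.net/book/some-novel/42-some-title": 42, // first digit run wins
		"https://novelfire.net/book/some-novel/prologue":      0,  // no number: callers fall back to list position
	}
	for in, want := range cases {
		if got := chapterNumberFromURL(in); got != want {
			t.Errorf("chapterNumberFromURL(%q) = %d, want %d", in, got, want)
		}
	}
}
```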
666 lines · 20 KiB · Go
// Package novelfire provides a NovelScraper implementation for novelfire.net.
//
// Site structure (as of 2025):
//
//	Catalogue : https://novelfire.net/genre-all/sort-new/status-all/all-novel?page=N
//	Book page : https://novelfire.net/book/{slug}
//	Chapters  : https://novelfire.net/book/{slug}/chapters?page=N
//	Chapter   : https://novelfire.net/book/{slug}/{chapter-slug}
package novelfire

import (
	"context"
	"fmt"
	"log/slog"
	"net/url"
	"path"
	"strconv"
	"strings"
	"time"

	"github.com/libnovel/scraper/internal/browser"
	"github.com/libnovel/scraper/internal/scraper"
	"github.com/libnovel/scraper/internal/scraper/htmlutil"
)

const (
	baseURL       = "https://novelfire.net"
	cataloguePath = "/genre-all/sort-new/status-all/all-novel"
	rankingPath   = "/ranking"
)

// rejectResourceTypes lists Browserless resource types to block on every request.
// We keep: document (the page), script (JS renders the DOM), fetch/xhr (JS data calls).
// Everything else is safe to drop for HTML-only scraping.
var rejectResourceTypes = []string{
	"cspviolationreport",
	"eventsource",
	"fedcm",
	"font",
	"image",
	"manifest",
	"media",
	"other",
	"ping",
	"signedexchange",
	"stylesheet",
	"texttrack",
	"websocket",
}

// Scraper is the novelfire.net implementation of scraper.NovelScraper.
// It uses the /content strategy by default (rendered HTML via Browserless).
type Scraper struct {
	client    browser.BrowserClient
	urlClient browser.BrowserClient // separate client for URL retrieval (uses browserless content strategy)
	log       *slog.Logger
}

// New returns a new novelfire Scraper.
// client is used for content fetching, urlClient is used for URL retrieval (chapter list).
// If urlClient is nil, client will be used for both.
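// A typical construction (client names illustrative): New(directClient, logger,
// contentClient), pairing a direct-HTTP client for chapter fetching with a
// Browserless content client for chapter-list URL retrieval.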
func New(client browser.BrowserClient, log *slog.Logger, urlClient browser.BrowserClient) *Scraper {
	if log == nil {
		log = slog.Default()
	}
	if urlClient == nil {
		urlClient = client
	}
	return &Scraper{client: client, urlClient: urlClient, log: log}
}

// SourceName implements NovelScraper.
func (s *Scraper) SourceName() string { return "novelfire.net" }

// ─── CatalogueProvider ───────────────────────────────────────────────────────

// ScrapeCatalogue streams all CatalogueEntry values across all pages.
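// Callers typically drain both channels; a sketch (handle is illustrative):
//
//	entries, errs := s.ScrapeCatalogue(ctx)
//	for e := range entries {
//		handle(e)
//	}
//	if err := <-errs; err != nil {
//		// pagination stopped early; entries received so far remain valid
//	}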
func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.CatalogueEntry, <-chan error) {
	entries := make(chan scraper.CatalogueEntry, 64)
	errs := make(chan error, 16)

	go func() {
		defer close(entries)
		defer close(errs)

		pageURL := baseURL + cataloguePath
		page := 1

		for pageURL != "" {
			select {
			case <-ctx.Done():
				return
			default:
			}

			s.log.Info("scraping catalogue page", "page", page, "url", pageURL)
			s.log.Debug("catalogue page fetch starting",
				"page", page,
				"payload_url", pageURL,
				"payload_wait_selector", ".novel-item",
				"payload_wait_selector_timeout_ms", 5000,
			)

			html, err := s.client.GetContent(ctx, browser.ContentRequest{
				URL:                 pageURL,
				WaitFor:             &browser.WaitForSelector{Selector: ".novel-item", Timeout: 5000},
				RejectResourceTypes: rejectResourceTypes,
				GotoOptions:         &browser.GotoOptions{Timeout: 60000},
			})
			if err != nil {
				s.log.Debug("catalogue page fetch failed",
					"page", page,
					"url", pageURL,
					"err", err,
				)
				errs <- fmt.Errorf("catalogue page %d: %w", page, err)
				return
			}
			s.log.Debug("catalogue page fetch completed",
				"page", page,
				"url", pageURL,
				"response_bytes", len(html),
			)

			root, err := htmlutil.ParseHTML(html)
			if err != nil {
				errs <- fmt.Errorf("catalogue page %d parse: %w", page, err)
				return
			}

			// Extract novel cards: <div class="novel-item">
			cards := htmlutil.FindAll(root, scraper.Selector{Tag: "div", Class: "novel-item", Multiple: true})
			if len(cards) == 0 {
				s.log.Warn("no novel cards found, stopping pagination", "page", page)
				return
			}

			for _, card := range cards {
				// Title: <h3 class="novel-title"><a href="/book/slug">Title</a>
				titleNode := htmlutil.FindFirst(card, scraper.Selector{Tag: "h3", Class: "novel-title"})

				var title, href string
				if titleNode != nil {
					linkNode := htmlutil.FindFirst(titleNode, scraper.Selector{Tag: "a", Attr: "href"})
					if linkNode != nil {
						title = htmlutil.ExtractText(linkNode, scraper.Selector{})
						href = htmlutil.ExtractText(linkNode, scraper.Selector{Tag: "a", Attr: "href"})
					}
				}
				if href == "" || title == "" {
					continue
				}

				bookURL := resolveURL(baseURL, href)
				select {
				case <-ctx.Done():
					return
				case entries <- scraper.CatalogueEntry{Title: title, URL: bookURL}:
				}
			}

			// Find next page link: <a class="next" href="...">
			nextHref := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "a", Class: "next", Attr: "href"})
			if nextHref == "" {
				break
			}
			pageURL = resolveURL(baseURL, nextHref)
			page++
		}
	}()

	return entries, errs
}

// ─── MetadataProvider ────────────────────────────────────────────────────────

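// ScrapeMetadata implements MetadataProvider: it fetches the book page and
// parses title, author, cover, status, genres, summary, and chapter count.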
func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.BookMeta, error) {
	s.log.Debug("metadata fetch starting",
		"payload_url", bookURL,
		"payload_wait_selector", ".novel-title",
		"payload_wait_selector_timeout_ms", 5000,
	)

	raw, err := s.client.GetContent(ctx, browser.ContentRequest{
		URL:                 bookURL,
		WaitFor:             &browser.WaitForSelector{Selector: ".novel-title", Timeout: 5000},
		RejectResourceTypes: rejectResourceTypes,
		GotoOptions:         &browser.GotoOptions{Timeout: 60000},
	})
	if err != nil {
		s.log.Debug("metadata fetch failed", "url", bookURL, "err", err)
		return scraper.BookMeta{}, fmt.Errorf("metadata fetch %s: %w", bookURL, err)
	}
	s.log.Debug("metadata fetch completed", "url", bookURL, "response_bytes", len(raw))

	root, err := htmlutil.ParseHTML(raw)
	if err != nil {
		return scraper.BookMeta{}, fmt.Errorf("metadata parse %s: %w", bookURL, err)
	}

	// <h1 class="novel-title">Title</h1>
	title := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "h1", Class: "novel-title"})
	// <span class="author"><a>Author Name</a></span>
	author := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "author"})
	// <figure class="cover"><img src="..."></figure>
	var cover string
	if figureCover := htmlutil.FindFirst(root, scraper.Selector{Tag: "figure", Class: "cover"}); figureCover != nil {
		cover = htmlutil.ExtractFirst(figureCover, scraper.Selector{Tag: "img", Attr: "src"})
	}
	// <span class="status">Ongoing</span>
	status := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "status"})

	// Genres: all <a> tags inside <div class="genres">
	genresNode := htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "genres"})
	var genres []string
	if genresNode != nil {
		genres = htmlutil.ExtractAll(genresNode, scraper.Selector{Tag: "a", Multiple: true})
	}

	// <div class="summary"><p>...</p></div>
	summary := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "div", Class: "summary"})
	// <span class="chapter-count">123 Chapters</span>
	totalStr := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "chapter-count"})
	totalChapters := parseChapterCount(totalStr)

	slug := slugFromURL(bookURL)

	meta := scraper.BookMeta{
		Slug:          slug,
		Title:         title,
		Author:        author,
		Cover:         cover,
		Status:        status,
		Genres:        genres,
		Summary:       summary,
		TotalChapters: totalChapters,
		SourceURL:     bookURL,
	}
	s.log.Debug("metadata parsed",
		"url", bookURL,
		"slug", meta.Slug,
		"title", meta.Title,
		"author", meta.Author,
		"status", meta.Status,
		"genres", meta.Genres,
		"total_chapters", meta.TotalChapters,
	)
	return meta, nil
}

// ─── ChapterListProvider ─────────────────────────────────────────────────────

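// ScrapeChapterList implements ChapterListProvider. It pages through
// {bookURL}/chapters?page=N until a page has no chapter-list container or no
// items, deriving each chapter's number from its URL rather than its position.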
func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scraper.ChapterRef, error) {
	var refs []scraper.ChapterRef
	// Chapter list URL: {bookURL}/chapters?page=N
	baseChapterURL := strings.TrimRight(bookURL, "/") + "/chapters"
	page := 1

	for {
		select {
		case <-ctx.Done():
			return refs, ctx.Err()
		default:
		}

		pageURL := fmt.Sprintf("%s?page=%d", baseChapterURL, page)
		s.log.Info("scraping chapter list", "page", page, "url", pageURL)

		s.log.Debug("chapter list fetch starting",
			"page", page,
			"payload_url", pageURL,
			"payload_wait_selector", ".chapter-list",
			"payload_wait_selector_timeout_ms", 15000,
			"payload_wait_timeout_ms", 2000,
			"strategy", s.urlClient.Strategy(),
		)

		raw, err := s.urlClient.GetContent(ctx, browser.ContentRequest{
			URL: pageURL,
			// Wait up to 15 s for the chapter list container to appear in the DOM.
			WaitFor: &browser.WaitForSelector{Selector: ".chapter-list", Timeout: 15000},
			// After the selector is found, wait an additional 2 s for any
			// deferred JS rendering (lazy-loaded links, infinite-scroll hydration).
			WaitForTimeout:      2000,
			RejectResourceTypes: rejectResourceTypes,
			GotoOptions:         &browser.GotoOptions{Timeout: 60000},
			// Do NOT use BestAttempt — we want a complete page or a clear error,
			// not silently partial HTML that looks like "no more chapters".
			BestAttempt: false,
		})
		if err != nil {
			s.log.Debug("chapter list fetch failed",
				"page", page,
				"url", pageURL,
				"err", err,
			)
			return refs, fmt.Errorf("chapter list page %d: %w", page, err)
		}
		s.log.Debug("chapter list fetch completed",
			"page", page,
			"url", pageURL,
			"response_bytes", len(raw),
		)

		root, err := htmlutil.ParseHTML(raw)
		if err != nil {
			return refs, fmt.Errorf("chapter list page %d parse: %w", page, err)
		}

		chapterList := htmlutil.FindFirst(root, scraper.Selector{Class: "chapter-list"})
		if chapterList == nil {
			// No chapter list container on this page — we've gone past the last page.
			s.log.Debug("chapter list container not found, stopping pagination", "page", page)
			break
		}

		// Each chapter row: <li class="chapter-item"><a href="...">Title</a></li>
		items := htmlutil.FindAll(chapterList, scraper.Selector{Tag: "li"})

		s.log.Debug("chapter list page parsed",
			"page", page,
			"url", pageURL,
			"chapters_on_page", len(items),
			"total_refs_so_far", len(refs),
		)

		// Zero items on this page means we've gone past the last page.
		if len(items) == 0 {
			s.log.Debug("no chapters on page, stopping pagination", "page", page)
			break
		}

		for _, item := range items {
			linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a"})
			if linkNode == nil {
				continue
			}
			href := htmlutil.ExtractText(linkNode, scraper.Selector{Attr: "href"})
			chTitle := htmlutil.ExtractText(linkNode, scraper.Selector{})
			if href == "" {
				continue
			}
			chURL := resolveURL(baseURL, href)
			num := chapterNumberFromURL(chURL)
			if num <= 0 {
				// Fall back to position if the URL has no parseable number.
				num = len(refs) + 1
				s.log.Warn("chapter number not parseable from URL, falling back to position",
					"url", chURL,
					"position", num,
				)
			}
			refs = append(refs, scraper.ChapterRef{
				Number: num,
				Title:  strings.TrimSpace(chTitle),
				URL:    chURL,
			})
		}

		page++
	}

	return refs, nil
}

// ─── RankingProvider ─────────────────────────────────────────────────────────

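// ScrapeRanking implements RankingProvider, streaming BookMeta entries (with
// Ranking set) from the /ranking page. Only the first page is scraped for now.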
func (s *Scraper) ScrapeRanking(ctx context.Context) (<-chan scraper.BookMeta, <-chan error) {
	entries := make(chan scraper.BookMeta, 64)
	errs := make(chan error, 16)

	go func() {
		defer close(entries)
		defer close(errs)

		pageURL := baseURL + rankingPath
		rank := 1

		for pageURL != "" {
			select {
			case <-ctx.Done():
				return
			default:
			}

			s.log.Info("scraping ranking page", "url", pageURL)

			// Use WaitFor only for browser-based strategies
			var raw string
			var err error
			if s.client.Strategy() == browser.StrategyDirect {
				raw, err = s.client.GetContent(ctx, browser.ContentRequest{
					URL:                 pageURL,
					RejectResourceTypes: rejectResourceTypes,
				})
			} else {
				raw, err = s.client.GetContent(ctx, browser.ContentRequest{
					URL:                 pageURL,
					WaitFor:             &browser.WaitForSelector{Selector: ".rank-novels", Timeout: 30000},
					RejectResourceTypes: rejectResourceTypes,
					GotoOptions:         &browser.GotoOptions{Timeout: 60000},
					BestAttempt:         true,
				})
			}
			if err != nil {
				s.log.Debug("ranking page fetch failed", "url", pageURL, "err", err)
				errs <- fmt.Errorf("ranking page: %w", err)
				return
			}

			root, err := htmlutil.ParseHTML(raw)
			if err != nil {
				errs <- fmt.Errorf("ranking page parse: %w", err)
				return
			}

			rankList := htmlutil.FindFirst(root, scraper.Selector{Class: "rank-novels"})
			if rankList == nil {
				break
			}

			items := htmlutil.FindAll(rankList, scraper.Selector{Tag: "li", Class: "novel-item"})
			for _, item := range items {
				// Cover: <figure class="cover"><a href="/book/slug"><img data-src="..."></a></figure>
				var cover string
				if fig := htmlutil.FindFirst(item, scraper.Selector{Tag: "figure", Class: "cover"}); fig != nil {
					cover = htmlutil.ExtractFirst(fig, scraper.Selector{Tag: "img", Attr: "data-src"})
					if cover != "" {
						cover = baseURL + cover
					}
				}

				// Title and URL: <h2 class="title"><a href="/book/slug">Title</a></h2>
				titleNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "h2", Class: "title"})
				var title, bookURL string
				if titleNode != nil {
					linkNode := htmlutil.FindFirst(titleNode, scraper.Selector{Tag: "a"})
					if linkNode != nil {
						title = htmlutil.ExtractText(linkNode, scraper.Selector{})
						href := htmlutil.ExtractText(linkNode, scraper.Selector{Attr: "href"})
						bookURL = resolveURL(baseURL, href)
					}
				}

				// Status: <span class="status"> Ongoing/Completed </span>
				status := htmlutil.ExtractFirst(item, scraper.Selector{Tag: "span", Class: "status"})

				// Genres: <div class="categories"><div class="scroll"><span>Genre1</span><span>Genre2</span>...</div></div>
				var genres []string
				categoriesNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "div", Class: "categories"})
				if categoriesNode != nil {
					genres = htmlutil.ExtractAll(categoriesNode, scraper.Selector{Tag: "span", Multiple: true})
				}

				slug := slugFromURL(bookURL)

				meta := scraper.BookMeta{
					Slug:      slug,
					Title:     title,
					Cover:     cover,
					Status:    strings.TrimSpace(status),
					Genres:    genres,
					SourceURL: bookURL,
					Ranking:   rank,
				}
				rank++

				select {
				case <-ctx.Done():
					return
				case entries <- meta:
				}
			}

			// Next page - ranking pages use different pagination, just get first page for now
			break
		}
	}()

	return entries, errs
}

// ─── ChapterTextProvider ─────────────────────────────────────────────────────

// retryGetContent calls client.GetContent up to maxAttempts times, backing off
// exponentially between retries. Context cancellation aborts immediately; any
// other error (typically a transient Browserless timeout or 5xx response) is
// retried until maxAttempts is exhausted, after which the last error is
// returned.
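// With the values used by ScrapeChapterText below (9 attempts, 6 s base delay),
// the sleeps between attempts double each retry: 6 s, 12 s, 24 s, and so on.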
func retryGetContent(
	ctx context.Context,
	log *slog.Logger,
	client browser.BrowserClient,
	req browser.ContentRequest,
	maxAttempts int,
	baseDelay time.Duration,
) (string, error) {
	var lastErr error
	delay := baseDelay
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		html, err := client.GetContent(ctx, req)
		if err == nil {
			return html, nil
		}
		lastErr = err

		// Stop immediately on context cancellation.
		if ctx.Err() != nil {
			return "", err
		}

		if attempt < maxAttempts {
			log.Warn("chapter fetch failed, retrying",
				"url", req.URL,
				"attempt", attempt,
				"max_attempts", maxAttempts,
				"retry_in", delay,
				"err", err,
			)
			select {
			case <-ctx.Done():
				return "", ctx.Err()
			case <-time.After(delay):
			}
			delay *= 2
		}
	}
	return "", lastErr
}

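// ScrapeChapterText implements ChapterTextProvider. It fetches the chapter
// page with retries and converts the #content container to Markdown.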
func (s *Scraper) ScrapeChapterText(ctx context.Context, ref scraper.ChapterRef) (scraper.Chapter, error) {
	s.log.Debug("chapter text fetch starting",
		"chapter", ref.Number,
		"title", ref.Title,
		"payload_url", ref.URL,
		"payload_wait_selector", "#content",
		"payload_wait_selector_timeout_ms", 5000,
	)

	raw, err := retryGetContent(ctx, s.log, s.client, browser.ContentRequest{
		URL:                 ref.URL,
		WaitFor:             &browser.WaitForSelector{Selector: "#content", Timeout: 5000},
		RejectResourceTypes: rejectResourceTypes,
		GotoOptions:         &browser.GotoOptions{Timeout: 60000},
		BestAttempt:         true,
	}, 9, 6*time.Second)
	if err != nil {
		s.log.Debug("chapter text fetch failed",
			"chapter", ref.Number,
			"url", ref.URL,
			"err", err,
		)
		return scraper.Chapter{}, fmt.Errorf("chapter %d fetch: %w", ref.Number, err)
	}
	if len(raw) > 0 {
		preview := raw
		if len(preview) > 500 {
			preview = preview[:500]
		}
		s.log.Debug("chapter text fetch partial content",
			"chapter", ref.Number,
			"url", ref.URL,
			"response_bytes", len(raw),
			"preview", preview,
		)
	}
	s.log.Debug("chapter text fetch completed",
		"chapter", ref.Number,
		"url", ref.URL,
		"response_bytes", len(raw),
	)

	root, err := htmlutil.ParseHTML(raw)
	if err != nil {
		return scraper.Chapter{}, fmt.Errorf("chapter %d parse: %w", ref.Number, err)
	}

	// <div id="content">…</div>
	container := htmlutil.FindFirst(root, scraper.Selector{ID: "content"})
	if container == nil {
		return scraper.Chapter{}, fmt.Errorf("chapter %d: #content container not found in %s", ref.Number, ref.URL)
	}

	text := htmlutil.NodeToMarkdown(container)

	s.log.Debug("chapter text parsed",
		"chapter", ref.Number,
		"url", ref.URL,
		"text_bytes", len(text),
	)

	return scraper.Chapter{
		Ref:  ref,
		Text: text,
	}, nil
}

// ─── helpers ─────────────────────────────────────────────────────────────────

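// resolveURL resolves href against base. Absolute URLs pass through unchanged;
// if either part fails to parse, it falls back to naive concatenation.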
func resolveURL(base, href string) string {
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}
	b, err := url.Parse(base)
	if err != nil {
		return base + href
	}
	ref, err := url.Parse(href)
	if err != nil {
		return base + href
	}
	return b.ResolveReference(ref).String()
}

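// slugFromURL extracts {slug} from a /book/{slug}... URL, falling back to the
// last path segment (or the raw input if the URL does not parse).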
func slugFromURL(bookURL string) string {
	u, err := url.Parse(bookURL)
	if err != nil {
		return bookURL
	}
	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
	if len(parts) >= 2 && parts[0] == "book" {
		return parts[1]
	}
	if len(parts) > 0 {
		return parts[len(parts)-1]
	}
	return ""
}

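// parseChapterCount parses the leading integer from a chapter-count label,
// returning 0 when nothing parses.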
func parseChapterCount(s string) int {
	// Formats: "123 Chapters", "1,234 Chapters", "123"
	s = strings.ReplaceAll(s, ",", "")
	fields := strings.Fields(s)
	if len(fields) == 0 {
		return 0
	}
	n, _ := strconv.Atoi(fields[0])
	return n
}

// chapterNumberFromURL extracts the chapter number from a novelfire chapter URL.
//
// URL pattern: https://novelfire.net/book/{book-slug}/chapter-{N}
// The last path segment is expected to be "chapter-{N}" or "{N}".
// Returns 0 if no number can be parsed.
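// For example: ".../chapter-42" and ".../42-some-title" both yield 42, while
// ".../prologue" yields 0.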
func chapterNumberFromURL(chapterURL string) int {
	u, err := url.Parse(chapterURL)
	if err != nil {
		return 0
	}
	seg := path.Base(u.Path) // e.g. "chapter-42" or "42"
	// Strip a "chapter-" prefix if present.
	seg = strings.TrimPrefix(seg, "chapter-")
	// Also handle "chap-", "ch-" variants used by some sites.
	seg = strings.TrimPrefix(seg, "chap-")
	seg = strings.TrimPrefix(seg, "ch-")
	// Take only the leading digits (handles slugs like "42-title-text").
	digits := strings.FieldsFunc(seg, func(r rune) bool {
		return r < '0' || r > '9'
	})
	if len(digits) == 0 {
		return 0
	}
	n, _ := strconv.Atoi(digits[0])
	return n
}