feat: add exponential backoff, some UI elements to see the result of a scrape

This commit is contained in:
Admin
2026-02-26 18:51:32 +05:00
parent d68ea71239
commit e6e6f7dc4d
12 changed files with 462 additions and 153 deletions

28
.gitignore vendored Normal file
View File

@@ -0,0 +1,28 @@
# ── Compiled binary ────────────────────────────────────────────────────────────
/scraper
/scraper-*
# ── Go toolchain ───────────────────────────────────────────────────────────────
*.test
*.out
/vendor/
/dist/
# ── Scraped output (large, machine-generated) ──────────────────────────────────
/static/books
# ── Environment & secrets ──────────────────────────────────────────────────────
.env
.env.*
!.env.example
# ── OS artefacts ───────────────────────────────────────────────────────────────
.DS_Store
Thumbs.db
# ── Editor / IDE ───────────────────────────────────────────────────────────────
.idea/
.vscode/
*.swp
*.swo
*~

View File

@@ -13,9 +13,11 @@
// BROWSERLESS_URL Browserless base URL (default: http://localhost:3000)
// BROWSERLESS_TOKEN Browserless API token (default: "")
// BROWSERLESS_STRATEGY content | scrape | cdp (default: content)
// BROWSERLESS_MAX_CONCURRENT Max simultaneous browser sessions (default: 5)
// SCRAPER_WORKERS Chapter goroutine count (default: NumCPU)
// SCRAPER_STATIC_ROOT Output directory (default: ./static/books)
// SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
// LOG_LEVEL debug | info | warn | error (default: info)
package main
import (
@@ -36,8 +38,14 @@ import (
)
func main() {
logLevel := slog.LevelInfo
if v := os.Getenv("LOG_LEVEL"); v != "" {
if err := logLevel.UnmarshalText([]byte(v)); err != nil {
fmt.Fprintf(os.Stderr, "invalid LOG_LEVEL %q, using info\n", v)
}
}
log := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
Level: slog.LevelInfo,
Level: logLevel,
}))
if err := run(log); err != nil {
@@ -59,6 +67,12 @@ func run(log *slog.Logger) error {
BaseURL: envOr("BROWSERLESS_URL", "http://localhost:3000"),
Token: envOr("BROWSERLESS_TOKEN", ""),
}
browserCfg.MaxConcurrent = 5
if s := os.Getenv("BROWSERLESS_MAX_CONCURRENT"); s != "" {
if n, err := strconv.Atoi(s); err == nil && n > 0 {
browserCfg.MaxConcurrent = n
}
}
strategy := browser.Strategy(strings.ToLower(envOr("BROWSERLESS_STRATEGY", string(browser.StrategyContent))))
bc := newBrowserClient(strategy, browserCfg)
@@ -93,6 +107,7 @@ func run(log *slog.Logger) error {
log.Info("starting one-shot scrape",
"strategy", strategy,
"workers", workers,
"max_concurrent", browserCfg.MaxConcurrent,
"static_root", oCfg.StaticRoot,
"single_book", oCfg.SingleBookURL,
)
@@ -105,6 +120,7 @@ func run(log *slog.Logger) error {
"addr", addr,
"strategy", strategy,
"workers", workers,
"max_concurrent", browserCfg.MaxConcurrent,
)
srv := server.New(addr, oCfg, nf, log)
return srv.ListenAndServe(ctx)
@@ -143,8 +159,10 @@ Environment variables:
BROWSERLESS_URL Browserless base URL (default: http://localhost:3000)
BROWSERLESS_TOKEN API token (default: "")
BROWSERLESS_STRATEGY content | scrape | cdp (default: content)
BROWSERLESS_MAX_CONCURRENT Max simultaneous sessions (default: 5)
SCRAPER_WORKERS Chapter goroutines (default: NumCPU = %d)
SCRAPER_STATIC_ROOT Output directory (default: ./static/books)
SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
LOG_LEVEL debug|info|warn|error (default: info)
`, runtime.NumCPU())
}

View File

@@ -4,6 +4,7 @@ go 1.25.0
require (
github.com/gorilla/websocket v1.5.3 // indirect
github.com/yuin/goldmark v1.7.16 // indirect
golang.org/x/net v0.51.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

View File

@@ -1,5 +1,7 @@
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/yuin/goldmark v1.7.16 h1:n+CJdUxaFMiDUNnWC3dMWCIQJSkxH4uz3ZwQBkAlVNE=
github.com/yuin/goldmark v1.7.16/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=

View File

@@ -15,6 +15,7 @@ import (
// cdpClient implements BrowserClient using the CDP WebSocket endpoint.
type cdpClient struct {
cfg Config
sem chan struct{}
}
// NewCDPClient returns a BrowserClient that uses CDP WebSocket sessions.
@@ -22,7 +23,7 @@ func NewCDPClient(cfg Config) BrowserClient {
if cfg.Timeout == 0 {
cfg.Timeout = 60 * time.Second
}
return &cdpClient{cfg: cfg}
return &cdpClient{cfg: cfg, sem: makeSem(cfg.MaxConcurrent)}
}
func (c *cdpClient) Strategy() Strategy { return StrategyCDP }
@@ -38,6 +39,11 @@ func (c *cdpClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeRespon
// CDPSession opens a WebSocket to the Browserless /devtools/browser endpoint,
// navigates to pageURL, and invokes fn with a live CDPConn.
func (c *cdpClient) CDPSession(ctx context.Context, pageURL string, fn CDPSessionFunc) error {
if err := acquire(ctx, c.sem); err != nil {
return fmt.Errorf("cdp: semaphore: %w", err)
}
defer release(c.sem)
// Build WebSocket URL: ws://host:port/devtools/browser?token=...&url=...
wsURL := strings.Replace(c.cfg.BaseURL, "http://", "ws://", 1)
wsURL = strings.Replace(wsURL, "https://", "wss://", 1)

View File

@@ -18,12 +18,48 @@ type Config struct {
Token string
// Timeout is the per-request HTTP timeout; defaults to 60 s.
Timeout time.Duration
// MaxConcurrent caps the number of simultaneous in-flight requests sent to
// Browserless. When all slots are occupied new calls block until one
// completes (or ctx is cancelled). 0 means no limit.
MaxConcurrent int
}
// makeSem builds a counting semaphore backed by a buffered channel of
// capacity n. A non-positive n yields a nil channel, which turns the
// matching acquire/release calls into no-ops (i.e. no concurrency limit).
func makeSem(n int) chan struct{} {
	if n > 0 {
		return make(chan struct{}, n)
	}
	return nil
}
// acquire takes one slot from sem. It returns an error if ctx is cancelled
// before a slot becomes available. If sem is nil it returns immediately.
func acquire(ctx context.Context, sem chan struct{}) error {
if sem == nil {
return nil
}
select {
case sem <- struct{}{}:
return nil
case <-ctx.Done():
return ctx.Err()
}
}
// release returns a slot previously taken via acquire.
// Calling it with a nil sem is a safe no-op.
func release(sem chan struct{}) {
	if sem == nil {
		return
	}
	<-sem
}
// contentClient implements BrowserClient using the /content endpoint.
type contentClient struct {
cfg Config
http *http.Client
sem chan struct{}
}
// NewContentClient returns a BrowserClient that uses POST /content.
@@ -34,12 +70,18 @@ func NewContentClient(cfg Config) BrowserClient {
return &contentClient{
cfg: cfg,
http: &http.Client{Timeout: cfg.Timeout},
sem: makeSem(cfg.MaxConcurrent),
}
}
func (c *contentClient) Strategy() Strategy { return StrategyContent }
func (c *contentClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
if err := acquire(ctx, c.sem); err != nil {
return "", fmt.Errorf("content: semaphore: %w", err)
}
defer release(c.sem)
body, err := json.Marshal(req)
if err != nil {
return "", fmt.Errorf("content: marshal request: %w", err)
@@ -87,6 +129,7 @@ func (c *contentClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc
type scrapeClient struct {
cfg Config
http *http.Client
sem chan struct{}
}
// NewScrapeClient returns a BrowserClient that uses POST /scrape.
@@ -97,6 +140,7 @@ func NewScrapeClient(cfg Config) BrowserClient {
return &scrapeClient{
cfg: cfg,
http: &http.Client{Timeout: cfg.Timeout},
sem: makeSem(cfg.MaxConcurrent),
}
}
@@ -107,6 +151,11 @@ func (c *scrapeClient) GetContent(_ context.Context, _ ContentRequest) (string,
}
func (c *scrapeClient) ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error) {
if err := acquire(ctx, c.sem); err != nil {
return ScrapeResponse{}, fmt.Errorf("scrape: semaphore: %w", err)
}
defer release(c.sem)
body, err := json.Marshal(req)
if err != nil {
return ScrapeResponse{}, fmt.Errorf("scrape: marshal request: %w", err)

View File

@@ -23,12 +23,18 @@ const (
StrategyCDP Strategy = "cdp"
)
// WaitForSelector describes the waitForSelector option sent to Browserless:
// the page is considered ready once the selector matches an element.
type WaitForSelector struct {
	Selector string `json:"selector"`          // CSS selector to wait for
	Timeout  int    `json:"timeout,omitempty"` // ms to wait before giving up
}
// ContentRequest is the body sent to POST /content.
type ContentRequest struct {
URL string `json:"url"`
WaitFor string `json:"waitForSelector,omitempty"`
WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`
WaitForTimeout int `json:"waitForTimeout,omitempty"` // ms
RejectResources bool `json:"rejectResources,omitempty"`
RejectResourceTypes []string `json:"rejectResourceTypes,omitempty"` // e.g. ["image","stylesheet"]
}
// ScrapeElement is one element descriptor inside a ScrapeRequest.
@@ -41,7 +47,7 @@ type ScrapeElement struct {
type ScrapeRequest struct {
URL string `json:"url"`
Elements []ScrapeElement `json:"elements"`
WaitFor string `json:"waitForSelector,omitempty"`
WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`
}
// ScrapeResult is one entry in the response from POST /scrape.

View File

@@ -15,6 +15,7 @@ import (
"net/url"
"strconv"
"strings"
"time"
"github.com/libnovel/scraper/internal/browser"
"github.com/libnovel/scraper/internal/scraper"
@@ -26,6 +27,27 @@ const (
cataloguePath = "/genre-all/sort-new/status-all/all-novel"
)
// rejectResourceTypes lists Browserless resource types to block on every request.
// We keep: document (the page), script (JS renders the DOM), fetch/xhr (JS data calls).
// Everything else is safe to drop for HTML-only scraping, which cuts bandwidth
// and speeds up page loads.
var rejectResourceTypes = []string{
	"cspviolationreport",
	"eventsource",
	"fedcm",
	"font",
	"image",
	"manifest",
	"media",
	"other",
	"ping",
	"prefetch",
	"preflight",
	"signedexchange",
	"stylesheet",
	"texttrack",
	"websocket",
}
// Scraper is the novelfire.net implementation of scraper.NovelScraper.
// It uses the /content strategy by default (rendered HTML via Browserless).
type Scraper struct {
@@ -46,20 +68,6 @@ func (s *Scraper) SourceName() string { return "novelfire.net" }
// ─── CatalogueProvider ───────────────────────────────────────────────────────
func (s *Scraper) CatalogueURL() string {
return baseURL + cataloguePath
}
func (s *Scraper) EntriesSelector() scraper.Selector {
// Each novel card: <div class="novel-item">
return scraper.Selector{Tag: "div", Class: "novel-item", Multiple: true}
}
func (s *Scraper) NextPageSelector() scraper.Selector {
// <a class="next" href="...">
return scraper.Selector{Tag: "a", Class: "next", Attr: "href"}
}
// ScrapeCatalogue streams all CatalogueEntry values across all pages.
func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.CatalogueEntry, <-chan error) {
entries := make(chan scraper.CatalogueEntry, 64)
@@ -69,7 +77,7 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
defer close(entries)
defer close(errs)
pageURL := s.CatalogueURL()
pageURL := baseURL + cataloguePath
page := 1
for pageURL != "" {
@@ -80,17 +88,34 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
}
s.log.Info("scraping catalogue page", "page", page, "url", pageURL)
s.log.Debug("catalogue page fetch starting",
"page", page,
"payload_url", pageURL,
"payload_wait_selector", ".novel-item",
"payload_wait_selector_timeout_ms", 10000,
"payload_wait_for_timeout_ms", 10000,
)
html, err := s.client.GetContent(ctx, browser.ContentRequest{
URL: pageURL,
WaitFor: ".novel-item",
WaitFor: &browser.WaitForSelector{Selector: ".novel-item", Timeout: 10000},
WaitForTimeout: 10000,
RejectResources: true,
RejectResourceTypes: rejectResourceTypes,
})
if err != nil {
s.log.Debug("catalogue page fetch failed",
"page", page,
"url", pageURL,
"err", err,
)
errs <- fmt.Errorf("catalogue page %d: %w", page, err)
return
}
s.log.Debug("catalogue page fetch completed",
"page", page,
"url", pageURL,
"response_bytes", len(html),
)
root, err := htmlutil.ParseHTML(html)
if err != nil {
@@ -98,8 +123,8 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
return
}
// Extract novel cards.
cards := htmlutil.FindAll(root, s.EntriesSelector())
// Extract novel cards: <div class="novel-item">
cards := htmlutil.FindAll(root, scraper.Selector{Tag: "div", Class: "novel-item", Multiple: true})
if len(cards) == 0 {
s.log.Warn("no novel cards found, stopping pagination", "page", page)
return
@@ -107,8 +132,7 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
for _, card := range cards {
// Title: <h3 class="novel-title"><a href="/book/slug">Title</a>
titleSel := scraper.Selector{Tag: "h3", Class: "novel-title"}
titleNode := htmlutil.FindFirst(card, titleSel)
titleNode := htmlutil.FindFirst(card, scraper.Selector{Tag: "h3", Class: "novel-title"})
var title, href string
if titleNode != nil {
@@ -122,7 +146,6 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
continue
}
// Resolve relative URL.
bookURL := resolveURL(baseURL, href)
select {
case <-ctx.Done():
@@ -131,8 +154,8 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
}
}
// Find next page link.
nextHref := htmlutil.ExtractFirst(root, s.NextPageSelector())
// Find next page link: <a class="next" href="...">
nextHref := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "a", Class: "next", Attr: "href"})
if nextHref == "" {
break
}
@@ -146,64 +169,56 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
// ─── MetadataProvider ────────────────────────────────────────────────────────
func (s *Scraper) MetadataSelectors() map[string]scraper.Selector {
return map[string]scraper.Selector{
// <h1 class="novel-title">Title</h1>
"title": {Tag: "h1", Class: "novel-title"},
// <span class="author"><a>Author Name</a></span>
"author": {Tag: "span", Class: "author"},
// <img class="cover" src="...">
"cover": {Tag: "img", Class: "cover", Attr: "src"},
// <span class="status">Ongoing</span>
"status": {Tag: "span", Class: "status"},
// <div class="genres"><a>Tag1</a><a>Tag2</a>…</div>
"genres": {Tag: "div", Class: "genres", Multiple: true},
// <div class="summary"><p>...</p></div>
"summary": {Tag: "div", Class: "summary"},
// <span class="chapter-count">123 Chapters</span>
"total_chapters": {Tag: "span", Class: "chapter-count"},
}
}
func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.BookMeta, error) {
s.log.Debug("metadata fetch starting",
"payload_url", bookURL,
"payload_wait_selector", ".novel-title",
"payload_wait_selector_timeout_ms", 10000,
"payload_wait_for_timeout_ms", 10000,
)
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
URL: bookURL,
WaitFor: ".novel-title",
WaitFor: &browser.WaitForSelector{Selector: ".novel-title", Timeout: 10000},
WaitForTimeout: 10000,
RejectResources: true,
RejectResourceTypes: rejectResourceTypes,
})
if err != nil {
s.log.Debug("metadata fetch failed", "url", bookURL, "err", err)
return scraper.BookMeta{}, fmt.Errorf("metadata fetch %s: %w", bookURL, err)
}
s.log.Debug("metadata fetch completed", "url", bookURL, "response_bytes", len(raw))
root, err := htmlutil.ParseHTML(raw)
if err != nil {
return scraper.BookMeta{}, fmt.Errorf("metadata parse %s: %w", bookURL, err)
}
sels := s.MetadataSelectors()
// <h1 class="novel-title">Title</h1>
title := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "h1", Class: "novel-title"})
// <span class="author"><a>Author Name</a></span>
author := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "author"})
// <img class="cover" src="...">
cover := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "img", Class: "cover", Attr: "src"})
// <span class="status">Ongoing</span>
status := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "status"})
title := htmlutil.ExtractFirst(root, sels["title"])
author := htmlutil.ExtractFirst(root, sels["author"])
cover := htmlutil.ExtractFirst(root, sels["cover"])
status := htmlutil.ExtractFirst(root, sels["status"])
// Genres: all <a> tags inside the genres div.
genresNode := htmlutil.FindFirst(root, sels["genres"])
// Genres: all <a> tags inside <div class="genres">
genresNode := htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "genres"})
var genres []string
if genresNode != nil {
genres = htmlutil.ExtractAll(genresNode, scraper.Selector{Tag: "a", Multiple: true})
}
summary := htmlutil.ExtractFirst(root, sels["summary"])
totalStr := htmlutil.ExtractFirst(root, sels["total_chapters"])
// <div class="summary"><p>...</p></div>
summary := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "div", Class: "summary"})
// <span class="chapter-count">123 Chapters</span>
totalStr := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "chapter-count"})
totalChapters := parseChapterCount(totalStr)
// Derive slug from URL.
slug := slugFromURL(bookURL)
return scraper.BookMeta{
meta := scraper.BookMeta{
Slug: slug,
Title: title,
Author: author,
@@ -213,23 +228,25 @@ func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.B
Summary: summary,
TotalChapters: totalChapters,
SourceURL: bookURL,
}, nil
}
s.log.Debug("metadata parsed",
"url", bookURL,
"slug", meta.Slug,
"title", meta.Title,
"author", meta.Author,
"status", meta.Status,
"genres", meta.Genres,
"total_chapters", meta.TotalChapters,
)
return meta, nil
}
// ─── ChapterListProvider ─────────────────────────────────────────────────────
func (s *Scraper) ChaptersURL(bookURL string) string {
return strings.TrimRight(bookURL, "/") + "/chapters"
}
func (s *Scraper) ChapterEntrySelector() scraper.Selector {
// <li class="chapter-item"><a href="/book/slug/chapter-1">Chapter 1: Title</a></li>
return scraper.Selector{Tag: "li", Class: "chapter-item", Multiple: true}
}
func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scraper.ChapterRef, error) {
var refs []scraper.ChapterRef
pageURL := s.ChaptersURL(bookURL)
// Chapter list URL: {bookURL}/chapters
pageURL := strings.TrimRight(bookURL, "/") + "/chapters"
page := 1
for pageURL != "" {
@@ -241,28 +258,51 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
s.log.Info("scraping chapter list", "page", page, "url", pageURL)
s.log.Debug("chapter list fetch starting",
"page", page,
"payload_url", pageURL,
"payload_wait_selector", ".chapter-list",
"payload_wait_selector_timeout_ms", 10000,
"payload_wait_for_timeout_ms", 10000,
)
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
URL: pageURL,
WaitFor: ".chapter-item",
WaitFor: &browser.WaitForSelector{Selector: ".chapter-list", Timeout: 10000},
WaitForTimeout: 10000,
RejectResources: true,
RejectResourceTypes: rejectResourceTypes,
})
if err != nil {
s.log.Debug("chapter list fetch failed",
"page", page,
"url", pageURL,
"err", err,
)
return refs, fmt.Errorf("chapter list page %d: %w", page, err)
}
s.log.Debug("chapter list fetch completed",
"page", page,
"url", pageURL,
"response_bytes", len(raw),
)
root, err := htmlutil.ParseHTML(raw)
if err != nil {
return refs, fmt.Errorf("chapter list page %d parse: %w", page, err)
}
items := htmlutil.FindAll(root, s.ChapterEntrySelector())
chapterList := htmlutil.FindFirst(root, scraper.Selector{Class: "chapter-list"})
if chapterList == nil {
break
}
// Each chapter row: <li class="chapter-item"><a href="...">Title</a></li>
items := htmlutil.FindAll(chapterList, scraper.Selector{Tag: "li"})
for _, item := range items {
linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a", Attr: "href"})
linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a"})
if linkNode == nil {
continue
}
href := htmlutil.ExtractText(linkNode, scraper.Selector{Tag: "a", Attr: "href"})
href := htmlutil.ExtractText(linkNode, scraper.Selector{Attr: "href"})
chTitle := htmlutil.ExtractText(linkNode, scraper.Selector{})
if href == "" {
continue
@@ -276,8 +316,15 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
})
}
// Next page.
nextHref := htmlutil.ExtractFirst(root, s.NextPageSelector())
s.log.Debug("chapter list page parsed",
"page", page,
"url", pageURL,
"chapters_on_page", len(items),
"total_refs_so_far", len(refs),
)
// Next page: <a class="next" href="...">
nextHref := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "a", Class: "next", Attr: "href"})
if nextHref == "" {
break
}
@@ -290,38 +337,105 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
// ─── ChapterTextProvider ─────────────────────────────────────────────────────
func (s *Scraper) ChapterTextSelector() scraper.Selector {
// <div id="chapter-container"> or <div class="chapter-content">
return scraper.Selector{Tag: "div", ID: "chapter-container"}
// retryGetContent calls client.GetContent with bounded retries and an
// exponential backoff between attempts. Only failures whose message
// indicates a transient Browserless 5xx response are retried; context
// cancellation and all other errors are surfaced to the caller at once.
// maxAttempts is the total number of calls made; baseDelay is the first
// backoff interval, doubling after every retry.
func retryGetContent(
	ctx context.Context,
	log *slog.Logger,
	client browser.BrowserClient,
	req browser.ContentRequest,
	maxAttempts int,
	baseDelay time.Duration,
) (string, error) {
	var lastErr error
	wait := baseDelay
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		html, err := client.GetContent(ctx, req)
		if err == nil {
			return html, nil
		}
		lastErr = err
		// The caller's context was cancelled — stop immediately.
		if ctx.Err() != nil {
			return "", err
		}
		// Anything other than a Browserless 5xx is treated as permanent.
		if !strings.Contains(err.Error(), "unexpected status 5") {
			return "", err
		}
		if attempt == maxAttempts {
			break
		}
		log.Warn("chapter fetch failed, retrying",
			"url", req.URL,
			"attempt", attempt,
			"max_attempts", maxAttempts,
			"retry_in", wait,
			"err", err,
		)
		select {
		case <-ctx.Done():
			return "", ctx.Err()
		case <-time.After(wait):
		}
		wait *= 2
	}
	return "", lastErr
}
func (s *Scraper) ScrapeChapterText(ctx context.Context, ref scraper.ChapterRef) (scraper.Chapter, error) {
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
s.log.Debug("chapter text fetch starting",
"chapter", ref.Number,
"title", ref.Title,
"payload_url", ref.URL,
"payload_wait_selector", "#content",
"payload_wait_selector_timeout_ms", 75000,
"payload_wait_for_timeout_ms", 75000,
)
raw, err := retryGetContent(ctx, s.log, s.client, browser.ContentRequest{
URL: ref.URL,
WaitFor: "#chapter-container",
WaitForTimeout: 15000,
RejectResources: true,
})
WaitFor: &browser.WaitForSelector{Selector: "#content", Timeout: 75000},
WaitForTimeout: 75000,
RejectResourceTypes: rejectResourceTypes,
}, 9, 6*time.Second)
if err != nil {
s.log.Debug("chapter text fetch failed",
"chapter", ref.Number,
"url", ref.URL,
"err", err,
)
return scraper.Chapter{}, fmt.Errorf("chapter %d fetch: %w", ref.Number, err)
}
s.log.Debug("chapter text fetch completed",
"chapter", ref.Number,
"url", ref.URL,
"response_bytes", len(raw),
)
root, err := htmlutil.ParseHTML(raw)
if err != nil {
return scraper.Chapter{}, fmt.Errorf("chapter %d parse: %w", ref.Number, err)
}
container := htmlutil.FindFirst(root, s.ChapterTextSelector())
// <div id="content">…</div>
container := htmlutil.FindFirst(root, scraper.Selector{ID: "content"})
if container == nil {
// Fallback: try class-based selector.
container = htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "chapter-content"})
}
if container == nil {
return scraper.Chapter{}, fmt.Errorf("chapter %d: content container not found in %s", ref.Number, ref.URL)
return scraper.Chapter{}, fmt.Errorf("chapter %d: #content container not found in %s", ref.Number, ref.URL)
}
text := htmlutil.NodeToMarkdown(container)
s.log.Debug("chapter text parsed",
"chapter", ref.Number,
"url", ref.URL,
"text_bytes", len(text),
)
return scraper.Chapter{
Ref: ref,
Text: text,

View File

@@ -3,6 +3,7 @@
package htmlutil
import (
"regexp"
"strings"
"github.com/libnovel/scraper/internal/scraper"
@@ -150,13 +151,20 @@ func InnerHTML(n *html.Node) string {
// NodeToMarkdown converts the children of an HTML node to a plain-text/Markdown
// representation suitable for chapter storage. Block elements become newlines;
// inline elements are inlined.
// inline elements are inlined. Runs of more than one blank line are collapsed
// to a single blank line.
func NodeToMarkdown(n *html.Node) string {
var sb strings.Builder
nodeToMD(n, &sb)
return strings.TrimSpace(sb.String())
// Collapse 3+ consecutive newlines (i.e. more than one blank line) to 2.
out := multiBlankLine.ReplaceAllString(sb.String(), "\n\n")
return strings.TrimSpace(out)
}
// multiBlankLine matches three or more consecutive newline characters,
// optionally with whitespace-only lines in between — i.e. a run of two
// or more blank lines. Used to collapse such runs to a single blank line.
var multiBlankLine = regexp.MustCompile(`\n(\s*\n){2,}`)
var blockElements = map[string]bool{
"p": true, "div": true, "br": true, "h1": true, "h2": true,
"h3": true, "h4": true, "h5": true, "h6": true, "li": true,

View File

@@ -81,18 +81,6 @@ type Selector struct {
// CatalogueProvider can enumerate every novel available on a source site.
// It handles pagination transparently and streams CatalogueEntry values.
type CatalogueProvider interface {
// CatalogueURL returns the root URL of the catalogue listing.
CatalogueURL() string
// EntriesSelector returns the selector that matches each novel card / row
// in the catalogue listing page.
EntriesSelector() Selector
// NextPageSelector returns the selector for the "next page" link.
// If the current page has no next page the implementation must return
// ("", nil) from ScrapeNextPage.
NextPageSelector() Selector
// ScrapeCatalogue pages through the entire catalogue, sending
// CatalogueEntry values to the returned channel. The channel is closed
// when all pages have been scraped or ctx is cancelled.
@@ -103,25 +91,12 @@ type CatalogueProvider interface {
// MetadataProvider can extract structured book metadata from a novel's landing page.
type MetadataProvider interface {
// MetadataSelectors returns a map of field name → Selector used to
// locate each metadata element on the book page.
// Required keys: "title", "author".
// Optional keys: "cover", "status", "genres", "summary", "total_chapters".
MetadataSelectors() map[string]Selector
// ScrapeMetadata fetches and parses the metadata for the book at bookURL.
ScrapeMetadata(ctx context.Context, bookURL string) (BookMeta, error)
}
// ChapterListProvider can enumerate all chapters of a book from the chapter-list page.
type ChapterListProvider interface {
// ChaptersURL derives the chapter-list URL from a book landing-page URL.
ChaptersURL(bookURL string) string
// ChapterEntrySelector returns the selector that matches each chapter row
// in the chapter list page.
ChapterEntrySelector() Selector
// ScrapeChapterList returns all chapter references for a book, ordered
// by chapter number ascending.
ScrapeChapterList(ctx context.Context, bookURL string) ([]ChapterRef, error)
@@ -129,9 +104,6 @@ type ChapterListProvider interface {
// ChapterTextProvider can extract the readable text from a single chapter page.
type ChapterTextProvider interface {
// ChapterTextSelector returns the selector that wraps the chapter body.
ChapterTextSelector() Selector
// ScrapeChapterText fetches chapterURL and returns the chapter text as Markdown.
ScrapeChapterText(ctx context.Context, ref ChapterRef) (Chapter, error)
}

View File

@@ -18,6 +18,7 @@ import (
"github.com/libnovel/scraper/internal/orchestrator"
"github.com/libnovel/scraper/internal/scraper"
"github.com/libnovel/scraper/internal/writer"
)
// Server wraps an HTTP mux with the scraping endpoints.
@@ -26,6 +27,7 @@ type Server struct {
oCfg orchestrator.Config
novel scraper.NovelScraper
log *slog.Logger
writer *writer.Writer
mu sync.Mutex
running bool
}
@@ -37,6 +39,7 @@ func New(addr string, oCfg orchestrator.Config, novel scraper.NovelScraper, log
oCfg: oCfg,
novel: novel,
log: log,
writer: writer.New(oCfg.StaticRoot),
}
}
@@ -47,6 +50,12 @@ func (s *Server) ListenAndServe(ctx context.Context) error {
mux.HandleFunc("GET /health", s.handleHealth)
mux.HandleFunc("POST /scrape", s.handleScrapeCatalogue)
mux.HandleFunc("POST /scrape/book", s.handleScrapeBook)
// UI routes
mux.HandleFunc("GET /", s.handleHome)
mux.HandleFunc("GET /books/{slug}", s.handleBook)
mux.HandleFunc("GET /books/{slug}/chapters/{n}", s.handleChapter)
mux.HandleFunc("POST /ui/scrape/book", s.handleUIScrapeBook)
mux.HandleFunc("GET /ui/scrape/status", s.handleUIScrapeStatus)
srv := &http.Server{
Addr: s.addr,

View File

@@ -19,6 +19,8 @@ import (
"fmt"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"github.com/libnovel/scraper/internal/scraper"
@@ -113,7 +115,101 @@ func (w *Writer) WriteChapter(slug string, chapter scraper.Chapter) error {
return nil
}
// ─── Path helpers ─────────────────────────────────────────────────────────────
// ─── Catalogue helpers ────────────────────────────────────────────────────────
// ListBooks scans the root directory for book folders and returns the
// metadata of every book whose metadata.yaml could be read, sorted by
// title. A missing root directory is treated as an empty catalogue;
// books with unreadable metadata files are silently skipped.
func (w *Writer) ListBooks() ([]scraper.BookMeta, error) {
	dirEntries, err := os.ReadDir(w.root)
	switch {
	case os.IsNotExist(err):
		return nil, nil
	case err != nil:
		return nil, fmt.Errorf("writer: list books: %w", err)
	}
	var books []scraper.BookMeta
	for _, de := range dirEntries {
		if !de.IsDir() {
			continue
		}
		// Skip entries whose metadata cannot be read or parsed.
		if meta, ok, _ := w.ReadMetadata(de.Name()); ok {
			books = append(books, meta)
		}
	}
	sort.Slice(books, func(a, b int) bool {
		return books[a].Title < books[b].Title
	})
	return books, nil
}
// ChapterInfo is a lightweight chapter descriptor derived from on-disk files.
type ChapterInfo struct {
	Number int    // chapter number parsed from the chapter-N.md filename
	Title  string // first line of the markdown file (without the leading "# ")
}
// ListChapters enumerates every chapter markdown file stored on disk for
// slug, walking the vol-*/<range>/chapter-N.md directory layout, and
// returns the chapters ordered by ascending number. Files whose names
// do not parse as chapter-N.md are ignored.
func (w *Writer) ListChapters(slug string) ([]ChapterInfo, error) {
	root := w.bookDir(slug)
	// Top level: vol-* directories.
	volDirs, err := filepath.Glob(filepath.Join(root, "vol-*"))
	if err != nil {
		return nil, fmt.Errorf("writer: list chapters glob: %w", err)
	}
	var chapters []ChapterInfo
	for _, volDir := range volDirs {
		// Second level: chapter-range buckets (e.g. 1-100).
		rangeDirs, _ := filepath.Glob(filepath.Join(volDir, "*-*"))
		for _, rangeDir := range rangeDirs {
			mdFiles, _ := filepath.Glob(filepath.Join(rangeDir, "chapter-*.md"))
			for _, path := range mdFiles {
				name := filepath.Base(path) // chapter-N.md
				numStr := strings.TrimSuffix(strings.TrimPrefix(name, "chapter-"), ".md")
				num, convErr := strconv.Atoi(numStr)
				if convErr != nil {
					continue
				}
				chapters = append(chapters, ChapterInfo{
					Number: num,
					Title:  chapterTitle(path, num),
				})
			}
		}
	}
	sort.Slice(chapters, func(i, j int) bool {
		return chapters[i].Number < chapters[j].Number
	})
	return chapters, nil
}
// chapterTitle reads the first non-empty line of a markdown file and strips
// the leading "# " heading marker. Falls back to "Chapter N".
func chapterTitle(path string, n int) string {
data, err := os.ReadFile(path)
if err != nil {
return fmt.Sprintf("Chapter %d", n)
}
for _, line := range strings.SplitN(string(data), "\n", 10) {
line = strings.TrimSpace(line)
if line == "" {
continue
}
return strings.TrimPrefix(line, "# ")
}
return fmt.Sprintf("Chapter %d", n)
}
// ReadChapter loads the raw markdown content for chapter number n of the
// book slug. The on-disk location is recomputed with the same bucketing
// scheme chapterPath used when the chapter was written (volume 0).
func (w *Writer) ReadChapter(slug string, n int) (string, error) {
	path := w.chapterPath(slug, scraper.ChapterRef{Number: n, Volume: 0})
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", fmt.Errorf("writer: read chapter %d: %w", n, err)
	}
	return string(raw), nil
}
// bookDir returns the root directory for a book slug.
func (w *Writer) bookDir(slug string) string {