feat: add exponential backoff, some UI elements to see the resut of a scrape
This commit is contained in:
28
.gitignore
vendored
Normal file
28
.gitignore
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
# ── Compiled binary ────────────────────────────────────────────────────────────
|
||||
/scraper
|
||||
/scraper-*
|
||||
|
||||
# ── Go toolchain ───────────────────────────────────────────────────────────────
|
||||
*.test
|
||||
*.out
|
||||
/vendor/
|
||||
/dist/
|
||||
|
||||
# ── Scraped output (large, machine-generated) ──────────────────────────────────
|
||||
|
||||
/static/books
|
||||
# ── Environment & secrets ──────────────────────────────────────────────────────
|
||||
.env
|
||||
.env.*
|
||||
!.env.example
|
||||
|
||||
# ── OS artefacts ───────────────────────────────────────────────────────────────
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# ── Editor / IDE ───────────────────────────────────────────────────────────────
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
@@ -13,9 +13,11 @@
|
||||
// BROWSERLESS_URL Browserless base URL (default: http://localhost:3000)
|
||||
// BROWSERLESS_TOKEN Browserless API token (default: "")
|
||||
// BROWSERLESS_STRATEGY content | scrape | cdp (default: content)
|
||||
// BROWSERLESS_MAX_CONCURRENT Max simultaneous browser sessions (default: 5)
|
||||
// SCRAPER_WORKERS Chapter goroutine count (default: NumCPU)
|
||||
// SCRAPER_STATIC_ROOT Output directory (default: ./static/books)
|
||||
// SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
|
||||
// LOG_LEVEL debug | info | warn | error (default: info)
|
||||
package main
|
||||
|
||||
import (
|
||||
@@ -36,8 +38,14 @@ import (
|
||||
)
|
||||
|
||||
func main() {
|
||||
logLevel := slog.LevelInfo
|
||||
if v := os.Getenv("LOG_LEVEL"); v != "" {
|
||||
if err := logLevel.UnmarshalText([]byte(v)); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "invalid LOG_LEVEL %q, using info\n", v)
|
||||
}
|
||||
}
|
||||
log := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
|
||||
Level: slog.LevelInfo,
|
||||
Level: logLevel,
|
||||
}))
|
||||
|
||||
if err := run(log); err != nil {
|
||||
@@ -59,6 +67,12 @@ func run(log *slog.Logger) error {
|
||||
BaseURL: envOr("BROWSERLESS_URL", "http://localhost:3000"),
|
||||
Token: envOr("BROWSERLESS_TOKEN", ""),
|
||||
}
|
||||
browserCfg.MaxConcurrent = 5
|
||||
if s := os.Getenv("BROWSERLESS_MAX_CONCURRENT"); s != "" {
|
||||
if n, err := strconv.Atoi(s); err == nil && n > 0 {
|
||||
browserCfg.MaxConcurrent = n
|
||||
}
|
||||
}
|
||||
|
||||
strategy := browser.Strategy(strings.ToLower(envOr("BROWSERLESS_STRATEGY", string(browser.StrategyContent))))
|
||||
bc := newBrowserClient(strategy, browserCfg)
|
||||
@@ -93,6 +107,7 @@ func run(log *slog.Logger) error {
|
||||
log.Info("starting one-shot scrape",
|
||||
"strategy", strategy,
|
||||
"workers", workers,
|
||||
"max_concurrent", browserCfg.MaxConcurrent,
|
||||
"static_root", oCfg.StaticRoot,
|
||||
"single_book", oCfg.SingleBookURL,
|
||||
)
|
||||
@@ -105,6 +120,7 @@ func run(log *slog.Logger) error {
|
||||
"addr", addr,
|
||||
"strategy", strategy,
|
||||
"workers", workers,
|
||||
"max_concurrent", browserCfg.MaxConcurrent,
|
||||
)
|
||||
srv := server.New(addr, oCfg, nf, log)
|
||||
return srv.ListenAndServe(ctx)
|
||||
@@ -143,8 +159,10 @@ Environment variables:
|
||||
BROWSERLESS_URL Browserless base URL (default: http://localhost:3000)
|
||||
BROWSERLESS_TOKEN API token (default: "")
|
||||
BROWSERLESS_STRATEGY content | scrape | cdp (default: content)
|
||||
BROWSERLESS_MAX_CONCURRENT Max simultaneous sessions (default: 5)
|
||||
SCRAPER_WORKERS Chapter goroutines (default: NumCPU = %d)
|
||||
SCRAPER_STATIC_ROOT Output directory (default: ./static/books)
|
||||
SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
|
||||
LOG_LEVEL debug|info|warn|error (default: info)
|
||||
`, runtime.NumCPU())
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ go 1.25.0
|
||||
|
||||
require (
|
||||
github.com/gorilla/websocket v1.5.3 // indirect
|
||||
github.com/yuin/goldmark v1.7.16 // indirect
|
||||
golang.org/x/net v0.51.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
|
||||
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||
github.com/yuin/goldmark v1.7.16 h1:n+CJdUxaFMiDUNnWC3dMWCIQJSkxH4uz3ZwQBkAlVNE=
|
||||
github.com/yuin/goldmark v1.7.16/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
|
||||
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
|
||||
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
// cdpClient implements BrowserClient using the CDP WebSocket endpoint.
|
||||
type cdpClient struct {
|
||||
cfg Config
|
||||
sem chan struct{}
|
||||
}
|
||||
|
||||
// NewCDPClient returns a BrowserClient that uses CDP WebSocket sessions.
|
||||
@@ -22,7 +23,7 @@ func NewCDPClient(cfg Config) BrowserClient {
|
||||
if cfg.Timeout == 0 {
|
||||
cfg.Timeout = 60 * time.Second
|
||||
}
|
||||
return &cdpClient{cfg: cfg}
|
||||
return &cdpClient{cfg: cfg, sem: makeSem(cfg.MaxConcurrent)}
|
||||
}
|
||||
|
||||
func (c *cdpClient) Strategy() Strategy { return StrategyCDP }
|
||||
@@ -38,6 +39,11 @@ func (c *cdpClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeRespon
|
||||
// CDPSession opens a WebSocket to the Browserless /devtools/browser endpoint,
|
||||
// navigates to pageURL, and invokes fn with a live CDPConn.
|
||||
func (c *cdpClient) CDPSession(ctx context.Context, pageURL string, fn CDPSessionFunc) error {
|
||||
if err := acquire(ctx, c.sem); err != nil {
|
||||
return fmt.Errorf("cdp: semaphore: %w", err)
|
||||
}
|
||||
defer release(c.sem)
|
||||
|
||||
// Build WebSocket URL: ws://host:port/devtools/browser?token=...&url=...
|
||||
wsURL := strings.Replace(c.cfg.BaseURL, "http://", "ws://", 1)
|
||||
wsURL = strings.Replace(wsURL, "https://", "wss://", 1)
|
||||
|
||||
@@ -18,12 +18,48 @@ type Config struct {
|
||||
Token string
|
||||
// Timeout is the per-request HTTP timeout; defaults to 60 s.
|
||||
Timeout time.Duration
|
||||
// MaxConcurrent caps the number of simultaneous in-flight requests sent to
|
||||
// Browserless. When all slots are occupied new calls block until one
|
||||
// completes (or ctx is cancelled). 0 means no limit.
|
||||
MaxConcurrent int
|
||||
}
|
||||
|
||||
// makeSem returns a buffered channel used as a counting semaphore.
|
||||
// If n <= 0 a nil channel is returned, which causes acquire/release to be no-ops.
|
||||
func makeSem(n int) chan struct{} {
|
||||
if n <= 0 {
|
||||
return nil
|
||||
}
|
||||
return make(chan struct{}, n)
|
||||
}
|
||||
|
||||
// acquire takes one slot from sem. It returns an error if ctx is cancelled
|
||||
// before a slot becomes available. If sem is nil it returns immediately.
|
||||
func acquire(ctx context.Context, sem chan struct{}) error {
|
||||
if sem == nil {
|
||||
return nil
|
||||
}
|
||||
select {
|
||||
case sem <- struct{}{}:
|
||||
return nil
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
|
||||
// release frees the slot previously obtained by acquire.
|
||||
// If sem is nil it is a no-op.
|
||||
func release(sem chan struct{}) {
|
||||
if sem != nil {
|
||||
<-sem
|
||||
}
|
||||
}
|
||||
|
||||
// contentClient implements BrowserClient using the /content endpoint.
|
||||
type contentClient struct {
|
||||
cfg Config
|
||||
http *http.Client
|
||||
sem chan struct{}
|
||||
}
|
||||
|
||||
// NewContentClient returns a BrowserClient that uses POST /content.
|
||||
@@ -34,12 +70,18 @@ func NewContentClient(cfg Config) BrowserClient {
|
||||
return &contentClient{
|
||||
cfg: cfg,
|
||||
http: &http.Client{Timeout: cfg.Timeout},
|
||||
sem: makeSem(cfg.MaxConcurrent),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *contentClient) Strategy() Strategy { return StrategyContent }
|
||||
|
||||
func (c *contentClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
|
||||
if err := acquire(ctx, c.sem); err != nil {
|
||||
return "", fmt.Errorf("content: semaphore: %w", err)
|
||||
}
|
||||
defer release(c.sem)
|
||||
|
||||
body, err := json.Marshal(req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("content: marshal request: %w", err)
|
||||
@@ -87,6 +129,7 @@ func (c *contentClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc
|
||||
type scrapeClient struct {
|
||||
cfg Config
|
||||
http *http.Client
|
||||
sem chan struct{}
|
||||
}
|
||||
|
||||
// NewScrapeClient returns a BrowserClient that uses POST /scrape.
|
||||
@@ -97,6 +140,7 @@ func NewScrapeClient(cfg Config) BrowserClient {
|
||||
return &scrapeClient{
|
||||
cfg: cfg,
|
||||
http: &http.Client{Timeout: cfg.Timeout},
|
||||
sem: makeSem(cfg.MaxConcurrent),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,6 +151,11 @@ func (c *scrapeClient) GetContent(_ context.Context, _ ContentRequest) (string,
|
||||
}
|
||||
|
||||
func (c *scrapeClient) ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error) {
|
||||
if err := acquire(ctx, c.sem); err != nil {
|
||||
return ScrapeResponse{}, fmt.Errorf("scrape: semaphore: %w", err)
|
||||
}
|
||||
defer release(c.sem)
|
||||
|
||||
body, err := json.Marshal(req)
|
||||
if err != nil {
|
||||
return ScrapeResponse{}, fmt.Errorf("scrape: marshal request: %w", err)
|
||||
|
||||
@@ -23,12 +23,18 @@ const (
|
||||
StrategyCDP Strategy = "cdp"
|
||||
)
|
||||
|
||||
// WaitForSelector describes the waitForSelector option sent to Browserless.
|
||||
type WaitForSelector struct {
|
||||
Selector string `json:"selector"`
|
||||
Timeout int `json:"timeout,omitempty"` // ms
|
||||
}
|
||||
|
||||
// ContentRequest is the body sent to POST /content.
|
||||
type ContentRequest struct {
|
||||
URL string `json:"url"`
|
||||
WaitFor string `json:"waitForSelector,omitempty"`
|
||||
WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`
|
||||
WaitForTimeout int `json:"waitForTimeout,omitempty"` // ms
|
||||
RejectResources bool `json:"rejectResources,omitempty"`
|
||||
RejectResourceTypes []string `json:"rejectResourceTypes,omitempty"` // e.g. ["image","stylesheet"]
|
||||
}
|
||||
|
||||
// ScrapeElement is one element descriptor inside a ScrapeRequest.
|
||||
@@ -41,7 +47,7 @@ type ScrapeElement struct {
|
||||
type ScrapeRequest struct {
|
||||
URL string `json:"url"`
|
||||
Elements []ScrapeElement `json:"elements"`
|
||||
WaitFor string `json:"waitForSelector,omitempty"`
|
||||
WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`
|
||||
}
|
||||
|
||||
// ScrapeResult is one entry in the response from POST /scrape.
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/libnovel/scraper/internal/browser"
|
||||
"github.com/libnovel/scraper/internal/scraper"
|
||||
@@ -26,6 +27,27 @@ const (
|
||||
cataloguePath = "/genre-all/sort-new/status-all/all-novel"
|
||||
)
|
||||
|
||||
// rejectResourceTypes lists Browserless resource types to block on every request.
|
||||
// We keep: document (the page), script (JS renders the DOM), fetch/xhr (JS data calls).
|
||||
// Everything else is safe to drop for HTML-only scraping.
|
||||
var rejectResourceTypes = []string{
|
||||
"cspviolationreport",
|
||||
"eventsource",
|
||||
"fedcm",
|
||||
"font",
|
||||
"image",
|
||||
"manifest",
|
||||
"media",
|
||||
"other",
|
||||
"ping",
|
||||
"prefetch",
|
||||
"preflight",
|
||||
"signedexchange",
|
||||
"stylesheet",
|
||||
"texttrack",
|
||||
"websocket",
|
||||
}
|
||||
|
||||
// Scraper is the novelfire.net implementation of scraper.NovelScraper.
|
||||
// It uses the /content strategy by default (rendered HTML via Browserless).
|
||||
type Scraper struct {
|
||||
@@ -46,20 +68,6 @@ func (s *Scraper) SourceName() string { return "novelfire.net" }
|
||||
|
||||
// ─── CatalogueProvider ───────────────────────────────────────────────────────
|
||||
|
||||
func (s *Scraper) CatalogueURL() string {
|
||||
return baseURL + cataloguePath
|
||||
}
|
||||
|
||||
func (s *Scraper) EntriesSelector() scraper.Selector {
|
||||
// Each novel card: <div class="novel-item">
|
||||
return scraper.Selector{Tag: "div", Class: "novel-item", Multiple: true}
|
||||
}
|
||||
|
||||
func (s *Scraper) NextPageSelector() scraper.Selector {
|
||||
// <a class="next" href="...">
|
||||
return scraper.Selector{Tag: "a", Class: "next", Attr: "href"}
|
||||
}
|
||||
|
||||
// ScrapeCatalogue streams all CatalogueEntry values across all pages.
|
||||
func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.CatalogueEntry, <-chan error) {
|
||||
entries := make(chan scraper.CatalogueEntry, 64)
|
||||
@@ -69,7 +77,7 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
defer close(entries)
|
||||
defer close(errs)
|
||||
|
||||
pageURL := s.CatalogueURL()
|
||||
pageURL := baseURL + cataloguePath
|
||||
page := 1
|
||||
|
||||
for pageURL != "" {
|
||||
@@ -80,17 +88,34 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
}
|
||||
|
||||
s.log.Info("scraping catalogue page", "page", page, "url", pageURL)
|
||||
s.log.Debug("catalogue page fetch starting",
|
||||
"page", page,
|
||||
"payload_url", pageURL,
|
||||
"payload_wait_selector", ".novel-item",
|
||||
"payload_wait_selector_timeout_ms", 10000,
|
||||
"payload_wait_for_timeout_ms", 10000,
|
||||
)
|
||||
|
||||
html, err := s.client.GetContent(ctx, browser.ContentRequest{
|
||||
URL: pageURL,
|
||||
WaitFor: ".novel-item",
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".novel-item", Timeout: 10000},
|
||||
WaitForTimeout: 10000,
|
||||
RejectResources: true,
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
})
|
||||
if err != nil {
|
||||
s.log.Debug("catalogue page fetch failed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"err", err,
|
||||
)
|
||||
errs <- fmt.Errorf("catalogue page %d: %w", page, err)
|
||||
return
|
||||
}
|
||||
s.log.Debug("catalogue page fetch completed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"response_bytes", len(html),
|
||||
)
|
||||
|
||||
root, err := htmlutil.ParseHTML(html)
|
||||
if err != nil {
|
||||
@@ -98,8 +123,8 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
return
|
||||
}
|
||||
|
||||
// Extract novel cards.
|
||||
cards := htmlutil.FindAll(root, s.EntriesSelector())
|
||||
// Extract novel cards: <div class="novel-item">
|
||||
cards := htmlutil.FindAll(root, scraper.Selector{Tag: "div", Class: "novel-item", Multiple: true})
|
||||
if len(cards) == 0 {
|
||||
s.log.Warn("no novel cards found, stopping pagination", "page", page)
|
||||
return
|
||||
@@ -107,8 +132,7 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
|
||||
for _, card := range cards {
|
||||
// Title: <h3 class="novel-title"><a href="/book/slug">Title</a>
|
||||
titleSel := scraper.Selector{Tag: "h3", Class: "novel-title"}
|
||||
titleNode := htmlutil.FindFirst(card, titleSel)
|
||||
titleNode := htmlutil.FindFirst(card, scraper.Selector{Tag: "h3", Class: "novel-title"})
|
||||
|
||||
var title, href string
|
||||
if titleNode != nil {
|
||||
@@ -122,7 +146,6 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
continue
|
||||
}
|
||||
|
||||
// Resolve relative URL.
|
||||
bookURL := resolveURL(baseURL, href)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
@@ -131,8 +154,8 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
}
|
||||
}
|
||||
|
||||
// Find next page link.
|
||||
nextHref := htmlutil.ExtractFirst(root, s.NextPageSelector())
|
||||
// Find next page link: <a class="next" href="...">
|
||||
nextHref := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "a", Class: "next", Attr: "href"})
|
||||
if nextHref == "" {
|
||||
break
|
||||
}
|
||||
@@ -146,64 +169,56 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
|
||||
// ─── MetadataProvider ────────────────────────────────────────────────────────
|
||||
|
||||
func (s *Scraper) MetadataSelectors() map[string]scraper.Selector {
|
||||
return map[string]scraper.Selector{
|
||||
// <h1 class="novel-title">Title</h1>
|
||||
"title": {Tag: "h1", Class: "novel-title"},
|
||||
// <span class="author"><a>Author Name</a></span>
|
||||
"author": {Tag: "span", Class: "author"},
|
||||
// <img class="cover" src="...">
|
||||
"cover": {Tag: "img", Class: "cover", Attr: "src"},
|
||||
// <span class="status">Ongoing</span>
|
||||
"status": {Tag: "span", Class: "status"},
|
||||
// <div class="genres"><a>Tag1</a><a>Tag2</a>…</div>
|
||||
"genres": {Tag: "div", Class: "genres", Multiple: true},
|
||||
// <div class="summary"><p>...</p></div>
|
||||
"summary": {Tag: "div", Class: "summary"},
|
||||
// <span class="chapter-count">123 Chapters</span>
|
||||
"total_chapters": {Tag: "span", Class: "chapter-count"},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.BookMeta, error) {
|
||||
s.log.Debug("metadata fetch starting",
|
||||
"payload_url", bookURL,
|
||||
"payload_wait_selector", ".novel-title",
|
||||
"payload_wait_selector_timeout_ms", 10000,
|
||||
"payload_wait_for_timeout_ms", 10000,
|
||||
)
|
||||
|
||||
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
|
||||
URL: bookURL,
|
||||
WaitFor: ".novel-title",
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".novel-title", Timeout: 10000},
|
||||
WaitForTimeout: 10000,
|
||||
RejectResources: true,
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
})
|
||||
if err != nil {
|
||||
s.log.Debug("metadata fetch failed", "url", bookURL, "err", err)
|
||||
return scraper.BookMeta{}, fmt.Errorf("metadata fetch %s: %w", bookURL, err)
|
||||
}
|
||||
s.log.Debug("metadata fetch completed", "url", bookURL, "response_bytes", len(raw))
|
||||
|
||||
root, err := htmlutil.ParseHTML(raw)
|
||||
if err != nil {
|
||||
return scraper.BookMeta{}, fmt.Errorf("metadata parse %s: %w", bookURL, err)
|
||||
}
|
||||
|
||||
sels := s.MetadataSelectors()
|
||||
// <h1 class="novel-title">Title</h1>
|
||||
title := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "h1", Class: "novel-title"})
|
||||
// <span class="author"><a>Author Name</a></span>
|
||||
author := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "author"})
|
||||
// <img class="cover" src="...">
|
||||
cover := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "img", Class: "cover", Attr: "src"})
|
||||
// <span class="status">Ongoing</span>
|
||||
status := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "status"})
|
||||
|
||||
title := htmlutil.ExtractFirst(root, sels["title"])
|
||||
author := htmlutil.ExtractFirst(root, sels["author"])
|
||||
cover := htmlutil.ExtractFirst(root, sels["cover"])
|
||||
status := htmlutil.ExtractFirst(root, sels["status"])
|
||||
|
||||
// Genres: all <a> tags inside the genres div.
|
||||
genresNode := htmlutil.FindFirst(root, sels["genres"])
|
||||
// Genres: all <a> tags inside <div class="genres">
|
||||
genresNode := htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "genres"})
|
||||
var genres []string
|
||||
if genresNode != nil {
|
||||
genres = htmlutil.ExtractAll(genresNode, scraper.Selector{Tag: "a", Multiple: true})
|
||||
}
|
||||
|
||||
summary := htmlutil.ExtractFirst(root, sels["summary"])
|
||||
|
||||
totalStr := htmlutil.ExtractFirst(root, sels["total_chapters"])
|
||||
// <div class="summary"><p>...</p></div>
|
||||
summary := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "div", Class: "summary"})
|
||||
// <span class="chapter-count">123 Chapters</span>
|
||||
totalStr := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "chapter-count"})
|
||||
totalChapters := parseChapterCount(totalStr)
|
||||
|
||||
// Derive slug from URL.
|
||||
slug := slugFromURL(bookURL)
|
||||
|
||||
return scraper.BookMeta{
|
||||
meta := scraper.BookMeta{
|
||||
Slug: slug,
|
||||
Title: title,
|
||||
Author: author,
|
||||
@@ -213,23 +228,25 @@ func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.B
|
||||
Summary: summary,
|
||||
TotalChapters: totalChapters,
|
||||
SourceURL: bookURL,
|
||||
}, nil
|
||||
}
|
||||
s.log.Debug("metadata parsed",
|
||||
"url", bookURL,
|
||||
"slug", meta.Slug,
|
||||
"title", meta.Title,
|
||||
"author", meta.Author,
|
||||
"status", meta.Status,
|
||||
"genres", meta.Genres,
|
||||
"total_chapters", meta.TotalChapters,
|
||||
)
|
||||
return meta, nil
|
||||
}
|
||||
|
||||
// ─── ChapterListProvider ─────────────────────────────────────────────────────
|
||||
|
||||
func (s *Scraper) ChaptersURL(bookURL string) string {
|
||||
return strings.TrimRight(bookURL, "/") + "/chapters"
|
||||
}
|
||||
|
||||
func (s *Scraper) ChapterEntrySelector() scraper.Selector {
|
||||
// <li class="chapter-item"><a href="/book/slug/chapter-1">Chapter 1: Title</a></li>
|
||||
return scraper.Selector{Tag: "li", Class: "chapter-item", Multiple: true}
|
||||
}
|
||||
|
||||
func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scraper.ChapterRef, error) {
|
||||
var refs []scraper.ChapterRef
|
||||
pageURL := s.ChaptersURL(bookURL)
|
||||
// Chapter list URL: {bookURL}/chapters
|
||||
pageURL := strings.TrimRight(bookURL, "/") + "/chapters"
|
||||
page := 1
|
||||
|
||||
for pageURL != "" {
|
||||
@@ -241,28 +258,51 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
|
||||
|
||||
s.log.Info("scraping chapter list", "page", page, "url", pageURL)
|
||||
|
||||
s.log.Debug("chapter list fetch starting",
|
||||
"page", page,
|
||||
"payload_url", pageURL,
|
||||
"payload_wait_selector", ".chapter-list",
|
||||
"payload_wait_selector_timeout_ms", 10000,
|
||||
"payload_wait_for_timeout_ms", 10000,
|
||||
)
|
||||
|
||||
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
|
||||
URL: pageURL,
|
||||
WaitFor: ".chapter-item",
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".chapter-list", Timeout: 10000},
|
||||
WaitForTimeout: 10000,
|
||||
RejectResources: true,
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
})
|
||||
if err != nil {
|
||||
s.log.Debug("chapter list fetch failed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"err", err,
|
||||
)
|
||||
return refs, fmt.Errorf("chapter list page %d: %w", page, err)
|
||||
}
|
||||
s.log.Debug("chapter list fetch completed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"response_bytes", len(raw),
|
||||
)
|
||||
|
||||
root, err := htmlutil.ParseHTML(raw)
|
||||
if err != nil {
|
||||
return refs, fmt.Errorf("chapter list page %d parse: %w", page, err)
|
||||
}
|
||||
|
||||
items := htmlutil.FindAll(root, s.ChapterEntrySelector())
|
||||
chapterList := htmlutil.FindFirst(root, scraper.Selector{Class: "chapter-list"})
|
||||
if chapterList == nil {
|
||||
break
|
||||
}
|
||||
// Each chapter row: <li class="chapter-item"><a href="...">Title</a></li>
|
||||
items := htmlutil.FindAll(chapterList, scraper.Selector{Tag: "li"})
|
||||
for _, item := range items {
|
||||
linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a", Attr: "href"})
|
||||
linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a"})
|
||||
if linkNode == nil {
|
||||
continue
|
||||
}
|
||||
href := htmlutil.ExtractText(linkNode, scraper.Selector{Tag: "a", Attr: "href"})
|
||||
href := htmlutil.ExtractText(linkNode, scraper.Selector{Attr: "href"})
|
||||
chTitle := htmlutil.ExtractText(linkNode, scraper.Selector{})
|
||||
if href == "" {
|
||||
continue
|
||||
@@ -276,8 +316,15 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
|
||||
})
|
||||
}
|
||||
|
||||
// Next page.
|
||||
nextHref := htmlutil.ExtractFirst(root, s.NextPageSelector())
|
||||
s.log.Debug("chapter list page parsed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"chapters_on_page", len(items),
|
||||
"total_refs_so_far", len(refs),
|
||||
)
|
||||
|
||||
// Next page: <a class="next" href="...">
|
||||
nextHref := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "a", Class: "next", Attr: "href"})
|
||||
if nextHref == "" {
|
||||
break
|
||||
}
|
||||
@@ -290,38 +337,105 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
|
||||
|
||||
// ─── ChapterTextProvider ─────────────────────────────────────────────────────
|
||||
|
||||
func (s *Scraper) ChapterTextSelector() scraper.Selector {
|
||||
// <div id="chapter-container"> or <div class="chapter-content">
|
||||
return scraper.Selector{Tag: "div", ID: "chapter-container"}
|
||||
// retryGetContent calls client.GetContent up to maxAttempts times, backing off
|
||||
// exponentially between retries. Only errors that look like transient Browserless
|
||||
// 5xx responses (navigation timeouts, etc.) are retried; context cancellation and
|
||||
// permanent errors are returned immediately.
|
||||
func retryGetContent(
|
||||
ctx context.Context,
|
||||
log *slog.Logger,
|
||||
client browser.BrowserClient,
|
||||
req browser.ContentRequest,
|
||||
maxAttempts int,
|
||||
baseDelay time.Duration,
|
||||
) (string, error) {
|
||||
var lastErr error
|
||||
delay := baseDelay
|
||||
for attempt := 1; attempt <= maxAttempts; attempt++ {
|
||||
html, err := client.GetContent(ctx, req)
|
||||
if err == nil {
|
||||
return html, nil
|
||||
}
|
||||
lastErr = err
|
||||
|
||||
// Stop immediately on context cancellation.
|
||||
if ctx.Err() != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Only retry on Browserless 5xx responses.
|
||||
if !strings.Contains(err.Error(), "unexpected status 5") {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if attempt < maxAttempts {
|
||||
log.Warn("chapter fetch failed, retrying",
|
||||
"url", req.URL,
|
||||
"attempt", attempt,
|
||||
"max_attempts", maxAttempts,
|
||||
"retry_in", delay,
|
||||
"err", err,
|
||||
)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return "", ctx.Err()
|
||||
case <-time.After(delay):
|
||||
}
|
||||
delay *= 2
|
||||
}
|
||||
}
|
||||
return "", lastErr
|
||||
}
|
||||
|
||||
func (s *Scraper) ScrapeChapterText(ctx context.Context, ref scraper.ChapterRef) (scraper.Chapter, error) {
|
||||
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
|
||||
s.log.Debug("chapter text fetch starting",
|
||||
"chapter", ref.Number,
|
||||
"title", ref.Title,
|
||||
"payload_url", ref.URL,
|
||||
"payload_wait_selector", "#content",
|
||||
"payload_wait_selector_timeout_ms", 75000,
|
||||
"payload_wait_for_timeout_ms", 75000,
|
||||
)
|
||||
|
||||
raw, err := retryGetContent(ctx, s.log, s.client, browser.ContentRequest{
|
||||
URL: ref.URL,
|
||||
WaitFor: "#chapter-container",
|
||||
WaitForTimeout: 15000,
|
||||
RejectResources: true,
|
||||
})
|
||||
WaitFor: &browser.WaitForSelector{Selector: "#content", Timeout: 75000},
|
||||
WaitForTimeout: 75000,
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
}, 9, 6*time.Second)
|
||||
if err != nil {
|
||||
s.log.Debug("chapter text fetch failed",
|
||||
"chapter", ref.Number,
|
||||
"url", ref.URL,
|
||||
"err", err,
|
||||
)
|
||||
return scraper.Chapter{}, fmt.Errorf("chapter %d fetch: %w", ref.Number, err)
|
||||
}
|
||||
s.log.Debug("chapter text fetch completed",
|
||||
"chapter", ref.Number,
|
||||
"url", ref.URL,
|
||||
"response_bytes", len(raw),
|
||||
)
|
||||
|
||||
root, err := htmlutil.ParseHTML(raw)
|
||||
if err != nil {
|
||||
return scraper.Chapter{}, fmt.Errorf("chapter %d parse: %w", ref.Number, err)
|
||||
}
|
||||
|
||||
container := htmlutil.FindFirst(root, s.ChapterTextSelector())
|
||||
// <div id="content">…</div>
|
||||
container := htmlutil.FindFirst(root, scraper.Selector{ID: "content"})
|
||||
if container == nil {
|
||||
// Fallback: try class-based selector.
|
||||
container = htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "chapter-content"})
|
||||
}
|
||||
if container == nil {
|
||||
return scraper.Chapter{}, fmt.Errorf("chapter %d: content container not found in %s", ref.Number, ref.URL)
|
||||
return scraper.Chapter{}, fmt.Errorf("chapter %d: #content container not found in %s", ref.Number, ref.URL)
|
||||
}
|
||||
|
||||
text := htmlutil.NodeToMarkdown(container)
|
||||
|
||||
s.log.Debug("chapter text parsed",
|
||||
"chapter", ref.Number,
|
||||
"url", ref.URL,
|
||||
"text_bytes", len(text),
|
||||
)
|
||||
|
||||
return scraper.Chapter{
|
||||
Ref: ref,
|
||||
Text: text,
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
package htmlutil
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/libnovel/scraper/internal/scraper"
|
||||
@@ -150,13 +151,20 @@ func InnerHTML(n *html.Node) string {
|
||||
|
||||
// NodeToMarkdown converts the children of an HTML node to a plain-text/Markdown
|
||||
// representation suitable for chapter storage. Block elements become newlines;
|
||||
// inline elements are inlined.
|
||||
// inline elements are inlined. Runs of more than one blank line are collapsed
|
||||
// to a single blank line.
|
||||
func NodeToMarkdown(n *html.Node) string {
|
||||
var sb strings.Builder
|
||||
nodeToMD(n, &sb)
|
||||
return strings.TrimSpace(sb.String())
|
||||
// Collapse 3+ consecutive newlines (i.e. more than one blank line) to 2.
|
||||
out := multiBlankLine.ReplaceAllString(sb.String(), "\n\n")
|
||||
return strings.TrimSpace(out)
|
||||
}
|
||||
|
||||
// multiBlankLine matches three or more consecutive newline characters
|
||||
// (any mix of \n and surrounding whitespace-only lines).
|
||||
var multiBlankLine = regexp.MustCompile(`\n(\s*\n){2,}`)
|
||||
|
||||
var blockElements = map[string]bool{
|
||||
"p": true, "div": true, "br": true, "h1": true, "h2": true,
|
||||
"h3": true, "h4": true, "h5": true, "h6": true, "li": true,
|
||||
|
||||
@@ -81,18 +81,6 @@ type Selector struct {
|
||||
// CatalogueProvider can enumerate every novel available on a source site.
|
||||
// It handles pagination transparently and streams CatalogueEntry values.
|
||||
type CatalogueProvider interface {
|
||||
// CatalogueURL returns the root URL of the catalogue listing.
|
||||
CatalogueURL() string
|
||||
|
||||
// EntriesSelector returns the selector that matches each novel card / row
|
||||
// in the catalogue listing page.
|
||||
EntriesSelector() Selector
|
||||
|
||||
// NextPageSelector returns the selector for the "next page" link.
|
||||
// If the current page has no next page the implementation must return
|
||||
// ("", nil) from ScrapeNextPage.
|
||||
NextPageSelector() Selector
|
||||
|
||||
// ScrapeCatalogue pages through the entire catalogue, sending
|
||||
// CatalogueEntry values to the returned channel. The channel is closed
|
||||
// when all pages have been scraped or ctx is cancelled.
|
||||
@@ -103,25 +91,12 @@ type CatalogueProvider interface {
|
||||
|
||||
// MetadataProvider can extract structured book metadata from a novel's landing page.
|
||||
type MetadataProvider interface {
|
||||
// MetadataSelectors returns a map of field name → Selector used to
|
||||
// locate each metadata element on the book page.
|
||||
// Required keys: "title", "author".
|
||||
// Optional keys: "cover", "status", "genres", "summary", "total_chapters".
|
||||
MetadataSelectors() map[string]Selector
|
||||
|
||||
// ScrapeMetadata fetches and parses the metadata for the book at bookURL.
|
||||
ScrapeMetadata(ctx context.Context, bookURL string) (BookMeta, error)
|
||||
}
|
||||
|
||||
// ChapterListProvider can enumerate all chapters of a book from the chapter-list page.
|
||||
type ChapterListProvider interface {
|
||||
// ChaptersURL derives the chapter-list URL from a book landing-page URL.
|
||||
ChaptersURL(bookURL string) string
|
||||
|
||||
// ChapterEntrySelector returns the selector that matches each chapter row
|
||||
// in the chapter list page.
|
||||
ChapterEntrySelector() Selector
|
||||
|
||||
// ScrapeChapterList returns all chapter references for a book, ordered
|
||||
// by chapter number ascending.
|
||||
ScrapeChapterList(ctx context.Context, bookURL string) ([]ChapterRef, error)
|
||||
@@ -129,9 +104,6 @@ type ChapterListProvider interface {
|
||||
|
||||
// ChapterTextProvider can extract the readable text from a single chapter
// page.
type ChapterTextProvider interface {
	// ChapterTextSelector returns the selector that wraps the chapter body.
	ChapterTextSelector() Selector

	// ScrapeChapterText fetches the page referenced by ref and returns the
	// chapter text as Markdown.
	ScrapeChapterText(ctx context.Context, ref ChapterRef) (Chapter, error)
}
|
||||
|
||||
@@ -18,6 +18,7 @@ import (
|
||||
|
||||
"github.com/libnovel/scraper/internal/orchestrator"
|
||||
"github.com/libnovel/scraper/internal/scraper"
|
||||
"github.com/libnovel/scraper/internal/writer"
|
||||
)
|
||||
|
||||
// Server wraps an HTTP mux with the scraping endpoints.
|
||||
@@ -26,6 +27,7 @@ type Server struct {
|
||||
oCfg orchestrator.Config
|
||||
novel scraper.NovelScraper
|
||||
log *slog.Logger
|
||||
writer *writer.Writer
|
||||
mu sync.Mutex
|
||||
running bool
|
||||
}
|
||||
@@ -37,6 +39,7 @@ func New(addr string, oCfg orchestrator.Config, novel scraper.NovelScraper, log
|
||||
oCfg: oCfg,
|
||||
novel: novel,
|
||||
log: log,
|
||||
writer: writer.New(oCfg.StaticRoot),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,6 +50,12 @@ func (s *Server) ListenAndServe(ctx context.Context) error {
|
||||
mux.HandleFunc("GET /health", s.handleHealth)
|
||||
mux.HandleFunc("POST /scrape", s.handleScrapeCatalogue)
|
||||
mux.HandleFunc("POST /scrape/book", s.handleScrapeBook)
|
||||
// UI routes
|
||||
mux.HandleFunc("GET /", s.handleHome)
|
||||
mux.HandleFunc("GET /books/{slug}", s.handleBook)
|
||||
mux.HandleFunc("GET /books/{slug}/chapters/{n}", s.handleChapter)
|
||||
mux.HandleFunc("POST /ui/scrape/book", s.handleUIScrapeBook)
|
||||
mux.HandleFunc("GET /ui/scrape/status", s.handleUIScrapeStatus)
|
||||
|
||||
srv := &http.Server{
|
||||
Addr: s.addr,
|
||||
|
||||
@@ -19,6 +19,8 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/libnovel/scraper/internal/scraper"
|
||||
@@ -113,7 +115,101 @@ func (w *Writer) WriteChapter(slug string, chapter scraper.Chapter) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ─── Path helpers ─────────────────────────────────────────────────────────────
|
||||
// ─── Catalogue helpers ────────────────────────────────────────────────────────
|
||||
|
||||
// ListBooks returns metadata for every book that has a metadata.yaml under root.
|
||||
// Books with unreadable metadata files are silently skipped.
|
||||
func (w *Writer) ListBooks() ([]scraper.BookMeta, error) {
|
||||
entries, err := os.ReadDir(w.root)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, fmt.Errorf("writer: list books: %w", err)
|
||||
}
|
||||
var books []scraper.BookMeta
|
||||
for _, e := range entries {
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
meta, ok, _ := w.ReadMetadata(e.Name())
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
books = append(books, meta)
|
||||
}
|
||||
sort.Slice(books, func(i, j int) bool {
|
||||
return books[i].Title < books[j].Title
|
||||
})
|
||||
return books, nil
|
||||
}
|
||||
|
||||
// ChapterInfo is a lightweight chapter descriptor derived from on-disk
// markdown files (see ListChapters); it carries no chapter body.
type ChapterInfo struct {
	// Number is the chapter number parsed from the "chapter-N.md" filename.
	Number int
	// Title is the first non-empty line of the markdown file (without the
	// leading "# " heading marker), falling back to "Chapter N".
	Title string
}
|
||||
|
||||
// ListChapters returns all chapters on disk for slug, sorted by number.
|
||||
func (w *Writer) ListChapters(slug string) ([]ChapterInfo, error) {
|
||||
bookDir := w.bookDir(slug)
|
||||
var chapters []ChapterInfo
|
||||
|
||||
// Walk vol-*/range-*/ directories.
|
||||
volDirs, err := filepath.Glob(filepath.Join(bookDir, "vol-*"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("writer: list chapters glob: %w", err)
|
||||
}
|
||||
for _, vd := range volDirs {
|
||||
rangeDirs, _ := filepath.Glob(filepath.Join(vd, "*-*"))
|
||||
for _, rd := range rangeDirs {
|
||||
files, _ := filepath.Glob(filepath.Join(rd, "chapter-*.md"))
|
||||
for _, f := range files {
|
||||
base := filepath.Base(f) // chapter-N.md
|
||||
numStr := strings.TrimSuffix(strings.TrimPrefix(base, "chapter-"), ".md")
|
||||
n, err := strconv.Atoi(numStr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
title := chapterTitle(f, n)
|
||||
chapters = append(chapters, ChapterInfo{Number: n, Title: title})
|
||||
}
|
||||
}
|
||||
}
|
||||
sort.Slice(chapters, func(i, j int) bool {
|
||||
return chapters[i].Number < chapters[j].Number
|
||||
})
|
||||
return chapters, nil
|
||||
}
|
||||
|
||||
// chapterTitle reads the first non-empty line of the markdown file at path
// and strips a leading "# " heading marker. It falls back to "Chapter N"
// when the file cannot be read or contains no non-empty line.
func chapterTitle(path string, n int) string {
	data, err := os.ReadFile(path)
	if err != nil {
		return fmt.Sprintf("Chapter %d", n)
	}
	// Scan every line instead of SplitN(..., 10): with a split limit the
	// final element holds the entire remainder of the file, so a file whose
	// first nine lines were blank would have returned a trimmed multi-line
	// blob as the "title".
	for _, line := range strings.Split(string(data), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		return strings.TrimPrefix(line, "# ")
	}
	return fmt.Sprintf("Chapter %d", n)
}
|
||||
|
||||
// ReadChapter returns the raw markdown content for chapter number n of slug.
|
||||
func (w *Writer) ReadChapter(slug string, n int) (string, error) {
|
||||
// Reconstruct path using the same bucketing formula as chapterPath.
|
||||
ref := scraper.ChapterRef{Number: n, Volume: 0}
|
||||
path := w.chapterPath(slug, ref)
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("writer: read chapter %d: %w", n, err)
|
||||
}
|
||||
return string(data), nil
|
||||
}
|
||||
|
||||
// bookDir returns the root directory for a book slug.
|
||||
func (w *Writer) bookDir(slug string) string {
|
||||
|
||||
Reference in New Issue
Block a user