feat: add exponential backoff, some UI elements to see the resut of a scrape
This commit is contained in:
28
.gitignore
vendored
Normal file
28
.gitignore
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
# ── Compiled binary ────────────────────────────────────────────────────────────
|
||||
/scraper
|
||||
/scraper-*
|
||||
|
||||
# ── Go toolchain ───────────────────────────────────────────────────────────────
|
||||
*.test
|
||||
*.out
|
||||
/vendor/
|
||||
/dist/
|
||||
|
||||
# ── Scraped output (large, machine-generated) ──────────────────────────────────
|
||||
|
||||
/static/books
|
||||
# ── Environment & secrets ──────────────────────────────────────────────────────
|
||||
.env
|
||||
.env.*
|
||||
!.env.example
|
||||
|
||||
# ── OS artefacts ───────────────────────────────────────────────────────────────
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# ── Editor / IDE ───────────────────────────────────────────────────────────────
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
@@ -13,9 +13,11 @@
|
||||
// BROWSERLESS_URL Browserless base URL (default: http://localhost:3000)
|
||||
// BROWSERLESS_TOKEN Browserless API token (default: "")
|
||||
// BROWSERLESS_STRATEGY content | scrape | cdp (default: content)
|
||||
// BROWSERLESS_MAX_CONCURRENT Max simultaneous browser sessions (default: 5)
|
||||
// SCRAPER_WORKERS Chapter goroutine count (default: NumCPU)
|
||||
// SCRAPER_STATIC_ROOT Output directory (default: ./static/books)
|
||||
// SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
|
||||
// LOG_LEVEL debug | info | warn | error (default: info)
|
||||
package main
|
||||
|
||||
import (
|
||||
@@ -36,8 +38,14 @@ import (
|
||||
)
|
||||
|
||||
func main() {
|
||||
logLevel := slog.LevelInfo
|
||||
if v := os.Getenv("LOG_LEVEL"); v != "" {
|
||||
if err := logLevel.UnmarshalText([]byte(v)); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "invalid LOG_LEVEL %q, using info\n", v)
|
||||
}
|
||||
}
|
||||
log := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
|
||||
Level: slog.LevelInfo,
|
||||
Level: logLevel,
|
||||
}))
|
||||
|
||||
if err := run(log); err != nil {
|
||||
@@ -59,6 +67,12 @@ func run(log *slog.Logger) error {
|
||||
BaseURL: envOr("BROWSERLESS_URL", "http://localhost:3000"),
|
||||
Token: envOr("BROWSERLESS_TOKEN", ""),
|
||||
}
|
||||
browserCfg.MaxConcurrent = 5
|
||||
if s := os.Getenv("BROWSERLESS_MAX_CONCURRENT"); s != "" {
|
||||
if n, err := strconv.Atoi(s); err == nil && n > 0 {
|
||||
browserCfg.MaxConcurrent = n
|
||||
}
|
||||
}
|
||||
|
||||
strategy := browser.Strategy(strings.ToLower(envOr("BROWSERLESS_STRATEGY", string(browser.StrategyContent))))
|
||||
bc := newBrowserClient(strategy, browserCfg)
|
||||
@@ -93,6 +107,7 @@ func run(log *slog.Logger) error {
|
||||
log.Info("starting one-shot scrape",
|
||||
"strategy", strategy,
|
||||
"workers", workers,
|
||||
"max_concurrent", browserCfg.MaxConcurrent,
|
||||
"static_root", oCfg.StaticRoot,
|
||||
"single_book", oCfg.SingleBookURL,
|
||||
)
|
||||
@@ -105,6 +120,7 @@ func run(log *slog.Logger) error {
|
||||
"addr", addr,
|
||||
"strategy", strategy,
|
||||
"workers", workers,
|
||||
"max_concurrent", browserCfg.MaxConcurrent,
|
||||
)
|
||||
srv := server.New(addr, oCfg, nf, log)
|
||||
return srv.ListenAndServe(ctx)
|
||||
@@ -143,8 +159,10 @@ Environment variables:
|
||||
BROWSERLESS_URL Browserless base URL (default: http://localhost:3000)
|
||||
BROWSERLESS_TOKEN API token (default: "")
|
||||
BROWSERLESS_STRATEGY content | scrape | cdp (default: content)
|
||||
BROWSERLESS_MAX_CONCURRENT Max simultaneous sessions (default: 5)
|
||||
SCRAPER_WORKERS Chapter goroutines (default: NumCPU = %d)
|
||||
SCRAPER_STATIC_ROOT Output directory (default: ./static/books)
|
||||
SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
|
||||
LOG_LEVEL debug|info|warn|error (default: info)
|
||||
`, runtime.NumCPU())
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ go 1.25.0
|
||||
|
||||
require (
|
||||
github.com/gorilla/websocket v1.5.3 // indirect
|
||||
github.com/yuin/goldmark v1.7.16 // indirect
|
||||
golang.org/x/net v0.51.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
|
||||
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||
github.com/yuin/goldmark v1.7.16 h1:n+CJdUxaFMiDUNnWC3dMWCIQJSkxH4uz3ZwQBkAlVNE=
|
||||
github.com/yuin/goldmark v1.7.16/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
|
||||
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
|
||||
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
// cdpClient implements BrowserClient using the CDP WebSocket endpoint.
|
||||
type cdpClient struct {
|
||||
cfg Config
|
||||
sem chan struct{}
|
||||
}
|
||||
|
||||
// NewCDPClient returns a BrowserClient that uses CDP WebSocket sessions.
|
||||
@@ -22,7 +23,7 @@ func NewCDPClient(cfg Config) BrowserClient {
|
||||
if cfg.Timeout == 0 {
|
||||
cfg.Timeout = 60 * time.Second
|
||||
}
|
||||
return &cdpClient{cfg: cfg}
|
||||
return &cdpClient{cfg: cfg, sem: makeSem(cfg.MaxConcurrent)}
|
||||
}
|
||||
|
||||
func (c *cdpClient) Strategy() Strategy { return StrategyCDP }
|
||||
@@ -38,6 +39,11 @@ func (c *cdpClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeRespon
|
||||
// CDPSession opens a WebSocket to the Browserless /devtools/browser endpoint,
|
||||
// navigates to pageURL, and invokes fn with a live CDPConn.
|
||||
func (c *cdpClient) CDPSession(ctx context.Context, pageURL string, fn CDPSessionFunc) error {
|
||||
if err := acquire(ctx, c.sem); err != nil {
|
||||
return fmt.Errorf("cdp: semaphore: %w", err)
|
||||
}
|
||||
defer release(c.sem)
|
||||
|
||||
// Build WebSocket URL: ws://host:port/devtools/browser?token=...&url=...
|
||||
wsURL := strings.Replace(c.cfg.BaseURL, "http://", "ws://", 1)
|
||||
wsURL = strings.Replace(wsURL, "https://", "wss://", 1)
|
||||
|
||||
@@ -18,12 +18,48 @@ type Config struct {
|
||||
Token string
|
||||
// Timeout is the per-request HTTP timeout; defaults to 60 s.
|
||||
Timeout time.Duration
|
||||
// MaxConcurrent caps the number of simultaneous in-flight requests sent to
|
||||
// Browserless. When all slots are occupied new calls block until one
|
||||
// completes (or ctx is cancelled). 0 means no limit.
|
||||
MaxConcurrent int
|
||||
}
|
||||
|
||||
// makeSem returns a buffered channel used as a counting semaphore.
|
||||
// If n <= 0 a nil channel is returned, which causes acquire/release to be no-ops.
|
||||
func makeSem(n int) chan struct{} {
|
||||
if n <= 0 {
|
||||
return nil
|
||||
}
|
||||
return make(chan struct{}, n)
|
||||
}
|
||||
|
||||
// acquire takes one slot from sem. It returns an error if ctx is cancelled
|
||||
// before a slot becomes available. If sem is nil it returns immediately.
|
||||
func acquire(ctx context.Context, sem chan struct{}) error {
|
||||
if sem == nil {
|
||||
return nil
|
||||
}
|
||||
select {
|
||||
case sem <- struct{}{}:
|
||||
return nil
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
|
||||
// release frees the slot previously obtained by acquire.
|
||||
// If sem is nil it is a no-op.
|
||||
func release(sem chan struct{}) {
|
||||
if sem != nil {
|
||||
<-sem
|
||||
}
|
||||
}
|
||||
|
||||
// contentClient implements BrowserClient using the /content endpoint.
|
||||
type contentClient struct {
|
||||
cfg Config
|
||||
http *http.Client
|
||||
sem chan struct{}
|
||||
}
|
||||
|
||||
// NewContentClient returns a BrowserClient that uses POST /content.
|
||||
@@ -34,12 +70,18 @@ func NewContentClient(cfg Config) BrowserClient {
|
||||
return &contentClient{
|
||||
cfg: cfg,
|
||||
http: &http.Client{Timeout: cfg.Timeout},
|
||||
sem: makeSem(cfg.MaxConcurrent),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *contentClient) Strategy() Strategy { return StrategyContent }
|
||||
|
||||
func (c *contentClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
|
||||
if err := acquire(ctx, c.sem); err != nil {
|
||||
return "", fmt.Errorf("content: semaphore: %w", err)
|
||||
}
|
||||
defer release(c.sem)
|
||||
|
||||
body, err := json.Marshal(req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("content: marshal request: %w", err)
|
||||
@@ -87,6 +129,7 @@ func (c *contentClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc
|
||||
type scrapeClient struct {
|
||||
cfg Config
|
||||
http *http.Client
|
||||
sem chan struct{}
|
||||
}
|
||||
|
||||
// NewScrapeClient returns a BrowserClient that uses POST /scrape.
|
||||
@@ -97,6 +140,7 @@ func NewScrapeClient(cfg Config) BrowserClient {
|
||||
return &scrapeClient{
|
||||
cfg: cfg,
|
||||
http: &http.Client{Timeout: cfg.Timeout},
|
||||
sem: makeSem(cfg.MaxConcurrent),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,6 +151,11 @@ func (c *scrapeClient) GetContent(_ context.Context, _ ContentRequest) (string,
|
||||
}
|
||||
|
||||
func (c *scrapeClient) ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error) {
|
||||
if err := acquire(ctx, c.sem); err != nil {
|
||||
return ScrapeResponse{}, fmt.Errorf("scrape: semaphore: %w", err)
|
||||
}
|
||||
defer release(c.sem)
|
||||
|
||||
body, err := json.Marshal(req)
|
||||
if err != nil {
|
||||
return ScrapeResponse{}, fmt.Errorf("scrape: marshal request: %w", err)
|
||||
|
||||
@@ -23,12 +23,18 @@ const (
|
||||
StrategyCDP Strategy = "cdp"
|
||||
)
|
||||
|
||||
// WaitForSelector describes the waitForSelector option sent to Browserless.
|
||||
type WaitForSelector struct {
|
||||
Selector string `json:"selector"`
|
||||
Timeout int `json:"timeout,omitempty"` // ms
|
||||
}
|
||||
|
||||
// ContentRequest is the body sent to POST /content.
|
||||
type ContentRequest struct {
|
||||
URL string `json:"url"`
|
||||
WaitFor string `json:"waitForSelector,omitempty"`
|
||||
WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`
|
||||
WaitForTimeout int `json:"waitForTimeout,omitempty"` // ms
|
||||
RejectResources bool `json:"rejectResources,omitempty"`
|
||||
RejectResourceTypes []string `json:"rejectResourceTypes,omitempty"` // e.g. ["image","stylesheet"]
|
||||
}
|
||||
|
||||
// ScrapeElement is one element descriptor inside a ScrapeRequest.
|
||||
@@ -41,7 +47,7 @@ type ScrapeElement struct {
|
||||
type ScrapeRequest struct {
|
||||
URL string `json:"url"`
|
||||
Elements []ScrapeElement `json:"elements"`
|
||||
WaitFor string `json:"waitForSelector,omitempty"`
|
||||
WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`
|
||||
}
|
||||
|
||||
// ScrapeResult is one entry in the response from POST /scrape.
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/libnovel/scraper/internal/browser"
|
||||
"github.com/libnovel/scraper/internal/scraper"
|
||||
@@ -26,6 +27,27 @@ const (
|
||||
cataloguePath = "/genre-all/sort-new/status-all/all-novel"
|
||||
)
|
||||
|
||||
// rejectResourceTypes lists Browserless resource types to block on every request.
|
||||
// We keep: document (the page), script (JS renders the DOM), fetch/xhr (JS data calls).
|
||||
// Everything else is safe to drop for HTML-only scraping.
|
||||
var rejectResourceTypes = []string{
|
||||
"cspviolationreport",
|
||||
"eventsource",
|
||||
"fedcm",
|
||||
"font",
|
||||
"image",
|
||||
"manifest",
|
||||
"media",
|
||||
"other",
|
||||
"ping",
|
||||
"prefetch",
|
||||
"preflight",
|
||||
"signedexchange",
|
||||
"stylesheet",
|
||||
"texttrack",
|
||||
"websocket",
|
||||
}
|
||||
|
||||
// Scraper is the novelfire.net implementation of scraper.NovelScraper.
|
||||
// It uses the /content strategy by default (rendered HTML via Browserless).
|
||||
type Scraper struct {
|
||||
@@ -46,20 +68,6 @@ func (s *Scraper) SourceName() string { return "novelfire.net" }
|
||||
|
||||
// ─── CatalogueProvider ───────────────────────────────────────────────────────
|
||||
|
||||
func (s *Scraper) CatalogueURL() string {
|
||||
return baseURL + cataloguePath
|
||||
}
|
||||
|
||||
func (s *Scraper) EntriesSelector() scraper.Selector {
|
||||
// Each novel card: <div class="novel-item">
|
||||
return scraper.Selector{Tag: "div", Class: "novel-item", Multiple: true}
|
||||
}
|
||||
|
||||
func (s *Scraper) NextPageSelector() scraper.Selector {
|
||||
// <a class="next" href="...">
|
||||
return scraper.Selector{Tag: "a", Class: "next", Attr: "href"}
|
||||
}
|
||||
|
||||
// ScrapeCatalogue streams all CatalogueEntry values across all pages.
|
||||
func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.CatalogueEntry, <-chan error) {
|
||||
entries := make(chan scraper.CatalogueEntry, 64)
|
||||
@@ -69,7 +77,7 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
defer close(entries)
|
||||
defer close(errs)
|
||||
|
||||
pageURL := s.CatalogueURL()
|
||||
pageURL := baseURL + cataloguePath
|
||||
page := 1
|
||||
|
||||
for pageURL != "" {
|
||||
@@ -80,17 +88,34 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
}
|
||||
|
||||
s.log.Info("scraping catalogue page", "page", page, "url", pageURL)
|
||||
s.log.Debug("catalogue page fetch starting",
|
||||
"page", page,
|
||||
"payload_url", pageURL,
|
||||
"payload_wait_selector", ".novel-item",
|
||||
"payload_wait_selector_timeout_ms", 10000,
|
||||
"payload_wait_for_timeout_ms", 10000,
|
||||
)
|
||||
|
||||
html, err := s.client.GetContent(ctx, browser.ContentRequest{
|
||||
URL: pageURL,
|
||||
WaitFor: ".novel-item",
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".novel-item", Timeout: 10000},
|
||||
WaitForTimeout: 10000,
|
||||
RejectResources: true,
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
})
|
||||
if err != nil {
|
||||
s.log.Debug("catalogue page fetch failed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"err", err,
|
||||
)
|
||||
errs <- fmt.Errorf("catalogue page %d: %w", page, err)
|
||||
return
|
||||
}
|
||||
s.log.Debug("catalogue page fetch completed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"response_bytes", len(html),
|
||||
)
|
||||
|
||||
root, err := htmlutil.ParseHTML(html)
|
||||
if err != nil {
|
||||
@@ -98,8 +123,8 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
return
|
||||
}
|
||||
|
||||
// Extract novel cards.
|
||||
cards := htmlutil.FindAll(root, s.EntriesSelector())
|
||||
// Extract novel cards: <div class="novel-item">
|
||||
cards := htmlutil.FindAll(root, scraper.Selector{Tag: "div", Class: "novel-item", Multiple: true})
|
||||
if len(cards) == 0 {
|
||||
s.log.Warn("no novel cards found, stopping pagination", "page", page)
|
||||
return
|
||||
@@ -107,8 +132,7 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
|
||||
for _, card := range cards {
|
||||
// Title: <h3 class="novel-title"><a href="/book/slug">Title</a>
|
||||
titleSel := scraper.Selector{Tag: "h3", Class: "novel-title"}
|
||||
titleNode := htmlutil.FindFirst(card, titleSel)
|
||||
titleNode := htmlutil.FindFirst(card, scraper.Selector{Tag: "h3", Class: "novel-title"})
|
||||
|
||||
var title, href string
|
||||
if titleNode != nil {
|
||||
@@ -122,7 +146,6 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
continue
|
||||
}
|
||||
|
||||
// Resolve relative URL.
|
||||
bookURL := resolveURL(baseURL, href)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
@@ -131,8 +154,8 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
}
|
||||
}
|
||||
|
||||
// Find next page link.
|
||||
nextHref := htmlutil.ExtractFirst(root, s.NextPageSelector())
|
||||
// Find next page link: <a class="next" href="...">
|
||||
nextHref := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "a", Class: "next", Attr: "href"})
|
||||
if nextHref == "" {
|
||||
break
|
||||
}
|
||||
@@ -146,64 +169,56 @@ func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.Catalogue
|
||||
|
||||
// ─── MetadataProvider ────────────────────────────────────────────────────────
|
||||
|
||||
func (s *Scraper) MetadataSelectors() map[string]scraper.Selector {
|
||||
return map[string]scraper.Selector{
|
||||
// <h1 class="novel-title">Title</h1>
|
||||
"title": {Tag: "h1", Class: "novel-title"},
|
||||
// <span class="author"><a>Author Name</a></span>
|
||||
"author": {Tag: "span", Class: "author"},
|
||||
// <img class="cover" src="...">
|
||||
"cover": {Tag: "img", Class: "cover", Attr: "src"},
|
||||
// <span class="status">Ongoing</span>
|
||||
"status": {Tag: "span", Class: "status"},
|
||||
// <div class="genres"><a>Tag1</a><a>Tag2</a>…</div>
|
||||
"genres": {Tag: "div", Class: "genres", Multiple: true},
|
||||
// <div class="summary"><p>...</p></div>
|
||||
"summary": {Tag: "div", Class: "summary"},
|
||||
// <span class="chapter-count">123 Chapters</span>
|
||||
"total_chapters": {Tag: "span", Class: "chapter-count"},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.BookMeta, error) {
|
||||
s.log.Debug("metadata fetch starting",
|
||||
"payload_url", bookURL,
|
||||
"payload_wait_selector", ".novel-title",
|
||||
"payload_wait_selector_timeout_ms", 10000,
|
||||
"payload_wait_for_timeout_ms", 10000,
|
||||
)
|
||||
|
||||
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
|
||||
URL: bookURL,
|
||||
WaitFor: ".novel-title",
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".novel-title", Timeout: 10000},
|
||||
WaitForTimeout: 10000,
|
||||
RejectResources: true,
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
})
|
||||
if err != nil {
|
||||
s.log.Debug("metadata fetch failed", "url", bookURL, "err", err)
|
||||
return scraper.BookMeta{}, fmt.Errorf("metadata fetch %s: %w", bookURL, err)
|
||||
}
|
||||
s.log.Debug("metadata fetch completed", "url", bookURL, "response_bytes", len(raw))
|
||||
|
||||
root, err := htmlutil.ParseHTML(raw)
|
||||
if err != nil {
|
||||
return scraper.BookMeta{}, fmt.Errorf("metadata parse %s: %w", bookURL, err)
|
||||
}
|
||||
|
||||
sels := s.MetadataSelectors()
|
||||
// <h1 class="novel-title">Title</h1>
|
||||
title := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "h1", Class: "novel-title"})
|
||||
// <span class="author"><a>Author Name</a></span>
|
||||
author := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "author"})
|
||||
// <img class="cover" src="...">
|
||||
cover := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "img", Class: "cover", Attr: "src"})
|
||||
// <span class="status">Ongoing</span>
|
||||
status := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "status"})
|
||||
|
||||
title := htmlutil.ExtractFirst(root, sels["title"])
|
||||
author := htmlutil.ExtractFirst(root, sels["author"])
|
||||
cover := htmlutil.ExtractFirst(root, sels["cover"])
|
||||
status := htmlutil.ExtractFirst(root, sels["status"])
|
||||
|
||||
// Genres: all <a> tags inside the genres div.
|
||||
genresNode := htmlutil.FindFirst(root, sels["genres"])
|
||||
// Genres: all <a> tags inside <div class="genres">
|
||||
genresNode := htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "genres"})
|
||||
var genres []string
|
||||
if genresNode != nil {
|
||||
genres = htmlutil.ExtractAll(genresNode, scraper.Selector{Tag: "a", Multiple: true})
|
||||
}
|
||||
|
||||
summary := htmlutil.ExtractFirst(root, sels["summary"])
|
||||
|
||||
totalStr := htmlutil.ExtractFirst(root, sels["total_chapters"])
|
||||
// <div class="summary"><p>...</p></div>
|
||||
summary := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "div", Class: "summary"})
|
||||
// <span class="chapter-count">123 Chapters</span>
|
||||
totalStr := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "chapter-count"})
|
||||
totalChapters := parseChapterCount(totalStr)
|
||||
|
||||
// Derive slug from URL.
|
||||
slug := slugFromURL(bookURL)
|
||||
|
||||
return scraper.BookMeta{
|
||||
meta := scraper.BookMeta{
|
||||
Slug: slug,
|
||||
Title: title,
|
||||
Author: author,
|
||||
@@ -213,23 +228,25 @@ func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.B
|
||||
Summary: summary,
|
||||
TotalChapters: totalChapters,
|
||||
SourceURL: bookURL,
|
||||
}, nil
|
||||
}
|
||||
s.log.Debug("metadata parsed",
|
||||
"url", bookURL,
|
||||
"slug", meta.Slug,
|
||||
"title", meta.Title,
|
||||
"author", meta.Author,
|
||||
"status", meta.Status,
|
||||
"genres", meta.Genres,
|
||||
"total_chapters", meta.TotalChapters,
|
||||
)
|
||||
return meta, nil
|
||||
}
|
||||
|
||||
// ─── ChapterListProvider ─────────────────────────────────────────────────────
|
||||
|
||||
func (s *Scraper) ChaptersURL(bookURL string) string {
|
||||
return strings.TrimRight(bookURL, "/") + "/chapters"
|
||||
}
|
||||
|
||||
func (s *Scraper) ChapterEntrySelector() scraper.Selector {
|
||||
// <li class="chapter-item"><a href="/book/slug/chapter-1">Chapter 1: Title</a></li>
|
||||
return scraper.Selector{Tag: "li", Class: "chapter-item", Multiple: true}
|
||||
}
|
||||
|
||||
func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scraper.ChapterRef, error) {
|
||||
var refs []scraper.ChapterRef
|
||||
pageURL := s.ChaptersURL(bookURL)
|
||||
// Chapter list URL: {bookURL}/chapters
|
||||
pageURL := strings.TrimRight(bookURL, "/") + "/chapters"
|
||||
page := 1
|
||||
|
||||
for pageURL != "" {
|
||||
@@ -241,28 +258,51 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
|
||||
|
||||
s.log.Info("scraping chapter list", "page", page, "url", pageURL)
|
||||
|
||||
s.log.Debug("chapter list fetch starting",
|
||||
"page", page,
|
||||
"payload_url", pageURL,
|
||||
"payload_wait_selector", ".chapter-list",
|
||||
"payload_wait_selector_timeout_ms", 10000,
|
||||
"payload_wait_for_timeout_ms", 10000,
|
||||
)
|
||||
|
||||
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
|
||||
URL: pageURL,
|
||||
WaitFor: ".chapter-item",
|
||||
WaitFor: &browser.WaitForSelector{Selector: ".chapter-list", Timeout: 10000},
|
||||
WaitForTimeout: 10000,
|
||||
RejectResources: true,
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
})
|
||||
if err != nil {
|
||||
s.log.Debug("chapter list fetch failed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"err", err,
|
||||
)
|
||||
return refs, fmt.Errorf("chapter list page %d: %w", page, err)
|
||||
}
|
||||
s.log.Debug("chapter list fetch completed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"response_bytes", len(raw),
|
||||
)
|
||||
|
||||
root, err := htmlutil.ParseHTML(raw)
|
||||
if err != nil {
|
||||
return refs, fmt.Errorf("chapter list page %d parse: %w", page, err)
|
||||
}
|
||||
|
||||
items := htmlutil.FindAll(root, s.ChapterEntrySelector())
|
||||
chapterList := htmlutil.FindFirst(root, scraper.Selector{Class: "chapter-list"})
|
||||
if chapterList == nil {
|
||||
break
|
||||
}
|
||||
// Each chapter row: <li class="chapter-item"><a href="...">Title</a></li>
|
||||
items := htmlutil.FindAll(chapterList, scraper.Selector{Tag: "li"})
|
||||
for _, item := range items {
|
||||
linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a", Attr: "href"})
|
||||
linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a"})
|
||||
if linkNode == nil {
|
||||
continue
|
||||
}
|
||||
href := htmlutil.ExtractText(linkNode, scraper.Selector{Tag: "a", Attr: "href"})
|
||||
href := htmlutil.ExtractText(linkNode, scraper.Selector{Attr: "href"})
|
||||
chTitle := htmlutil.ExtractText(linkNode, scraper.Selector{})
|
||||
if href == "" {
|
||||
continue
|
||||
@@ -276,8 +316,15 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
|
||||
})
|
||||
}
|
||||
|
||||
// Next page.
|
||||
nextHref := htmlutil.ExtractFirst(root, s.NextPageSelector())
|
||||
s.log.Debug("chapter list page parsed",
|
||||
"page", page,
|
||||
"url", pageURL,
|
||||
"chapters_on_page", len(items),
|
||||
"total_refs_so_far", len(refs),
|
||||
)
|
||||
|
||||
// Next page: <a class="next" href="...">
|
||||
nextHref := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "a", Class: "next", Attr: "href"})
|
||||
if nextHref == "" {
|
||||
break
|
||||
}
|
||||
@@ -290,38 +337,105 @@ func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scra
|
||||
|
||||
// ─── ChapterTextProvider ─────────────────────────────────────────────────────
|
||||
|
||||
func (s *Scraper) ChapterTextSelector() scraper.Selector {
|
||||
// <div id="chapter-container"> or <div class="chapter-content">
|
||||
return scraper.Selector{Tag: "div", ID: "chapter-container"}
|
||||
// retryGetContent calls client.GetContent up to maxAttempts times, backing off
|
||||
// exponentially between retries. Only errors that look like transient Browserless
|
||||
// 5xx responses (navigation timeouts, etc.) are retried; context cancellation and
|
||||
// permanent errors are returned immediately.
|
||||
func retryGetContent(
|
||||
ctx context.Context,
|
||||
log *slog.Logger,
|
||||
client browser.BrowserClient,
|
||||
req browser.ContentRequest,
|
||||
maxAttempts int,
|
||||
baseDelay time.Duration,
|
||||
) (string, error) {
|
||||
var lastErr error
|
||||
delay := baseDelay
|
||||
for attempt := 1; attempt <= maxAttempts; attempt++ {
|
||||
html, err := client.GetContent(ctx, req)
|
||||
if err == nil {
|
||||
return html, nil
|
||||
}
|
||||
lastErr = err
|
||||
|
||||
// Stop immediately on context cancellation.
|
||||
if ctx.Err() != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Only retry on Browserless 5xx responses.
|
||||
if !strings.Contains(err.Error(), "unexpected status 5") {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if attempt < maxAttempts {
|
||||
log.Warn("chapter fetch failed, retrying",
|
||||
"url", req.URL,
|
||||
"attempt", attempt,
|
||||
"max_attempts", maxAttempts,
|
||||
"retry_in", delay,
|
||||
"err", err,
|
||||
)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return "", ctx.Err()
|
||||
case <-time.After(delay):
|
||||
}
|
||||
delay *= 2
|
||||
}
|
||||
}
|
||||
return "", lastErr
|
||||
}
|
||||
|
||||
func (s *Scraper) ScrapeChapterText(ctx context.Context, ref scraper.ChapterRef) (scraper.Chapter, error) {
|
||||
raw, err := s.client.GetContent(ctx, browser.ContentRequest{
|
||||
s.log.Debug("chapter text fetch starting",
|
||||
"chapter", ref.Number,
|
||||
"title", ref.Title,
|
||||
"payload_url", ref.URL,
|
||||
"payload_wait_selector", "#content",
|
||||
"payload_wait_selector_timeout_ms", 75000,
|
||||
"payload_wait_for_timeout_ms", 75000,
|
||||
)
|
||||
|
||||
raw, err := retryGetContent(ctx, s.log, s.client, browser.ContentRequest{
|
||||
URL: ref.URL,
|
||||
WaitFor: "#chapter-container",
|
||||
WaitForTimeout: 15000,
|
||||
RejectResources: true,
|
||||
})
|
||||
WaitFor: &browser.WaitForSelector{Selector: "#content", Timeout: 75000},
|
||||
WaitForTimeout: 75000,
|
||||
RejectResourceTypes: rejectResourceTypes,
|
||||
}, 9, 6*time.Second)
|
||||
if err != nil {
|
||||
s.log.Debug("chapter text fetch failed",
|
||||
"chapter", ref.Number,
|
||||
"url", ref.URL,
|
||||
"err", err,
|
||||
)
|
||||
return scraper.Chapter{}, fmt.Errorf("chapter %d fetch: %w", ref.Number, err)
|
||||
}
|
||||
s.log.Debug("chapter text fetch completed",
|
||||
"chapter", ref.Number,
|
||||
"url", ref.URL,
|
||||
"response_bytes", len(raw),
|
||||
)
|
||||
|
||||
root, err := htmlutil.ParseHTML(raw)
|
||||
if err != nil {
|
||||
return scraper.Chapter{}, fmt.Errorf("chapter %d parse: %w", ref.Number, err)
|
||||
}
|
||||
|
||||
container := htmlutil.FindFirst(root, s.ChapterTextSelector())
|
||||
// <div id="content">…</div>
|
||||
container := htmlutil.FindFirst(root, scraper.Selector{ID: "content"})
|
||||
if container == nil {
|
||||
// Fallback: try class-based selector.
|
||||
container = htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "chapter-content"})
|
||||
}
|
||||
if container == nil {
|
||||
return scraper.Chapter{}, fmt.Errorf("chapter %d: content container not found in %s", ref.Number, ref.URL)
|
||||
return scraper.Chapter{}, fmt.Errorf("chapter %d: #content container not found in %s", ref.Number, ref.URL)
|
||||
}
|
||||
|
||||
text := htmlutil.NodeToMarkdown(container)
|
||||
|
||||
s.log.Debug("chapter text parsed",
|
||||
"chapter", ref.Number,
|
||||
"url", ref.URL,
|
||||
"text_bytes", len(text),
|
||||
)
|
||||
|
||||
return scraper.Chapter{
|
||||
Ref: ref,
|
||||
Text: text,
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
package htmlutil
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/libnovel/scraper/internal/scraper"
|
||||
@@ -150,13 +151,20 @@ func InnerHTML(n *html.Node) string {
|
||||
|
||||
// NodeToMarkdown converts the children of an HTML node to a plain-text/Markdown
|
||||
// representation suitable for chapter storage. Block elements become newlines;
|
||||
// inline elements are inlined.
|
||||
// inline elements are inlined. Runs of more than one blank line are collapsed
|
||||
// to a single blank line.
|
||||
func NodeToMarkdown(n *html.Node) string {
|
||||
var sb strings.Builder
|
||||
nodeToMD(n, &sb)
|
||||
return strings.TrimSpace(sb.String())
|
||||
// Collapse 3+ consecutive newlines (i.e. more than one blank line) to 2.
|
||||
out := multiBlankLine.ReplaceAllString(sb.String(), "\n\n")
|
||||
return strings.TrimSpace(out)
|
||||
}
|
||||
|
||||
// multiBlankLine matches three or more consecutive newline characters
|
||||
// (any mix of \n and surrounding whitespace-only lines).
|
||||
var multiBlankLine = regexp.MustCompile(`\n(\s*\n){2,}`)
|
||||
|
||||
var blockElements = map[string]bool{
|
||||
"p": true, "div": true, "br": true, "h1": true, "h2": true,
|
||||
"h3": true, "h4": true, "h5": true, "h6": true, "li": true,
|
||||
|
||||
@@ -81,18 +81,6 @@ type Selector struct {
|
||||
// CatalogueProvider can enumerate every novel available on a source site.
|
||||
// It handles pagination transparently and streams CatalogueEntry values.
|
||||
type CatalogueProvider interface {
|
||||
// CatalogueURL returns the root URL of the catalogue listing.
|
||||
CatalogueURL() string
|
||||
|
||||
// EntriesSelector returns the selector that matches each novel card / row
|
||||
// in the catalogue listing page.
|
||||
EntriesSelector() Selector
|
||||
|
||||
// NextPageSelector returns the selector for the "next page" link.
|
||||
// If the current page has no next page the implementation must return
|
||||
// ("", nil) from ScrapeNextPage.
|
||||
NextPageSelector() Selector
|
||||
|
||||
// ScrapeCatalogue pages through the entire catalogue, sending
|
||||
// CatalogueEntry values to the returned channel. The channel is closed
|
||||
// when all pages have been scraped or ctx is cancelled.
|
||||
@@ -103,25 +91,12 @@ type CatalogueProvider interface {
|
||||
|
||||
// MetadataProvider can extract structured book metadata from a novel's landing page.
|
||||
type MetadataProvider interface {
|
||||
// MetadataSelectors returns a map of field name → Selector used to
|
||||
// locate each metadata element on the book page.
|
||||
// Required keys: "title", "author".
|
||||
// Optional keys: "cover", "status", "genres", "summary", "total_chapters".
|
||||
MetadataSelectors() map[string]Selector
|
||||
|
||||
// ScrapeMetadata fetches and parses the metadata for the book at bookURL.
|
||||
ScrapeMetadata(ctx context.Context, bookURL string) (BookMeta, error)
|
||||
}
|
||||
|
||||
// ChapterListProvider can enumerate all chapters of a book from the chapter-list page.
|
||||
type ChapterListProvider interface {
|
||||
// ChaptersURL derives the chapter-list URL from a book landing-page URL.
|
||||
ChaptersURL(bookURL string) string
|
||||
|
||||
// ChapterEntrySelector returns the selector that matches each chapter row
|
||||
// in the chapter list page.
|
||||
ChapterEntrySelector() Selector
|
||||
|
||||
// ScrapeChapterList returns all chapter references for a book, ordered
|
||||
// by chapter number ascending.
|
||||
ScrapeChapterList(ctx context.Context, bookURL string) ([]ChapterRef, error)
|
||||
@@ -129,9 +104,6 @@ type ChapterListProvider interface {
|
||||
|
||||
// ChapterTextProvider can extract the readable text from a single chapter
// page.
type ChapterTextProvider interface {
	// ChapterTextSelector returns the selector that wraps the chapter body.
	ChapterTextSelector() Selector

	// ScrapeChapterText fetches the page referenced by ref and returns the
	// chapter text as Markdown.
	ScrapeChapterText(ctx context.Context, ref ChapterRef) (Chapter, error)
}
|
||||
|
||||
@@ -18,6 +18,7 @@ import (
|
||||
|
||||
"github.com/libnovel/scraper/internal/orchestrator"
|
||||
"github.com/libnovel/scraper/internal/scraper"
|
||||
"github.com/libnovel/scraper/internal/writer"
|
||||
)
|
||||
|
||||
// Server wraps an HTTP mux with the scraping endpoints.
|
||||
@@ -26,6 +27,7 @@ type Server struct {
|
||||
oCfg orchestrator.Config
|
||||
novel scraper.NovelScraper
|
||||
log *slog.Logger
|
||||
writer *writer.Writer
|
||||
mu sync.Mutex
|
||||
running bool
|
||||
}
|
||||
@@ -37,6 +39,7 @@ func New(addr string, oCfg orchestrator.Config, novel scraper.NovelScraper, log
|
||||
oCfg: oCfg,
|
||||
novel: novel,
|
||||
log: log,
|
||||
writer: writer.New(oCfg.StaticRoot),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,6 +50,12 @@ func (s *Server) ListenAndServe(ctx context.Context) error {
|
||||
mux.HandleFunc("GET /health", s.handleHealth)
|
||||
mux.HandleFunc("POST /scrape", s.handleScrapeCatalogue)
|
||||
mux.HandleFunc("POST /scrape/book", s.handleScrapeBook)
|
||||
// UI routes
|
||||
mux.HandleFunc("GET /", s.handleHome)
|
||||
mux.HandleFunc("GET /books/{slug}", s.handleBook)
|
||||
mux.HandleFunc("GET /books/{slug}/chapters/{n}", s.handleChapter)
|
||||
mux.HandleFunc("POST /ui/scrape/book", s.handleUIScrapeBook)
|
||||
mux.HandleFunc("GET /ui/scrape/status", s.handleUIScrapeStatus)
|
||||
|
||||
srv := &http.Server{
|
||||
Addr: s.addr,
|
||||
|
||||
@@ -19,6 +19,8 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/libnovel/scraper/internal/scraper"
|
||||
@@ -113,7 +115,101 @@ func (w *Writer) WriteChapter(slug string, chapter scraper.Chapter) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ─── Path helpers ─────────────────────────────────────────────────────────────
|
||||
// ─── Catalogue helpers ────────────────────────────────────────────────────────
|
||||
|
||||
// ListBooks returns metadata for every book that has a metadata.yaml under root.
|
||||
// Books with unreadable metadata files are silently skipped.
|
||||
func (w *Writer) ListBooks() ([]scraper.BookMeta, error) {
|
||||
entries, err := os.ReadDir(w.root)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, fmt.Errorf("writer: list books: %w", err)
|
||||
}
|
||||
var books []scraper.BookMeta
|
||||
for _, e := range entries {
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
meta, ok, _ := w.ReadMetadata(e.Name())
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
books = append(books, meta)
|
||||
}
|
||||
sort.Slice(books, func(i, j int) bool {
|
||||
return books[i].Title < books[j].Title
|
||||
})
|
||||
return books, nil
|
||||
}
|
||||
|
||||
// ChapterInfo is a lightweight chapter descriptor derived from on-disk
// markdown files (see ListChapters); it carries no chapter body.
type ChapterInfo struct {
	// Number is the chapter number parsed from the "chapter-N.md" filename.
	Number int
	// Title is the first non-empty line of the markdown file (without the
	// leading "# " heading marker), falling back to "Chapter N".
	Title string
}
|
||||
|
||||
// ListChapters returns all chapters on disk for slug, sorted by number.
|
||||
func (w *Writer) ListChapters(slug string) ([]ChapterInfo, error) {
|
||||
bookDir := w.bookDir(slug)
|
||||
var chapters []ChapterInfo
|
||||
|
||||
// Walk vol-*/range-*/ directories.
|
||||
volDirs, err := filepath.Glob(filepath.Join(bookDir, "vol-*"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("writer: list chapters glob: %w", err)
|
||||
}
|
||||
for _, vd := range volDirs {
|
||||
rangeDirs, _ := filepath.Glob(filepath.Join(vd, "*-*"))
|
||||
for _, rd := range rangeDirs {
|
||||
files, _ := filepath.Glob(filepath.Join(rd, "chapter-*.md"))
|
||||
for _, f := range files {
|
||||
base := filepath.Base(f) // chapter-N.md
|
||||
numStr := strings.TrimSuffix(strings.TrimPrefix(base, "chapter-"), ".md")
|
||||
n, err := strconv.Atoi(numStr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
title := chapterTitle(f, n)
|
||||
chapters = append(chapters, ChapterInfo{Number: n, Title: title})
|
||||
}
|
||||
}
|
||||
}
|
||||
sort.Slice(chapters, func(i, j int) bool {
|
||||
return chapters[i].Number < chapters[j].Number
|
||||
})
|
||||
return chapters, nil
|
||||
}
|
||||
|
||||
// chapterTitle reads the first non-empty line of the markdown file at path
// and strips a leading "# " heading marker. It falls back to "Chapter N"
// when the file cannot be read or contains no non-empty line.
func chapterTitle(path string, n int) string {
	data, err := os.ReadFile(path)
	if err != nil {
		return fmt.Sprintf("Chapter %d", n)
	}
	// Scan every line instead of SplitN(..., 10): with a split limit the
	// final element holds the entire remainder of the file, so a file whose
	// first nine lines were blank would have returned a trimmed multi-line
	// blob as the "title".
	for _, line := range strings.Split(string(data), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		return strings.TrimPrefix(line, "# ")
	}
	return fmt.Sprintf("Chapter %d", n)
}
|
||||
|
||||
// ReadChapter returns the raw markdown content for chapter number n of slug.
|
||||
func (w *Writer) ReadChapter(slug string, n int) (string, error) {
|
||||
// Reconstruct path using the same bucketing formula as chapterPath.
|
||||
ref := scraper.ChapterRef{Number: n, Volume: 0}
|
||||
path := w.chapterPath(slug, ref)
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("writer: read chapter %d: %w", n, err)
|
||||
}
|
||||
return string(data), nil
|
||||
}
|
||||
|
||||
// bookDir returns the root directory for a book slug.
|
||||
func (w *Writer) bookDir(slug string) string {
|
||||
|
||||
Reference in New Issue
Block a user