Files
libnovel/scraper/internal/browser/content_scrape.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Fix cover-image extraction (select figure.cover > img rather than bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

196 lines
5.5 KiB
Go

package browser
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"time"
)
// Config holds the connection parameters for a Browserless instance.
// The zero value is not directly usable: BaseURL must be set, while the
// remaining fields receive defaults from the New*Client constructors.
type Config struct {
	// BaseURL is the HTTP base URL, e.g. "http://localhost:3000".
	BaseURL string
	// Token is the optional API token (BROWSERLESS_TOKEN env var).
	// When non-empty it is appended to every request as a ?token= query
	// parameter.
	Token string
	// Timeout is the per-request HTTP timeout; when left zero the
	// constructors default it to 90 s (see NewContentClient and
	// NewScrapeClient).
	Timeout time.Duration
	// MaxConcurrent caps the number of simultaneous in-flight requests sent to
	// Browserless. When all slots are occupied new calls block until one
	// completes (or ctx is cancelled). 0 means no limit.
	MaxConcurrent int
}
// makeSem builds a counting semaphore with n slots, implemented as a
// buffered channel. For n <= 0 it returns a nil channel, which
// acquire/release interpret as "no limit" and treat as a no-op.
func makeSem(n int) chan struct{} {
	if n > 0 {
		return make(chan struct{}, n)
	}
	return nil
}
// acquire takes one slot from sem. It returns an error if ctx is cancelled
// before a slot becomes available. If sem is nil it returns immediately.
func acquire(ctx context.Context, sem chan struct{}) error {
if sem == nil {
return nil
}
select {
case sem <- struct{}{}:
return nil
case <-ctx.Done():
return ctx.Err()
}
}
// release gives back the slot taken by a matching acquire call.
// A nil sem (the no-limit case) is ignored.
func release(sem chan struct{}) {
	if sem == nil {
		return
	}
	<-sem
}
// contentClient implements BrowserClient using the /content endpoint.
// It carries the immutable connection config, one shared http.Client
// (safe for concurrent use across goroutines), and an optional counting
// semaphore bounding in-flight requests (nil when Config.MaxConcurrent
// is zero or negative).
type contentClient struct {
	cfg  Config
	http *http.Client
	sem  chan struct{}
}
// NewContentClient returns a BrowserClient backed by Browserless's
// POST /content endpoint. A zero cfg.Timeout is defaulted to 90 seconds.
func NewContentClient(cfg Config) BrowserClient {
	timeout := cfg.Timeout
	if timeout == 0 {
		timeout = 90 * time.Second
	}
	cfg.Timeout = timeout
	client := &contentClient{
		cfg:  cfg,
		http: &http.Client{Timeout: timeout},
		sem:  makeSem(cfg.MaxConcurrent),
	}
	return client
}
// Strategy reports the fixed strategy implemented by this client.
func (c *contentClient) Strategy() Strategy { return StrategyContent }
// GetContent renders req via Browserless's POST /content endpoint and
// returns the resulting HTML as a string.
//
// A semaphore slot is taken first (bounding concurrency per
// Config.MaxConcurrent); the call blocks until a slot frees up or ctx is
// cancelled. Any non-200 response becomes an error carrying the status
// code and a bounded prefix of the response body.
func (c *contentClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
	if err := acquire(ctx, c.sem); err != nil {
		return "", fmt.Errorf("content: semaphore: %w", err)
	}
	defer release(c.sem)

	body, err := json.Marshal(req)
	if err != nil {
		return "", fmt.Errorf("content: marshal request: %w", err)
	}

	// Query-escape the token so tokens containing reserved characters
	// (&, =, #, ...) cannot corrupt the request URL.
	endpoint := c.cfg.BaseURL + "/content"
	if c.cfg.Token != "" {
		endpoint += "?token=" + url.QueryEscape(c.cfg.Token)
	}

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body))
	if err != nil {
		return "", fmt.Errorf("content: build request: %w", err)
	}
	httpReq.Header.Set("Content-Type", "application/json")

	resp, err := c.http.Do(httpReq)
	if err != nil {
		return "", fmt.Errorf("content: do request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Bound the snippet so a huge error page cannot balloon memory
		// or log output.
		b, _ := io.ReadAll(io.LimitReader(resp.Body, 8<<10))
		return "", fmt.Errorf("content: unexpected status %d: %s", resp.StatusCode, b)
	}

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("content: read body: %w", err)
	}
	return string(raw), nil
}
// ScrapePage is not supported by the /content strategy; callers needing
// structured element extraction must construct a client via
// NewScrapeClient instead.
func (c *contentClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeResponse, error) {
	return ScrapeResponse{}, fmt.Errorf("content client does not support /scrape; use NewScrapeClient")
}
// CDPSession is not supported by the /content strategy; use NewCDPClient
// for Chrome DevTools Protocol sessions.
func (c *contentClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
	return fmt.Errorf("content client does not support CDP; use NewCDPClient")
}
// ─── /scrape client ───────────────────────────────────────────────────────────

// scrapeClient implements BrowserClient using the /scrape endpoint. It
// mirrors contentClient: immutable config, one shared http.Client, and
// an optional concurrency-bounding semaphore (nil when
// Config.MaxConcurrent is zero or negative).
type scrapeClient struct {
	cfg  Config
	http *http.Client
	sem  chan struct{}
}
// NewScrapeClient returns a BrowserClient backed by Browserless's
// POST /scrape endpoint. A zero cfg.Timeout is defaulted to 90 seconds.
func NewScrapeClient(cfg Config) BrowserClient {
	timeout := cfg.Timeout
	if timeout == 0 {
		timeout = 90 * time.Second
	}
	cfg.Timeout = timeout
	client := &scrapeClient{
		cfg:  cfg,
		http: &http.Client{Timeout: timeout},
		sem:  makeSem(cfg.MaxConcurrent),
	}
	return client
}
// Strategy reports the fixed strategy implemented by this client.
func (c *scrapeClient) Strategy() Strategy { return StrategyScrape }
// GetContent is not supported by the /scrape strategy; use
// NewContentClient for raw rendered-HTML retrieval.
func (c *scrapeClient) GetContent(_ context.Context, _ ContentRequest) (string, error) {
	return "", fmt.Errorf("scrape client does not support /content; use NewContentClient")
}
// ScrapePage runs a structured extraction via Browserless's POST /scrape
// endpoint and decodes the JSON response into a ScrapeResponse.
//
// A semaphore slot is taken first (bounding concurrency per
// Config.MaxConcurrent); the call blocks until a slot frees up or ctx is
// cancelled. Any non-200 response becomes an error carrying the status
// code and a bounded prefix of the response body.
func (c *scrapeClient) ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error) {
	if err := acquire(ctx, c.sem); err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: semaphore: %w", err)
	}
	defer release(c.sem)

	body, err := json.Marshal(req)
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: marshal request: %w", err)
	}

	// Query-escape the token so tokens containing reserved characters
	// (&, =, #, ...) cannot corrupt the request URL.
	endpoint := c.cfg.BaseURL + "/scrape"
	if c.cfg.Token != "" {
		endpoint += "?token=" + url.QueryEscape(c.cfg.Token)
	}

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body))
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: build request: %w", err)
	}
	httpReq.Header.Set("Content-Type", "application/json")

	resp, err := c.http.Do(httpReq)
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: do request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Bound the snippet so a huge error page cannot balloon memory
		// or log output.
		b, _ := io.ReadAll(io.LimitReader(resp.Body, 8<<10))
		return ScrapeResponse{}, fmt.Errorf("scrape: unexpected status %d: %s", resp.StatusCode, b)
	}

	var result ScrapeResponse
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: decode response: %w", err)
	}
	return result, nil
}
// CDPSession is not supported by the /scrape strategy; use NewCDPClient
// for Chrome DevTools Protocol sessions.
func (c *scrapeClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
	return fmt.Errorf("scrape client does not support CDP; use NewCDPClient")
}