Files
libnovel/scraper/internal/browser/interface.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Add cover extraction fix (figure.cover > img rather than bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

121 lines
4.6 KiB
Go

// Package browser defines the BrowserClient interface and helper types for
// communicating with a Browserless instance.
package browser
import "context"
// Strategy names the mechanism a client uses to fetch a page: one of the
// Browserless API endpoints/protocols, or a direct HTTP request. The
// recognised values are the Strategy* constants in this package.
type Strategy string
// Recognised Strategy values.
const (
	// StrategyContent goes through the POST /content endpoint, which
	// responds with the final rendered HTML of the page. Fastest; suitable
	// for most JS-rendered sites.
	StrategyContent = Strategy("content")

	// StrategyScrape goes through the POST /scrape endpoint, which takes a
	// list of CSS selectors and responds with structured JSON. Good when the
	// exact elements needed are known up front.
	StrategyScrape = Strategy("scrape")

	// StrategyCDP goes through the WebSocket /devtools/browser endpoint
	// (Chrome DevTools Protocol). Most powerful; required for complex
	// interactions (clicking, scrolling, waiting for network idle, etc.).
	StrategyCDP = Strategy("cdp")

	// StrategyDirect fetches HTML directly with a plain HTTP client.
	// Suitable for sites that don't require JavaScript rendering.
	StrategyDirect = Strategy("direct")
)
// WaitForSelector is the waitForSelector option sent to Browserless.
type WaitForSelector struct {
	// Selector is serialized under the "selector" key.
	Selector string `json:"selector"`

	// Timeout is expressed in milliseconds; a zero value is omitted from
	// the encoded JSON.
	Timeout int `json:"timeout,omitempty"`
}
// GotoOptions tunes how the page navigation is performed.
type GotoOptions struct {
	// Timeout is expressed in milliseconds; a zero value is omitted from
	// the encoded JSON.
	Timeout int `json:"timeout,omitempty"`

	// WaitUntil names the navigation event to wait for, e.g.
	// "networkidle2" or "load"; an empty value is omitted from the JSON.
	WaitUntil string `json:"waitUntil,omitempty"`
}
// ContentRequest is the JSON body sent to POST /content. Every field except
// URL is optional and left out of the payload when it holds its zero value.
type ContentRequest struct {
	// URL is the page to fetch.
	URL string `json:"url"`

	// WaitFor is sent as the "waitForSelector" option when non-nil.
	WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`

	// WaitForTimeout is expressed in milliseconds.
	WaitForTimeout int `json:"waitForTimeout,omitempty"`

	// RejectResourceTypes lists resource types to reject,
	// e.g. ["image","stylesheet"].
	RejectResourceTypes []string `json:"rejectResourceTypes,omitempty"`

	// GotoOptions carries the navigation options when non-nil.
	GotoOptions *GotoOptions `json:"gotoOptions,omitempty"`

	// BestAttempt asks for partial content to be returned on timeout/error.
	BestAttempt bool `json:"bestAttempt,omitempty"`
}
// ScrapeElement describes one element to look up; it is an entry of
// ScrapeRequest.Elements.
type ScrapeElement struct {
	// Selector is serialized under the "selector" key.
	Selector string `json:"selector"`

	// Timeout is expressed in milliseconds; a zero value is omitted from
	// the encoded JSON.
	Timeout int `json:"timeout,omitempty"`
}
// ScrapeRequest is the JSON body sent to POST /scrape.
type ScrapeRequest struct {
	// URL is the page to scrape.
	URL string `json:"url"`

	// Elements lists the element descriptors to look up; always present in
	// the encoded JSON (no omitempty).
	Elements []ScrapeElement `json:"elements"`

	// WaitFor is sent as the "waitForSelector" option when non-nil.
	WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`

	// GotoOptions carries the navigation options when non-nil.
	GotoOptions *GotoOptions `json:"gotoOptions,omitempty"`
}
// ScrapeResult is one entry in the response from POST /scrape.
type ScrapeResult struct {
	// Selector is decoded from the "selector" key of the entry.
	Selector string `json:"selector"`

	// Results holds the scraped items decoded from the "results" key. The
	// element type must be ScrapedElement (text + attributes); using the
	// request descriptor type here would silently drop the scraped text
	// and attributes when the response is decoded.
	Results []ScrapedElement `json:"results"`
}

// ScrapeAttribute holds a single attribute value from a scraped element.
type ScrapeAttribute struct {
	Name  string `json:"name"`
	Value string `json:"value"`
}

// ScrapedElement is one item inside ScrapeResult.Results.
type ScrapedElement struct {
	Text       string            `json:"text"`
	Attributes []ScrapeAttribute `json:"attributes"`
}
// ScrapeResponse is the outermost object returned by POST /scrape.
type ScrapeResponse struct {
	// Data is decoded from the "data" key and holds one ScrapeResult per
	// entry.
	Data []ScrapeResult `json:"data"`
}
// BrowserClient abstracts over the fetch strategies declared in this package
// (see Strategy). Callers pick the strategy best suited to the target site;
// the interface signature is the same whichever strategy is in use.
type BrowserClient interface {
	// Strategy reports the strategy this client uses.
	Strategy() Strategy

	// GetContent fetches the fully-rendered HTML of req.URL through the
	// /content endpoint. Only meaningful when Strategy() == StrategyContent.
	// NOTE(review): StrategyDirect also produces HTML and is presumably
	// served through this method as well — confirm against implementations.
	GetContent(ctx context.Context, req ContentRequest) (string, error)

	// ScrapePage calls the /scrape endpoint and returns its structured
	// data. Only meaningful when Strategy() == StrategyScrape.
	ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error)

	// CDPSession opens a CDP WebSocket session and invokes fn with the
	// connection. Only meaningful when Strategy() == StrategyCDP.
	// The session is closed when fn returns.
	CDPSession(ctx context.Context, pageURL string, fn CDPSessionFunc) error
}
// CDPSessionFunc is the callback run inside a CDP session.
// conn is a live CDPConn connected to a Browserless CDP endpoint.
type CDPSessionFunc func(ctx context.Context, conn CDPConn) error
// CDPConn is the smallest surface the orchestrator needs over a CDP
// WebSocket connection.
type CDPConn interface {
	// Send issues a raw CDP command (JSON-encoded) identified by method,
	// with the given params, and returns the response.
	Send(ctx context.Context, method string, params map[string]any) (map[string]any, error)

	// Close closes the underlying connection.
	Close() error
}