- Add Kokoro-FastAPI TTS integration to the chapter reader UI: - Browser-side MSE streaming with paragraph-level click-to-start - Voice selector, speed slider, auto-next with prefetch of the next chapter - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text - Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes) with local-library annotation and one-click scrape buttons - Add StrategyDirect (plain HTTP client) as a new browser strategy; the default strategy is now 'direct' for chapter fetching and 'content' for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY) - Fix chapter numbering bug: numbers are now derived from the URL path (/chapter-N) rather than list position, correcting newest-first ordering - Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved source_url without knowing the original URL - Extend NovelScraper interface with RankingProvider (ScrapeRanking) - Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions timeout set to 60 s, content/scrape client defaults raised to 90 s - Add cover extraction fix (figure.cover > img rather than bare img.cover) - Add AGENTS.md and .aiignore for AI tooling context - Add integration tests for browser client and novelfire scraper (build tag: integration) and unit tests for chapterNumberFromURL and pagination
121 lines
4.6 KiB
Go
121 lines
4.6 KiB
Go
// Package browser defines the BrowserClient interface and helper types for
// communicating with a Browserless instance.
|
|
package browser
|
|
|
|
import "context"
|
|
|
|
// Strategy identifies the API endpoint or protocol a client uses to fetch a
// page. Its underlying string is the strategy's short name.
type Strategy string

// Available strategies.
const (
	// StrategyContent goes through POST /content and yields the final rendered
	// HTML of the page. It is the fastest option and fits most JS-rendered
	// sites.
	StrategyContent = Strategy("content")

	// StrategyScrape goes through POST /scrape: the caller supplies a list of
	// CSS selectors and gets structured JSON back. Use it when the exact
	// elements of interest are known up front.
	StrategyScrape = Strategy("scrape")

	// StrategyCDP speaks the Chrome DevTools Protocol over the WebSocket
	// /devtools/browser endpoint. It is the most powerful option and is needed
	// for complex interactions such as clicking, scrolling, or waiting for
	// network idle.
	StrategyCDP = Strategy("cdp")

	// StrategyDirect fetches HTML with a plain HTTP client. It suits sites
	// that do not need JavaScript rendering.
	StrategyDirect = Strategy("direct")
)
|
|
|
|
// WaitForSelector models the waitForSelector option included in requests to
// Browserless.
type WaitForSelector struct {
	// Selector is encoded under the "selector" key.
	Selector string `json:"selector"`
	// Timeout is expressed in milliseconds and left out of the JSON when zero.
	Timeout int `json:"timeout,omitempty"`
}
|
|
|
|
// GotoOptions tunes how the page navigation is performed.
type GotoOptions struct {
	// Timeout is expressed in milliseconds and left out of the JSON when zero.
	Timeout int `json:"timeout,omitempty"`
	// WaitUntil names the navigation event to wait for, e.g. "networkidle2"
	// or "load". It is left out of the JSON when empty.
	WaitUntil string `json:"waitUntil,omitempty"`
}
|
|
|
|
// ContentRequest is the body sent to POST /content.
|
|
type ContentRequest struct {
|
|
URL string `json:"url"`
|
|
WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`
|
|
WaitForTimeout int `json:"waitForTimeout,omitempty"` // ms
|
|
RejectResourceTypes []string `json:"rejectResourceTypes,omitempty"` // e.g. ["image","stylesheet"]
|
|
GotoOptions *GotoOptions `json:"gotoOptions,omitempty"`
|
|
BestAttempt bool `json:"bestAttempt,omitempty"` // return partial content on timeout/error
|
|
}
|
|
|
|
// ScrapeElement describes a single element to look up as part of a
// ScrapeRequest.
type ScrapeElement struct {
	// Selector is encoded under the "selector" key.
	Selector string `json:"selector"`
	// Timeout is expressed in milliseconds and left out of the JSON when zero.
	Timeout int `json:"timeout,omitempty"`
}
|
|
|
|
// ScrapeRequest is the body sent to POST /scrape.
|
|
type ScrapeRequest struct {
|
|
URL string `json:"url"`
|
|
Elements []ScrapeElement `json:"elements"`
|
|
WaitFor *WaitForSelector `json:"waitForSelector,omitempty"`
|
|
GotoOptions *GotoOptions `json:"gotoOptions,omitempty"`
|
|
}
|
|
|
|
// ScrapeResult is one entry in the response from POST /scrape: the selector
// that was queried together with every element it matched.
type ScrapeResult struct {
	// Selector is decoded from the "selector" key.
	Selector string `json:"selector"`
	// Results holds the matched elements. The element type is ScrapedElement
	// (response data: text and attributes), not ScrapeElement (the request
	// descriptor); using the request type here would silently discard the
	// "text" and "attributes" fields while decoding.
	Results []ScrapedElement `json:"results"`
}

// ScrapeAttribute holds a single attribute value from a scraped element.
type ScrapeAttribute struct {
	// Name is decoded from the "name" key.
	Name string `json:"name"`
	// Value is decoded from the "value" key.
	Value string `json:"value"`
}

// ScrapedElement is one item inside ScrapeResult.Results.
type ScrapedElement struct {
	// Text is decoded from the "text" key.
	Text string `json:"text"`
	// Attributes is decoded from the "attributes" key.
	Attributes []ScrapeAttribute `json:"attributes"`
}
|
|
|
|
// ScrapeResponse is the top-level response from POST /scrape.
|
|
type ScrapeResponse struct {
|
|
Data []ScrapeResult `json:"data"`
|
|
}
|
|
|
|
// BrowserClient is an abstraction over the three Browserless API strategies.
|
|
// Callers choose the strategy best suited to the target site; the interface
|
|
// signature is identical regardless of strategy.
|
|
type BrowserClient interface {
|
|
// Strategy returns the strategy this client uses.
|
|
Strategy() Strategy
|
|
|
|
// GetContent fetches the fully-rendered HTML of url using the /content
|
|
// endpoint. Only meaningful when Strategy() == StrategyContent.
|
|
GetContent(ctx context.Context, req ContentRequest) (string, error)
|
|
|
|
// ScrapePage calls the /scrape endpoint and returns structured data.
|
|
// Only meaningful when Strategy() == StrategyScrape.
|
|
ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error)
|
|
|
|
// CDPSession opens a CDP WebSocket session and calls fn with the raw
|
|
// WebSocket connection. Only meaningful when Strategy() == StrategyCDP.
|
|
// The session is closed when fn returns.
|
|
CDPSession(ctx context.Context, pageURL string, fn CDPSessionFunc) error
|
|
}
|
|
|
|
// CDPSessionFunc is the callback invoked inside a CDP session.
|
|
// conn is a live *websocket.Conn connected to a Browserless CDP endpoint.
|
|
type CDPSessionFunc func(ctx context.Context, conn CDPConn) error
|
|
|
|
// CDPConn is the smallest surface the orchestrator requires from a CDP
// WebSocket connection.
type CDPConn interface {
	// Send issues a raw CDP command (JSON-encoded) identified by method with
	// the given params, and returns the response.
	Send(ctx context.Context, method string, params map[string]any) (map[string]any, error)

	// Close shuts down the underlying connection.
	Close() error
}
|