// libnovel/scraper/internal/browser/interface.go

// Package browser defines the BrowserClient interface and helper types for
// communicating with a Browserless instance.
package browser

import "context"

// Strategy names the mechanism a BrowserClient uses to fetch a page: one of
// the Browserless endpoints/protocols, or a direct HTTP fetch. The accepted
// values are the Strategy* constants declared in this package.
type Strategy string

// Known Strategy values.
const (
	// StrategyContent uses the POST /content endpoint, which returns the final
	// rendered HTML of the page. Fastest; suitable for most JS-rendered sites.
	StrategyContent Strategy = "content"

	// StrategyScrape uses the POST /scrape endpoint, which accepts a list of
	// CSS selectors and returns structured JSON. Good when you know exactly
	// which elements you need.
	StrategyScrape Strategy = "scrape"

	// StrategyCDP uses the WebSocket /devtools/browser endpoint (Chrome
	// DevTools Protocol). Most powerful; required for complex interactions
	// (clicking, scrolling, waiting for network idle, etc.).
	StrategyCDP Strategy = "cdp"

	// StrategyDirect uses a plain HTTP client to fetch HTML directly.
	// Suitable for sites that don't require JavaScript rendering.
	//
	// NOTE(review): BrowserClient documents no method as specific to this
	// strategy — confirm which method a direct client is expected to serve.
	StrategyDirect Strategy = "direct"
)

// WaitForSelector describes the waitForSelector option sent to Browserless.
//
// Selector is always encoded; Timeout is dropped from the JSON body when it
// is zero (omitempty).
type WaitForSelector struct {
	Selector string `json:"selector"`
	Timeout  int    `json:"timeout,omitempty"` // ms
}

// GotoOptions controls page navigation behavior.
//
// Both fields are optional: a zero Timeout or an empty WaitUntil is left out
// of the encoded JSON (omitempty).
type GotoOptions struct {
	Timeout   int    `json:"timeout,omitempty"`   // ms
	WaitUntil string `json:"waitUntil,omitempty"` // e.g., "networkidle2", "load"
}

// ContentRequest is the JSON body sent to POST /content.
//
// URL is always encoded. Every other field is optional and is omitted from
// the body when it holds its zero value (nil pointer, 0, empty slice, false).
// Note that WaitFor is encoded under the "waitForSelector" key, not "waitFor".
type ContentRequest struct {
	URL                 string           `json:"url"`
	WaitFor             *WaitForSelector `json:"waitForSelector,omitempty"`
	WaitForTimeout      int              `json:"waitForTimeout,omitempty"`      // ms
	RejectResourceTypes []string         `json:"rejectResourceTypes,omitempty"` // e.g. ["image","stylesheet"]
	GotoOptions         *GotoOptions     `json:"gotoOptions,omitempty"`
	BestAttempt         bool             `json:"bestAttempt,omitempty"` // return partial content on timeout/error
}

// ScrapeElement is one element descriptor inside a ScrapeRequest: the
// selector to look up, plus an optional per-element timeout. It is the
// request-side type; scraped data is modeled by ScrapedElement.
//
// Timeout is omitted from the encoded JSON when zero (omitempty).
type ScrapeElement struct {
	Selector string `json:"selector"`
	Timeout  int    `json:"timeout,omitempty"` // ms
}

// ScrapeRequest is the JSON body sent to POST /scrape.
//
// URL and Elements are always encoded; Elements has no omitempty, so a nil
// slice is sent as JSON null rather than being dropped. WaitFor (encoded as
// "waitForSelector") and GotoOptions are omitted when nil.
type ScrapeRequest struct {
	URL         string           `json:"url"`
	Elements    []ScrapeElement  `json:"elements"`
	WaitFor     *WaitForSelector `json:"waitForSelector,omitempty"`
	GotoOptions *GotoOptions     `json:"gotoOptions,omitempty"`
}

// ScrapeResult is one entry in the response from POST /scrape: the selector
// that was queried and the elements that matched it.
//
// Results holds ScrapedElement values (response data: text and attributes),
// not ScrapeElement values (the request-side selector/timeout descriptor).
// Decoding into the request type would silently discard the scraped text and
// attributes, because none of its JSON keys appear in a response item.
type ScrapeResult struct {
	Selector string           `json:"selector"`
	Results  []ScrapedElement `json:"results"`
}

// ScrapeAttribute holds a single attribute value from a scraped element,
// as a name/value pair.
type ScrapeAttribute struct {
	Name  string `json:"name"`
	Value string `json:"value"`
}

// ScrapedElement is one item inside ScrapeResult.Results: the text of a
// matched element together with its attributes.
type ScrapedElement struct {
	Text       string            `json:"text"`
	Attributes []ScrapeAttribute `json:"attributes"`
}

// ScrapeResponse is the top-level response from POST /scrape.
// Data is decoded from the "data" key and holds the ScrapeResult entries.
type ScrapeResponse struct {
	Data []ScrapeResult `json:"data"`
}

// BrowserClient is an abstraction over the fetch strategies declared as
// Strategy constants. Callers choose the strategy best suited to the target
// site; the interface signature is identical regardless of strategy, and each
// method documents the strategy for which it is meaningful. What a method
// does when called on a client of a different strategy is not specified here.
//
// NOTE(review): no method is documented as specific to StrategyDirect —
// confirm which method a direct-HTTP client is expected to serve.
type BrowserClient interface {
	// Strategy returns the strategy this client uses.
	Strategy() Strategy

	// GetContent fetches the fully-rendered HTML of req.URL using the /content
	// endpoint and returns it as a string.
	// Only meaningful when Strategy() == StrategyContent.
	GetContent(ctx context.Context, req ContentRequest) (string, error)

	// ScrapePage calls the /scrape endpoint with req and returns the
	// structured response. Only meaningful when Strategy() == StrategyScrape.
	ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error)

	// CDPSession opens a CDP WebSocket session and calls fn with a CDPConn
	// for that session. Only meaningful when Strategy() == StrategyCDP.
	// The session is closed when fn returns.
	// NOTE(review): pageURL presumably identifies the page the session should
	// drive — confirm against the implementation.
	CDPSession(ctx context.Context, pageURL string, fn CDPSessionFunc) error
}

// CDPSessionFunc is the callback invoked inside a CDP session.
// conn is the CDPConn for that session (an interface value, not a concrete
// WebSocket connection type); ctx is the context for the callback. The
// returned error reports the callback's failure, if any.
type CDPSessionFunc func(ctx context.Context, conn CDPConn) error

// CDPConn is the minimal interface the orchestrator needs over a CDP WebSocket.
type CDPConn interface {
	// Send sends the CDP command named method with the given params and
	// returns the response as a generic map, or an error.
	Send(ctx context.Context, method string, params map[string]any) (map[string]any, error)
	// Close closes the underlying connection.
	Close() error
}