Update all default URLs, port mappings, healthcheck endpoints, Dockerfile ENV defaults, and integration test run instructions to use port 3030.
196 lines
5.5 KiB
Go
196 lines
5.5 KiB
Go
package browser
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"time"
|
|
)
|
|
|
|
// Config holds the connection parameters for a Browserless instance.
type Config struct {
	// BaseURL is the HTTP base URL, e.g. "http://localhost:3030". Endpoint
	// paths such as "/content" are appended to it verbatim, so it should not
	// end with a slash.
	BaseURL string

	// Token is the optional API token (BROWSERLESS_TOKEN env var). When
	// non-empty it is sent as the "token" query parameter on every request.
	Token string

	// Timeout is the per-request HTTP timeout. When left at zero,
	// NewContentClient and NewScrapeClient substitute 90 s.
	Timeout time.Duration

	// MaxConcurrent caps the number of simultaneous in-flight requests sent to
	// Browserless. When all slots are occupied new calls block until one
	// completes (or ctx is cancelled). Zero or a negative value means no limit.
	MaxConcurrent int
}
|
|
|
|
// makeSem builds the counting semaphore used to bound concurrent requests.
// A positive n yields a channel with capacity n; for n <= 0 the result is a
// nil channel, which acquire and release treat as "no limit".
func makeSem(n int) chan struct{} {
	var sem chan struct{} // stays nil unless a positive limit was requested
	if n > 0 {
		sem = make(chan struct{}, n)
	}
	return sem
}
|
|
|
|
// acquire takes one slot from sem, blocking until a slot is free or ctx is
// done. It returns nil once the slot is held and ctx.Err() if the context
// ends first. If sem is nil (no concurrency limit) it returns nil immediately
// without looking at ctx.
func acquire(ctx context.Context, sem chan struct{}) error {
	if sem == nil {
		return nil
	}
	// select chooses pseudo-randomly among ready cases, so without this check
	// an already-cancelled context could still grab a free slot.
	if err := ctx.Err(); err != nil {
		return err
	}
	select {
	case sem <- struct{}{}:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}
|
|
|
|
// release hands back one slot that was taken with acquire by receiving a
// value from sem. A nil sem (no concurrency limit) makes it a no-op.
func release(sem chan struct{}) {
	if sem == nil {
		return
	}
	<-sem
}
|
|
|
|
// contentClient implements BrowserClient using the /content endpoint.
|
|
type contentClient struct {
|
|
cfg Config
|
|
http *http.Client
|
|
sem chan struct{}
|
|
}
|
|
|
|
// NewContentClient returns a BrowserClient that uses POST /content.
|
|
func NewContentClient(cfg Config) BrowserClient {
|
|
if cfg.Timeout == 0 {
|
|
cfg.Timeout = 90 * time.Second
|
|
}
|
|
return &contentClient{
|
|
cfg: cfg,
|
|
http: &http.Client{Timeout: cfg.Timeout},
|
|
sem: makeSem(cfg.MaxConcurrent),
|
|
}
|
|
}
|
|
|
|
// Strategy identifies this client as the StrategyContent implementation.
func (c *contentClient) Strategy() Strategy {
	return StrategyContent
}
|
|
|
|
func (c *contentClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
|
|
if err := acquire(ctx, c.sem); err != nil {
|
|
return "", fmt.Errorf("content: semaphore: %w", err)
|
|
}
|
|
defer release(c.sem)
|
|
|
|
body, err := json.Marshal(req)
|
|
if err != nil {
|
|
return "", fmt.Errorf("content: marshal request: %w", err)
|
|
}
|
|
|
|
url := c.cfg.BaseURL + "/content"
|
|
if c.cfg.Token != "" {
|
|
url += "?token=" + c.cfg.Token
|
|
}
|
|
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
|
if err != nil {
|
|
return "", fmt.Errorf("content: build request: %w", err)
|
|
}
|
|
httpReq.Header.Set("Content-Type", "application/json")
|
|
|
|
resp, err := c.http.Do(httpReq)
|
|
if err != nil {
|
|
return "", fmt.Errorf("content: do request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
b, _ := io.ReadAll(resp.Body)
|
|
return "", fmt.Errorf("content: unexpected status %d: %s", resp.StatusCode, b)
|
|
}
|
|
|
|
raw, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", fmt.Errorf("content: read body: %w", err)
|
|
}
|
|
return string(raw), nil
|
|
}
|
|
|
|
func (c *contentClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeResponse, error) {
|
|
return ScrapeResponse{}, fmt.Errorf("content client does not support /scrape; use NewScrapeClient")
|
|
}
|
|
|
|
func (c *contentClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
|
|
return fmt.Errorf("content client does not support CDP; use NewCDPClient")
|
|
}
|
|
|
|
// ─── /scrape client ───────────────────────────────────────────────────────────
|
|
|
|
type scrapeClient struct {
|
|
cfg Config
|
|
http *http.Client
|
|
sem chan struct{}
|
|
}
|
|
|
|
// NewScrapeClient returns a BrowserClient that uses POST /scrape.
|
|
func NewScrapeClient(cfg Config) BrowserClient {
|
|
if cfg.Timeout == 0 {
|
|
cfg.Timeout = 90 * time.Second
|
|
}
|
|
return &scrapeClient{
|
|
cfg: cfg,
|
|
http: &http.Client{Timeout: cfg.Timeout},
|
|
sem: makeSem(cfg.MaxConcurrent),
|
|
}
|
|
}
|
|
|
|
// Strategy identifies this client as the StrategyScrape implementation.
func (c *scrapeClient) Strategy() Strategy {
	return StrategyScrape
}
|
|
|
|
func (c *scrapeClient) GetContent(_ context.Context, _ ContentRequest) (string, error) {
|
|
return "", fmt.Errorf("scrape client does not support /content; use NewContentClient")
|
|
}
|
|
|
|
func (c *scrapeClient) ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error) {
|
|
if err := acquire(ctx, c.sem); err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: semaphore: %w", err)
|
|
}
|
|
defer release(c.sem)
|
|
|
|
body, err := json.Marshal(req)
|
|
if err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: marshal request: %w", err)
|
|
}
|
|
|
|
url := c.cfg.BaseURL + "/scrape"
|
|
if c.cfg.Token != "" {
|
|
url += "?token=" + c.cfg.Token
|
|
}
|
|
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
|
if err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: build request: %w", err)
|
|
}
|
|
httpReq.Header.Set("Content-Type", "application/json")
|
|
|
|
resp, err := c.http.Do(httpReq)
|
|
if err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: do request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
b, _ := io.ReadAll(resp.Body)
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: unexpected status %d: %s", resp.StatusCode, b)
|
|
}
|
|
|
|
var result ScrapeResponse
|
|
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: decode response: %w", err)
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func (c *scrapeClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
|
|
return fmt.Errorf("scrape client does not support CDP; use NewCDPClient")
|
|
}
|