Files
libnovel/scraper/internal/browser/content_scrape.go
Admin e7b915c6aa chore: update Browserless port references from 3000 to 3030
Update all default URLs, port mappings, healthcheck endpoints, Dockerfile
ENV defaults, and integration test run instructions to use port 3030.
2026-03-01 14:51:28 +05:00

196 lines
5.5 KiB
Go

package browser
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"time"
)
// Config holds the connection parameters for a Browserless instance.
type Config struct {
// BaseURL is the HTTP base URL, e.g. "http://localhost:3030". The clients in
// this file append the endpoint path ("/content" or "/scrape") directly to it.
BaseURL string
// Token is the optional API token (BROWSERLESS_TOKEN env var). When non-empty
// it is sent as the "token" query parameter of each request.
Token string
// Timeout is the per-request HTTP timeout. NewContentClient and
// NewScrapeClient substitute 90 s when it is zero.
Timeout time.Duration
// MaxConcurrent caps the number of simultaneous in-flight requests sent to
// Browserless. When all slots are occupied new calls block until one
// completes (or ctx is cancelled). Zero or a negative value means no limit.
MaxConcurrent int
}
// makeSem builds the counting semaphore that bounds concurrent requests.
// A positive n yields an empty channel with capacity n. Any other value yields
// a nil channel, which acquire and release both treat as "no limit".
func makeSem(n int) chan struct{} {
	var sem chan struct{}
	if n > 0 {
		sem = make(chan struct{}, n)
	}
	return sem
}
// acquire reserves one slot in sem, blocking until a slot is free or ctx is
// done.
//
// It returns nil once a slot has been taken; the caller must then call release
// exactly once. It returns ctx.Err() if ctx is cancelled or its deadline passes
// first, in which case no slot is held. If sem is nil (no concurrency limit) it
// returns nil immediately without consulting ctx.
func acquire(ctx context.Context, sem chan struct{}) error {
	if sem == nil {
		return nil
	}
	// select chooses at random among ready cases, so without this check an
	// already-cancelled ctx could still win a free slot and let a doomed
	// request proceed. Cancellation always takes priority.
	if err := ctx.Err(); err != nil {
		return err
	}
	select {
	case sem <- struct{}{}:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}
// release returns to sem the slot that a successful acquire took.
// A nil sem means no limit is in force, so there is nothing to give back.
func release(sem chan struct{}) {
	if sem == nil {
		return
	}
	<-sem
}
// contentClient implements BrowserClient using the /content endpoint.
type contentClient struct {
// cfg is the configuration passed to NewContentClient, with Timeout defaulted.
cfg Config
// http is the client used for every request; its Timeout is cfg.Timeout.
http *http.Client
// sem bounds concurrent requests to cfg.MaxConcurrent; nil means no limit.
sem chan struct{}
}
// NewContentClient builds a BrowserClient backed by POST /content.
// A zero cfg.Timeout is replaced with 90 seconds; cfg.MaxConcurrent sizes the
// semaphore that limits in-flight requests.
func NewContentClient(cfg Config) BrowserClient {
	if cfg.Timeout == 0 {
		cfg.Timeout = 90 * time.Second
	}

	client := new(contentClient)
	client.cfg = cfg
	client.http = &http.Client{Timeout: cfg.Timeout}
	client.sem = makeSem(cfg.MaxConcurrent)
	return client
}
// Strategy identifies this client by returning StrategyContent.
func (c *contentClient) Strategy() Strategy {
	return StrategyContent
}
// GetContent POSTs req as JSON to the Browserless /content endpoint and
// returns the response body as a string.
//
// The call waits for a concurrency slot first (see acquire) and holds it until
// it returns. When cfg.Token is set it is sent as the "token" query parameter.
//
// An error is returned if ctx ends while waiting for a slot, if req cannot be
// marshalled, if the request cannot be built or sent, if the status is not
// 200 OK (the error carries the status code and the start of the response
// body), or if the body cannot be read.
func (c *contentClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
	// Upper bound on how much of a non-200 response body is copied into the
	// returned error, so a large error page cannot bloat errors and logs.
	const maxErrBody = 4 << 10

	if err := acquire(ctx, c.sem); err != nil {
		return "", fmt.Errorf("content: semaphore: %w", err)
	}
	defer release(c.sem)

	body, err := json.Marshal(req)
	if err != nil {
		return "", fmt.Errorf("content: marshal request: %w", err)
	}

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.cfg.BaseURL+"/content", bytes.NewReader(body))
	if err != nil {
		return "", fmt.Errorf("content: build request: %w", err)
	}
	if c.cfg.Token != "" {
		// Go through url.Values so reserved characters in the token ('&', '#',
		// '+', spaces, ...) are percent-encoded instead of corrupting the query.
		// NOTE(review): the token travels in the URL, so transport errors from
		// the HTTP client may include it — confirm callers do not log those
		// verbatim.
		q := httpReq.URL.Query()
		q.Set("token", c.cfg.Token)
		httpReq.URL.RawQuery = q.Encode()
	}
	httpReq.Header.Set("Content-Type", "application/json")

	resp, err := c.http.Do(httpReq)
	if err != nil {
		return "", fmt.Errorf("content: do request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is diagnostic only, so a read error here is
		// deliberately ignored and whatever was read is reported.
		b, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrBody))
		return "", fmt.Errorf("content: unexpected status %d: %s", resp.StatusCode, b)
	}

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("content: read body: %w", err)
	}
	return string(raw), nil
}
// ScrapePage is not available on the /content client. It always returns a
// zero ScrapeResponse and an error pointing the caller at NewScrapeClient.
func (c *contentClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeResponse, error) {
	var none ScrapeResponse
	unsupported := fmt.Errorf("content client does not support /scrape; use NewScrapeClient")
	return none, unsupported
}
// CDPSession is not available on the /content client. It always returns an
// error pointing the caller at NewCDPClient.
func (c *contentClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
	unsupported := fmt.Errorf("content client does not support CDP; use NewCDPClient")
	return unsupported
}
// ─── /scrape client ───────────────────────────────────────────────────────────
// scrapeClient implements BrowserClient using the /scrape endpoint.
type scrapeClient struct {
// cfg is the configuration passed to NewScrapeClient, with Timeout defaulted.
cfg Config
// http is the client used for every request; its Timeout is cfg.Timeout.
http *http.Client
// sem bounds concurrent requests to cfg.MaxConcurrent; nil means no limit.
sem chan struct{}
}
// NewScrapeClient builds a BrowserClient backed by POST /scrape.
// A zero cfg.Timeout is replaced with 90 seconds; cfg.MaxConcurrent sizes the
// semaphore that limits in-flight requests.
func NewScrapeClient(cfg Config) BrowserClient {
	timeout := cfg.Timeout
	if timeout == 0 {
		timeout = 90 * time.Second
	}
	cfg.Timeout = timeout

	httpClient := &http.Client{Timeout: timeout}
	return &scrapeClient{cfg: cfg, http: httpClient, sem: makeSem(cfg.MaxConcurrent)}
}
// Strategy identifies this client by returning StrategyScrape.
func (c *scrapeClient) Strategy() Strategy {
	return StrategyScrape
}
// GetContent is not available on the /scrape client. It always returns an
// empty string and an error pointing the caller at NewContentClient.
func (c *scrapeClient) GetContent(_ context.Context, _ ContentRequest) (string, error) {
	unsupported := fmt.Errorf("scrape client does not support /content; use NewContentClient")
	return "", unsupported
}
// ScrapePage POSTs req as JSON to the Browserless /scrape endpoint and decodes
// the JSON response into a ScrapeResponse.
//
// The call waits for a concurrency slot first (see acquire) and holds it until
// it returns. When cfg.Token is set it is sent as the "token" query parameter.
//
// A zero ScrapeResponse and an error are returned if ctx ends while waiting
// for a slot, if req cannot be marshalled, if the request cannot be built or
// sent, if the status is not 200 OK (the error carries the status code and the
// start of the response body), or if the body is not valid JSON for
// ScrapeResponse.
func (c *scrapeClient) ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error) {
	// Upper bound on how much of a non-200 response body is copied into the
	// returned error, so a large error page cannot bloat errors and logs.
	const maxErrBody = 4 << 10

	if err := acquire(ctx, c.sem); err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: semaphore: %w", err)
	}
	defer release(c.sem)

	body, err := json.Marshal(req)
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: marshal request: %w", err)
	}

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.cfg.BaseURL+"/scrape", bytes.NewReader(body))
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: build request: %w", err)
	}
	if c.cfg.Token != "" {
		// Go through url.Values so reserved characters in the token ('&', '#',
		// '+', spaces, ...) are percent-encoded instead of corrupting the query.
		// NOTE(review): the token travels in the URL, so transport errors from
		// the HTTP client may include it — confirm callers do not log those
		// verbatim.
		q := httpReq.URL.Query()
		q.Set("token", c.cfg.Token)
		httpReq.URL.RawQuery = q.Encode()
	}
	httpReq.Header.Set("Content-Type", "application/json")

	resp, err := c.http.Do(httpReq)
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: do request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is diagnostic only, so a read error here is
		// deliberately ignored and whatever was read is reported.
		b, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrBody))
		return ScrapeResponse{}, fmt.Errorf("scrape: unexpected status %d: %s", resp.StatusCode, b)
	}

	var result ScrapeResponse
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: decode response: %w", err)
	}
	return result, nil
}
// CDPSession is not available on the /scrape client. It always returns an
// error pointing the caller at NewCDPClient.
func (c *scrapeClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
	unsupported := fmt.Errorf("scrape client does not support CDP; use NewCDPClient")
	return unsupported
}