libnovel/scraper/internal/browser/content_scrape.go

package browser

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"time"
)

// Config holds the connection parameters for a Browserless instance.
type Config struct {
	// BaseURL is the HTTP base URL, e.g. "http://localhost:3030". Endpoint
	// paths such as "/content" are appended to it verbatim, so it should not
	// end with a trailing slash.
	BaseURL string
	// Token is the optional API token (BROWSERLESS_TOKEN env var). When
	// non-empty it is sent as the "token" query parameter.
	Token string
	// Timeout is the per-request HTTP timeout. The constructors in this file
	// substitute 90 s when it is left at zero.
	Timeout time.Duration
	// MaxConcurrent caps the number of simultaneous in-flight requests sent to
	// Browserless. When all slots are occupied new calls block until one
	// completes (or ctx is cancelled). 0 (or any negative value) means no
	// limit.
	MaxConcurrent int
}

// makeSem builds the counting semaphore used to cap in-flight requests: a
// buffered channel whose capacity is n. For a non-positive n the result is a
// nil channel, which acquire and release interpret as "no limit".
func makeSem(n int) chan struct{} {
	var sem chan struct{}
	if n > 0 {
		sem = make(chan struct{}, n)
	}
	return sem
}

// acquire claims one slot in sem, blocking while the semaphore is full.
// It returns ctx.Err() if ctx is done before a slot frees up. A nil sem
// means no limit is configured, so the call succeeds immediately.
func acquire(ctx context.Context, sem chan struct{}) error {
	if sem != nil {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case sem <- struct{}{}:
			// Slot claimed; the caller must pair this with release.
		}
	}
	return nil
}

// release hands back a slot taken by a successful acquire by receiving one
// element from sem. With a nil sem (no limit configured) it does nothing.
func release(sem chan struct{}) {
	if sem == nil {
		return
	}
	<-sem
}

// contentClient implements BrowserClient using the /content endpoint.
// Instances are built by NewContentClient, which fills in every field.
type contentClient struct {
	cfg  Config        // settings passed to the constructor, with Timeout defaulted
	http *http.Client  // client used for all requests; its Timeout is cfg.Timeout
	sem  chan struct{} // concurrency limiter; nil when cfg.MaxConcurrent <= 0
}

// NewContentClient builds a BrowserClient backed by POST /content.
// A zero cfg.Timeout is replaced by 90 seconds, and cfg.MaxConcurrent sizes
// the semaphore that limits simultaneous requests.
func NewContentClient(cfg Config) BrowserClient {
	const fallbackTimeout = 90 * time.Second
	if cfg.Timeout == 0 {
		cfg.Timeout = fallbackTimeout
	}

	client := &contentClient{
		cfg: cfg,
		sem: makeSem(cfg.MaxConcurrent),
	}
	client.http = &http.Client{Timeout: cfg.Timeout}
	return client
}

// Strategy identifies this client as the /content implementation by
// returning StrategyContent.
func (c *contentClient) Strategy() Strategy {
	return StrategyContent
}

// GetContent POSTs req, encoded as JSON, to the Browserless /content endpoint
// and returns the response body as a string.
//
// If a concurrency limit is configured the call first waits for a free slot
// and holds it until it returns; when ctx is cancelled while waiting, the
// returned error wraps ctx.Err(). Any status other than 200 OK is reported as
// an error containing the status code and the response body.
func (c *contentClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
	if err := acquire(ctx, c.sem); err != nil {
		return "", fmt.Errorf("content: semaphore: %w", err)
	}
	defer release(c.sem)

	body, err := json.Marshal(req)
	if err != nil {
		return "", fmt.Errorf("content: marshal request: %w", err)
	}

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.cfg.BaseURL+"/content", bytes.NewReader(body))
	if err != nil {
		return "", fmt.Errorf("content: build request: %w", err)
	}
	httpReq.Header.Set("Content-Type", "application/json")

	if c.cfg.Token != "" {
		// Attach the token via the parsed query so it is query-escaped.
		// Concatenating it raw would let characters such as '&', '#', '+'
		// or spaces corrupt the URL or change the value the server sees.
		query := httpReq.URL.Query()
		query.Set("token", c.cfg.Token)
		httpReq.URL.RawQuery = query.Encode()
	}

	resp, err := c.http.Do(httpReq)
	if err != nil {
		return "", fmt.Errorf("content: do request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only enriches the message, so a failed read
		// is ignored and the status code is still reported.
		b, _ := io.ReadAll(resp.Body)
		return "", fmt.Errorf("content: unexpected status %d: %s", resp.StatusCode, b)
	}

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("content: read body: %w", err)
	}
	return string(raw), nil
}

// ScrapePage is not offered by the /content client. It ignores its arguments
// and always returns a zero ScrapeResponse with an error directing the caller
// to NewScrapeClient.
func (c *contentClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeResponse, error) {
	var none ScrapeResponse
	err := fmt.Errorf("content client does not support /scrape; use NewScrapeClient")
	return none, err
}

// CDPSession is not offered by the /content client. It ignores its arguments
// and always returns an error directing the caller to NewCDPClient.
func (c *contentClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
	unsupported := fmt.Errorf("content client does not support CDP; use NewCDPClient")
	return unsupported
}

// ─── /scrape client ───────────────────────────────────────────────────────────

// scrapeClient is the client type whose ScrapePage method talks to the
// /scrape endpoint. Instances are built by NewScrapeClient, which fills in
// every field.
type scrapeClient struct {
	cfg  Config        // settings passed to the constructor, with Timeout defaulted
	http *http.Client  // client used for all requests; its Timeout is cfg.Timeout
	sem  chan struct{} // concurrency limiter; nil when cfg.MaxConcurrent <= 0
}

// NewScrapeClient builds a BrowserClient backed by POST /scrape.
// A zero cfg.Timeout is replaced by 90 seconds, and cfg.MaxConcurrent sizes
// the semaphore that limits simultaneous requests.
func NewScrapeClient(cfg Config) BrowserClient {
	const fallbackTimeout = 90 * time.Second
	if cfg.Timeout == 0 {
		cfg.Timeout = fallbackTimeout
	}

	client := &scrapeClient{
		cfg: cfg,
		sem: makeSem(cfg.MaxConcurrent),
	}
	client.http = &http.Client{Timeout: cfg.Timeout}
	return client
}

// Strategy identifies this client as the /scrape implementation by
// returning StrategyScrape.
func (c *scrapeClient) Strategy() Strategy {
	return StrategyScrape
}

// GetContent is not offered by the /scrape client. It ignores its arguments
// and always returns an empty string with an error directing the caller to
// NewContentClient.
func (c *scrapeClient) GetContent(_ context.Context, _ ContentRequest) (string, error) {
	unsupported := fmt.Errorf("scrape client does not support /content; use NewContentClient")
	return "", unsupported
}

// ScrapePage POSTs req, encoded as JSON, to the Browserless /scrape endpoint
// and decodes the JSON response into a ScrapeResponse.
//
// If a concurrency limit is configured the call first waits for a free slot
// and holds it until it returns; when ctx is cancelled while waiting, the
// returned error wraps ctx.Err(). Any status other than 200 OK is reported as
// an error containing the status code and the response body. On every error
// path the returned ScrapeResponse is the zero value.
func (c *scrapeClient) ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error) {
	if err := acquire(ctx, c.sem); err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: semaphore: %w", err)
	}
	defer release(c.sem)

	body, err := json.Marshal(req)
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: marshal request: %w", err)
	}

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.cfg.BaseURL+"/scrape", bytes.NewReader(body))
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: build request: %w", err)
	}
	httpReq.Header.Set("Content-Type", "application/json")

	if c.cfg.Token != "" {
		// Attach the token via the parsed query so it is query-escaped.
		// Concatenating it raw would let characters such as '&', '#', '+'
		// or spaces corrupt the URL or change the value the server sees.
		query := httpReq.URL.Query()
		query.Set("token", c.cfg.Token)
		httpReq.URL.RawQuery = query.Encode()
	}

	resp, err := c.http.Do(httpReq)
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: do request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only enriches the message, so a failed read
		// is ignored and the status code is still reported.
		b, _ := io.ReadAll(resp.Body)
		return ScrapeResponse{}, fmt.Errorf("scrape: unexpected status %d: %s", resp.StatusCode, b)
	}

	var result ScrapeResponse
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return ScrapeResponse{}, fmt.Errorf("scrape: decode response: %w", err)
	}
	return result, nil
}

// CDPSession is not offered by the /scrape client. It ignores its arguments
// and always returns an error directing the caller to NewCDPClient.
func (c *scrapeClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
	unsupported := fmt.Errorf("scrape client does not support CDP; use NewCDPClient")
	return unsupported
}