Files
libnovel/scraper/internal/browser/http.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Fix cover extraction (select figure.cover > img rather than bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

69 lines
1.9 KiB
Go

package browser
import (
"context"
"fmt"
"io"
"net/http"
"time"
)
// httpClient implements BrowserClient using a plain net/http client,
// with no headless browser involved. It supports GetContent only;
// ScrapePage and CDPSession return errors.
type httpClient struct {
	cfg  Config        // client configuration (timeout, concurrency limit)
	http *http.Client  // underlying HTTP client; Timeout taken from cfg
	sem  chan struct{} // concurrency semaphore sized by cfg.MaxConcurrent
}
// NewDirectHTTPClient returns a BrowserClient that fetches pages over
// plain HTTP rather than through a headless browser. A zero cfg.Timeout
// defaults to 30 seconds.
func NewDirectHTTPClient(cfg Config) BrowserClient {
	timeout := cfg.Timeout
	if timeout == 0 {
		timeout = 30 * time.Second
	}
	cfg.Timeout = timeout

	client := &httpClient{
		cfg:  cfg,
		http: &http.Client{Timeout: timeout},
		sem:  makeSem(cfg.MaxConcurrent),
	}
	return client
}
// Strategy reports the fetch strategy this client implements.
func (c *httpClient) Strategy() Strategy {
	return StrategyDirect
}
// GetContent fetches req.URL with a plain HTTP GET and returns the raw
// response body as a string. It blocks on the client's concurrency
// semaphore first, sends browser-like headers to reduce the chance of
// being served a bot-detection page, and treats any status other than
// 200 OK as an error.
func (c *httpClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
	if err := acquire(ctx, c.sem); err != nil {
		return "", fmt.Errorf("http: semaphore: %w", err)
	}
	defer release(c.sem)

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, req.URL, nil)
	if err != nil {
		return "", fmt.Errorf("http: build request: %w", err)
	}
	// Mimic a desktop Chrome browser so origin servers return the normal
	// HTML page rather than a bot challenge.
	httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
	httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	httpReq.Header.Set("Accept-Language", "en-US,en;q=0.5")

	resp, err := c.http.Do(httpReq)
	if err != nil {
		return "", fmt.Errorf("http: do request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Include at most 2 KiB of the body in the error so a large error
		// page cannot balloon the error message (the previous code read
		// the entire body unbounded).
		const maxErrBody = 2 << 10
		b, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrBody))
		return "", fmt.Errorf("http: unexpected status %d: %s", resp.StatusCode, b)
	}

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("http: read body: %w", err)
	}
	return string(raw), nil
}
// ScrapePage is not supported by the direct HTTP client; structured
// scraping requires the browserless backend.
func (c *httpClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeResponse, error) {
	var zero ScrapeResponse
	return zero, fmt.Errorf("http client does not support ScrapePage; use browserless")
}
// CDPSession is not supported by the direct HTTP client; Chrome DevTools
// Protocol sessions require the browserless backend.
func (c *httpClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
	err := fmt.Errorf("http client does not support CDP; use browserless")
	return err
}