Files
libnovel/backend/internal/browser/browser.go
Admin 59e8cdb19a
Some checks failed
CI / v3 / Check ui (pull_request) Failing after 15s
CI / v3 / Test backend (pull_request) Failing after 16s
CI / v3 / Docker / backend (pull_request) Has been skipped
CI / v3 / Docker / runner (pull_request) Has been skipped
CI / v3 / Docker / ui (pull_request) Has been skipped
chore: migrate to v3, Doppler secrets, clean up legacy code
- Remove all pre-v3 code: scraper, ui-v2, backend v1, ios v1+v2, legacy CI workflows
- Flatten v3/ contents to repo root
- Add Doppler secrets management (project=libnovel, config=prd)
- Add justfile with doppler run wrappers for all docker compose commands
- Strip hardcoded env fallbacks from docker-compose.yml
- Add minimal README.md
- Clean up .gitignore
2026-03-23 17:21:12 +05:00

192 lines
5.5 KiB
Go

// Package browser provides a rate-limited HTTP client for web scraping.
package browser
import (
"context"
"errors"
"fmt"
"io"
"net/http"
"strconv"
"sync"
"time"
)
// ErrRateLimit is the sentinel for an HTTP 429 (Too Many Requests) response.
// The sentinel itself holds no delay: match it with errors.Is, then use
// errors.As with *RateLimitError to read the suggested wait.
var ErrRateLimit = errors.New("rate limited (429)")

// RateLimitError is the concrete rate-limit error. It reports itself as
// ErrRateLimit to errors.Is and records how long the caller should back off.
type RateLimitError struct {
	// RetryAfter is the wait the caller should observe before the next attempt.
	// It comes from the Retry-After response header when one is usable, and
	// from a default otherwise.
	RetryAfter time.Duration
}

// Error renders the error text, including the suggested retry delay.
func (e *RateLimitError) Error() string {
	const format = "rate limited (429): retry after %s"
	return fmt.Sprintf(format, e.RetryAfter)
}

// Is makes errors.Is(err, ErrRateLimit) succeed for any *RateLimitError.
func (e *RateLimitError) Is(target error) bool {
	return ErrRateLimit == target
}
// defaultRateLimitDelay is the fallback wait reported for a 429 response when
// the Retry-After header is missing or does not yield a usable, positive delay.
const defaultRateLimitDelay = 60 * time.Second
// Client is the interface used by scrapers to fetch raw page HTML.
// Implementations must be safe for concurrent use.
type Client interface {
// GetContent fetches pageURL and returns the full response body as a string.
// Implementations should respect ctx for cancellation and timeouts.
// The implementations in this file return an empty string together with any
// non-nil error.
GetContent(ctx context.Context, pageURL string) (string, error)
}
// Config holds tunable parameters for the direct HTTP client.
// NewDirectClient replaces zero or negative values with the defaults below.
type Config struct {
// MaxConcurrent limits the number of simultaneous in-flight requests.
// NewDirectClient also sets the transport's per-host idle-connection limit
// to twice this value.
// Defaults to 5 when zero or negative.
MaxConcurrent int
// Timeout is the per-request deadline, applied as the http.Client timeout.
// Defaults to 90s when zero or negative.
Timeout time.Duration
}
// DirectClient is a plain net/http-based Client with a concurrency semaphore.
// Construct it with NewDirectClient: the zero value has a nil http client and
// a nil semaphore channel, so it is not usable.
type DirectClient struct {
// http is the underlying client used to send every request.
http *http.Client
// semaphore is a buffered channel used as a counting semaphore: sending a
// value takes a slot, receiving one frees it, and the channel capacity is the
// concurrency limit.
semaphore chan struct{}
}
// NewDirectClient builds a DirectClient from cfg.
//
// A MaxConcurrent of zero or less becomes 5 and a Timeout of zero or less
// becomes 90 seconds. The resulting limit sets the semaphore capacity and,
// doubled, the transport's per-host idle-connection limit.
func NewDirectClient(cfg Config) *DirectClient {
	limit, timeout := cfg.MaxConcurrent, cfg.Timeout
	if limit <= 0 {
		limit = 5
	}
	if timeout <= 0 {
		timeout = 90 * time.Second
	}

	httpClient := &http.Client{
		Timeout: timeout,
		Transport: &http.Transport{
			MaxIdleConnsPerHost: 2 * limit,
			// Stated explicitly: transparent response compression stays enabled.
			DisableCompression: false,
		},
	}

	return &DirectClient{
		http:      httpClient,
		semaphore: make(chan struct{}, limit),
	}
}
// GetContent fetches pageURL with a GET request and returns the response body
// as a string. One concurrency slot is held for the whole call, including the
// body read.
//
// It returns ctx.Err() if ctx is done before a slot becomes free, a
// *RateLimitError (which matches ErrRateLimit) when the server answers 429,
// and a wrapped error when the request cannot be built or sent, when any other
// status >= 400 is returned, or when reading the body fails.
func (c *DirectClient) GetContent(ctx context.Context, pageURL string) (string, error) {
	// Acquire a semaphore slot, giving up if the caller cancels first.
	select {
	case c.semaphore <- struct{}{}:
	case <-ctx.Done():
		return "", ctx.Err()
	}
	defer func() { <-c.semaphore }()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, pageURL, nil)
	if err != nil {
		return "", fmt.Errorf("browser: build request %s: %w", pageURL, err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; libnovel-runner/2)")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept-Language", "en-US,en;q=0.5")

	resp, err := c.http.Do(req)
	if err != nil {
		return "", fmt.Errorf("browser: GET %s: %w", pageURL, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusTooManyRequests {
		delay := retryAfterDelay(resp.Header.Get("Retry-After"), time.Now())
		return "", &RateLimitError{RetryAfter: delay}
	}
	if resp.StatusCode >= 400 {
		return "", fmt.Errorf("browser: GET %s returned %d", pageURL, resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("browser: read body %s: %w", pageURL, err)
	}
	return string(body), nil
}

// retryAfterDelay converts a Retry-After header value into a wait duration.
//
// The header may be either a whole number of seconds or an HTTP-date. A date
// is converted to the time remaining from now. An empty or unparsable value, a
// non-positive number of seconds, or a date that is not after now all yield
// defaultRateLimitDelay.
func retryAfterDelay(value string, now time.Time) time.Duration {
	if value == "" {
		return defaultRateLimitDelay
	}
	if secs, err := strconv.Atoi(value); err == nil {
		if secs > 0 {
			return time.Duration(secs) * time.Second
		}
		return defaultRateLimitDelay
	}
	// Not an integer: try the HTTP-date form.
	if at, err := http.ParseTime(value); err == nil {
		if wait := at.Sub(now); wait > 0 {
			return wait
		}
	}
	return defaultRateLimitDelay
}
// Do sends req through the underlying HTTP client while holding one
// concurrency slot. It implements httputil.Client so that a DirectClient can
// be passed to RetryGet.
//
// If the request's context is done before a slot becomes free, Do returns the
// context's error and sends nothing. The slot is released when Do returns,
// not when the caller finishes reading or closes the response body.
func (c *DirectClient) Do(req *http.Request) (*http.Response, error) {
	ctx := req.Context()

	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case c.semaphore <- struct{}{}:
		// Slot acquired.
	}
	defer func() { <-c.semaphore }()

	return c.http.Do(req)
}
// ── Stub for testing ──────────────────────────────────────────────────────────
// StubClient is an in-memory Client for tests. Each URL can be given either a
// canned HTML body or a canned error; a URL with neither produces an error.
// Every GetContent call is recorded, and all state is guarded by mu.
type StubClient struct {
	mu      sync.Mutex
	pages   map[string]string // URL -> body returned on success
	errors  map[string]error  // URL -> error returned instead of a body
	callLog []string          // requested URLs, in call order
}

// NewStub returns an empty StubClient: no pages, no errors, no recorded calls.
func NewStub() *StubClient {
	stub := new(StubClient)
	stub.pages = map[string]string{}
	stub.errors = map[string]error{}
	return stub
}

// SetPage makes GetContent return html for URL u.
func (s *StubClient) SetPage(u, html string) {
	s.mu.Lock()
	s.pages[u] = html
	s.mu.Unlock()
}

// SetError makes GetContent return err for URL u. A registered error takes
// priority over a page registered for the same URL.
func (s *StubClient) SetError(u string, err error) {
	s.mu.Lock()
	s.errors[u] = err
	s.mu.Unlock()
}

// CallLog returns a copy of the requested URLs in call order. Changing the
// returned slice does not affect the stub.
func (s *StubClient) CallLog() []string {
	s.mu.Lock()
	defer s.mu.Unlock()

	snapshot := make([]string, 0, len(s.callLog))
	return append(snapshot, s.callLog...)
}

// GetContent records pageURL and then answers from the registered data: the
// registered error if there is one, otherwise the registered page, otherwise
// a "no page registered" error. The context argument is ignored.
func (s *StubClient) GetContent(_ context.Context, pageURL string) (string, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.callLog = append(s.callLog, pageURL)

	if stubErr, failing := s.errors[pageURL]; failing {
		return "", stubErr
	}
	body, known := s.pages[pageURL]
	if !known {
		return "", fmt.Errorf("stub: no page registered for %q", pageURL)
	}
	return body, nil
}