// Package browser provides a rate-limited HTTP client for web scraping.
package browser

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strconv"
	"sync"
	"time"
)

// ErrRateLimit is the sentinel matched by errors.Is for rate-limited
// requests. When the server responds with 429, GetContent returns a
// *RateLimitError, which satisfies errors.Is(err, ErrRateLimit) and carries
// the suggested retry delay (from the Retry-After header, or a default).
var ErrRateLimit = errors.New("rate limited (429)")

// RateLimitError wraps ErrRateLimit and carries the suggested wait duration.
type RateLimitError struct {
	// RetryAfter is how long the caller should wait before retrying.
	// Derived from the Retry-After response header when present; otherwise a default.
	RetryAfter time.Duration
}

func (e *RateLimitError) Error() string {
	return fmt.Sprintf("rate limited (429): retry after %s", e.RetryAfter)
}

// Is makes errors.Is(err, ErrRateLimit) report true for *RateLimitError values.
func (e *RateLimitError) Is(target error) bool { return target == ErrRateLimit }

// defaultRateLimitDelay is used when the server returns 429 with no Retry-After header.
const defaultRateLimitDelay = 60 * time.Second
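
// classifyRateLimit is an illustrative sketch, not part of the original API:
// it demonstrates the sentinel/wrapper matching semantics defined above.
// errors.As recovers the concrete *RateLimitError to read its delay, while
// errors.Is still matches the bare sentinel via the Is method.
func classifyRateLimit(err error) (time.Duration, bool) {
	var rle *RateLimitError
	if errors.As(err, &rle) {
		// Concrete wrapper: use the server-suggested delay.
		return rle.RetryAfter, true
	}
	if errors.Is(err, ErrRateLimit) {
		// Bare sentinel: fall back to the package default.
		return defaultRateLimitDelay, true
	}
	return 0, false
}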

// Client is the interface used by scrapers to fetch raw page HTML.
// Implementations must be safe for concurrent use.
type Client interface {
	// GetContent fetches the URL and returns the full response body as a string.
	// It should respect the provided context for cancellation and timeouts.
	GetContent(ctx context.Context, pageURL string) (string, error)
}
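
// fetchWithBackoff is a usage sketch for Client (the attempt cap of 3 is an
// arbitrary assumption, not a value from this package): it retries only on
// rate-limit errors, waits for the suggested delay between attempts, and
// fails fast on everything else.
func fetchWithBackoff(ctx context.Context, c Client, pageURL string) (string, error) {
	var lastErr error
	for attempt := 0; attempt < 3; attempt++ {
		html, err := c.GetContent(ctx, pageURL)
		if err == nil {
			return html, nil
		}
		delay, rateLimited := classifyRateLimit(err)
		if !rateLimited {
			return "", err // not a rate limit: do not retry
		}
		lastErr = err
		select {
		case <-time.After(delay):
		case <-ctx.Done():
			return "", ctx.Err()
		}
	}
	return "", lastErr
}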

// Config holds tunable parameters for the direct HTTP client.
type Config struct {
	// MaxConcurrent limits the number of simultaneous in-flight requests.
	// Defaults to 5 when 0.
	MaxConcurrent int

	// Timeout is the per-request deadline. Defaults to 90s when 0.
	Timeout time.Duration
}

// DirectClient is a plain net/http-based Client with a concurrency semaphore.
type DirectClient struct {
	http      *http.Client
	semaphore chan struct{}
}

// NewDirectClient returns a DirectClient configured by cfg, applying the
// documented defaults to zero values.
func NewDirectClient(cfg Config) *DirectClient {
	if cfg.MaxConcurrent <= 0 {
		cfg.MaxConcurrent = 5
	}
	if cfg.Timeout <= 0 {
		cfg.Timeout = 90 * time.Second
	}

	// Keep enough idle connections per host for the full concurrency limit,
	// so keep-alive connections are reused rather than re-dialed.
	transport := &http.Transport{
		MaxIdleConnsPerHost: cfg.MaxConcurrent * 2,
		DisableCompression:  false,
	}

	return &DirectClient{
		http: &http.Client{
			Transport: transport,
			Timeout:   cfg.Timeout,
		},
		semaphore: make(chan struct{}, cfg.MaxConcurrent),
	}
}
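
// newClientSketch shows construction; the values are illustrative assumptions,
// not recommendations. Zero-valued Config fields fall back to the defaults
// documented above (5 concurrent requests, 90s timeout).
func newClientSketch() *DirectClient {
	return NewDirectClient(Config{
		MaxConcurrent: 3,                // at most 3 requests in flight
		Timeout:       30 * time.Second, // per-request deadline
	})
}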

// GetContent fetches pageURL respecting the concurrency limit.
func (c *DirectClient) GetContent(ctx context.Context, pageURL string) (string, error) {
	// Acquire a semaphore slot, or give up if the context is cancelled first.
	select {
	case c.semaphore <- struct{}{}:
	case <-ctx.Done():
		return "", ctx.Err()
	}
	defer func() { <-c.semaphore }()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, pageURL, nil)
	if err != nil {
		return "", fmt.Errorf("browser: build request %s: %w", pageURL, err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; libnovel-runner/2)")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept-Language", "en-US,en;q=0.5")

	resp, err := c.http.Do(req)
	if err != nil {
		return "", fmt.Errorf("browser: GET %s: %w", pageURL, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusTooManyRequests {
		// Honor a numeric Retry-After header when present; HTTP-date values
		// are not parsed and fall back to the package default.
		delay := defaultRateLimitDelay
		if ra := resp.Header.Get("Retry-After"); ra != "" {
			if secs, err := strconv.Atoi(ra); err == nil && secs > 0 {
				delay = time.Duration(secs) * time.Second
			}
		}
		return "", &RateLimitError{RetryAfter: delay}
	}

	if resp.StatusCode >= 400 {
		return "", fmt.Errorf("browser: GET %s returned %d", pageURL, resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("browser: read body %s: %w", pageURL, err)
	}
	return string(body), nil
}
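
// concurrentFetchSketch is an illustrative sketch, not original code: callers
// can fan out one goroutine per URL, and the client's semaphore (rather than
// the caller) caps how many requests are actually in flight at once.
func concurrentFetchSketch(ctx context.Context, c Client, urls []string) map[string]error {
	var (
		mu   sync.Mutex
		wg   sync.WaitGroup
		errs = make(map[string]error, len(urls))
	)
	for _, u := range urls {
		wg.Add(1)
		go func(u string) {
			defer wg.Done()
			_, err := c.GetContent(ctx, u)
			mu.Lock()
			errs[u] = err
			mu.Unlock()
		}(u)
	}
	wg.Wait()
	return errs
}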

// Do implements httputil.Client so DirectClient can be passed to RetryGet.
// It acquires the same concurrency semaphore as GetContent before delegating
// to the underlying http.Client.
func (c *DirectClient) Do(req *http.Request) (*http.Response, error) {
	select {
	case c.semaphore <- struct{}{}:
	case <-req.Context().Done():
		return nil, req.Context().Err()
	}
	defer func() { <-c.semaphore }()
	return c.http.Do(req)
}

// ── Stub for testing ──────────────────────────────────────────────────────────

// StubClient is a test double for Client. It returns pre-configured responses
// keyed on URL. Calls to unknown URLs return an error.
type StubClient struct {
	mu      sync.Mutex
	pages   map[string]string
	errors  map[string]error
	callLog []string
}

// NewStub creates a StubClient with no pages pre-loaded.
func NewStub() *StubClient {
	return &StubClient{
		pages:  make(map[string]string),
		errors: make(map[string]error),
	}
}

// SetPage registers a URL → HTML body mapping.
func (s *StubClient) SetPage(u, html string) {
	s.mu.Lock()
	s.pages[u] = html
	s.mu.Unlock()
}

// SetError registers a URL → error mapping (returned instead of a body).
func (s *StubClient) SetError(u string, err error) {
	s.mu.Lock()
	s.errors[u] = err
	s.mu.Unlock()
}

// CallLog returns the ordered list of URLs that were requested.
func (s *StubClient) CallLog() []string {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]string, len(s.callLog))
	copy(out, s.callLog)
	return out
}

// GetContent records the call, then returns the registered error or page for
// the URL. Errors take precedence when both are registered for the same URL.
func (s *StubClient) GetContent(_ context.Context, pageURL string) (string, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.callLog = append(s.callLog, pageURL)
	if err, ok := s.errors[pageURL]; ok {
		return "", err
	}
	if html, ok := s.pages[pageURL]; ok {
		return html, nil
	}
	return "", fmt.Errorf("stub: no page registered for %q", pageURL)
}
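
// stubUsageSketch illustrates the intended test wiring (the URLs and bodies
// are made-up fixtures, not values from this package): register pages and
// errors up front, exercise the code under test, then assert on CallLog.
func stubUsageSketch(ctx context.Context) error {
	stub := NewStub()
	stub.SetPage("https://example.com/ch1", "<html>chapter one</html>")
	stub.SetError("https://example.com/ch2", &RateLimitError{RetryAfter: time.Second})

	if _, err := stub.GetContent(ctx, "https://example.com/ch1"); err != nil {
		return fmt.Errorf("unexpected error: %w", err)
	}
	if _, err := stub.GetContent(ctx, "https://example.com/ch2"); !errors.Is(err, ErrRateLimit) {
		return fmt.Errorf("expected rate-limit error, got %v", err)
	}

	// CallLog preserves request order, so tests can assert on it directly.
	if calls := stub.CallLog(); len(calls) != 2 {
		return fmt.Errorf("expected 2 logged calls, got %d", len(calls))
	}
	return nil
}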