Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 1a49cb5e75 | | |
| | 1642434a79 | | |
@@ -12,6 +12,10 @@
|
||||
//
|
||||
// SCRAPER_WORKERS Chapter goroutine count (default: NumCPU)
|
||||
// SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
|
||||
// SCRAPER_PROXY Outbound proxy for all scraper requests, e.g.
|
||||
// http://user:pass@proxy-host:3128 — use a
|
||||
// residential proxy to bypass datacenter IP blocks.
|
||||
// Falls back to HTTP_PROXY / HTTPS_PROXY if unset.
|
||||
// KOKORO_URL Kokoro-FastAPI base URL (default: "")
|
||||
// KOKORO_VOICE Default TTS voice (default: af_bella)
|
||||
// POCKETBASE_URL PocketBase API base URL (default: http://localhost:8090)
|
||||
@@ -493,6 +497,7 @@ Environment variables:
|
||||
SCRAPER_WORKERS Chapter goroutines (default: NumCPU = %d)
|
||||
SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
|
||||
SCRAPER_TIMEOUT HTTP request timeout sec (default: 90)
|
||||
SCRAPER_PROXY Outbound proxy URL (default: "", falls back to HTTP_PROXY/HTTPS_PROXY)
|
||||
KOKORO_URL Kokoro-FastAPI base URL (default: "", TTS disabled)
|
||||
KOKORO_VOICE Default TTS voice (default: af_bella)
|
||||
POCKETBASE_URL PocketBase base URL (default: http://localhost:8090)
|
||||
|
||||
@@ -10,6 +10,7 @@ require (
|
||||
|
||||
require (
|
||||
github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c // indirect
|
||||
github.com/andybalholm/brotli v1.2.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/go-ini/ini v1.67.0 // indirect
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c h1:pxW6RcqyfI9/kWtOwnv/G+AzdKuy2ZrqINhenH4HyNs=
|
||||
github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
|
||||
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
|
||||
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
|
||||
@@ -1,11 +1,17 @@
|
||||
package browser
|
||||
|
||||
import (
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/andybalholm/brotli"
|
||||
)
|
||||
|
||||
type httpClient struct {
|
||||
@@ -18,11 +24,41 @@ func NewDirectHTTPClient(cfg Config) BrowserClient {
|
||||
if cfg.Timeout == 0 {
|
||||
cfg.Timeout = 30 * time.Second
|
||||
}
|
||||
return &httpClient{
|
||||
cfg: cfg,
|
||||
http: &http.Client{Timeout: cfg.Timeout},
|
||||
sem: makeSem(cfg.MaxConcurrent),
|
||||
|
||||
transport := http.DefaultTransport.(*http.Transport).Clone()
|
||||
|
||||
// Wire in the outbound proxy: SCRAPER_PROXY takes precedence when set; otherwise fall back to the environment (HTTP_PROXY / HTTPS_PROXY / NO_PROXY).
|
||||
// This lets operators route traffic through a residential proxy by simply
|
||||
// setting HTTPS_PROXY=http://user:pass@proxy-host:port without any code
|
||||
// changes — the standard approach for bypassing datacenter IP blocks.
|
||||
if proxyURL := proxyFromEnv(); proxyURL != nil {
|
||||
transport.Proxy = http.ProxyURL(proxyURL)
|
||||
} else {
|
||||
transport.Proxy = http.ProxyFromEnvironment
|
||||
}
|
||||
|
||||
return &httpClient{
|
||||
cfg: cfg,
|
||||
http: &http.Client{
|
||||
Timeout: cfg.Timeout,
|
||||
Transport: transport,
|
||||
},
|
||||
sem: makeSem(cfg.MaxConcurrent),
|
||||
}
|
||||
}
|
||||
|
||||
// proxyFromEnv returns the proxy URL configured through the SCRAPER_PROXY
// environment variable.
//
// It returns nil when SCRAPER_PROXY is unset, blank, or cannot be turned into
// a URL with a host, so the caller can fall back to another proxy source.
//
// Surrounding whitespace is ignored. A value without a scheme separator, such
// as "proxy-host:3128" or "127.0.0.1:3128", is interpreted as an HTTP proxy
// ("http://proxy-host:3128") instead of being silently discarded.
func proxyFromEnv() *url.URL {
	raw := strings.TrimSpace(os.Getenv("SCRAPER_PROXY"))
	if raw == "" {
		return nil
	}

	if u, err := url.Parse(raw); err == nil && u.Host != "" {
		return u
	}

	// A bare "host:port" either fails to parse or parses with the host name
	// mistaken for a scheme and an empty Host. Retry with an explicit scheme,
	// but only when the value did not already carry one: a value that has
	// "://" and still lacks a host is simply malformed.
	if strings.Contains(raw, "://") {
		return nil
	}
	u, err := url.Parse("http://" + raw)
	if err != nil || u.Host == "" {
		return nil
	}
	return u
}
|
||||
|
||||
// Strategy reports StrategyDirect as the strategy implemented by this client.
func (c *httpClient) Strategy() Strategy {
	return StrategyDirect
}
|
||||
@@ -37,9 +73,25 @@ func (c *httpClient) GetContent(ctx context.Context, req ContentRequest) (string
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("http: build request: %w", err)
|
||||
}
|
||||
httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
||||
httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||
httpReq.Header.Set("Accept-Language", "en-US,en;q=0.5")
|
||||
|
||||
// Mimic a real Chrome browser request to reduce bot-detection likelihood.
|
||||
// These headers match what Chrome 124 sends for a top-level navigation.
|
||||
httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
|
||||
httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
|
||||
httpReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
|
||||
httpReq.Header.Set("Accept-Encoding", "gzip, deflate, br")
|
||||
httpReq.Header.Set("Connection", "keep-alive")
|
||||
httpReq.Header.Set("Upgrade-Insecure-Requests", "1")
|
||||
httpReq.Header.Set("Sec-Fetch-Dest", "document")
|
||||
httpReq.Header.Set("Sec-Fetch-Mode", "navigate")
|
||||
httpReq.Header.Set("Sec-Fetch-Site", "none")
|
||||
httpReq.Header.Set("Sec-Fetch-User", "?1")
|
||||
httpReq.Header.Set("Cache-Control", "max-age=0")
|
||||
|
||||
// Set Referer for subsequent page requests (anything that is not the root).
|
||||
if parsed, pErr := url.Parse(req.URL); pErr == nil && parsed.Path != "" && parsed.Path != "/" {
|
||||
httpReq.Header.Set("Referer", parsed.Scheme+"://"+parsed.Host+"/")
|
||||
}
|
||||
|
||||
resp, err := c.http.Do(httpReq)
|
||||
if err != nil {
|
||||
@@ -52,7 +104,23 @@ func (c *httpClient) GetContent(ctx context.Context, req ContentRequest) (string
|
||||
return "", fmt.Errorf("http: unexpected status %d: %s", resp.StatusCode, b)
|
||||
}
|
||||
|
||||
raw, err := io.ReadAll(resp.Body)
|
||||
// Decompress gzip/br responses when the server honours Accept-Encoding.
|
||||
// net/http decompresses gzip automatically only when it sets the header
|
||||
// itself; since we set Accept-Encoding explicitly we must do it ourselves.
|
||||
body := resp.Body
|
||||
switch strings.ToLower(resp.Header.Get("Content-Encoding")) {
|
||||
case "gzip":
|
||||
gr, gzErr := gzip.NewReader(resp.Body)
|
||||
if gzErr != nil {
|
||||
return "", fmt.Errorf("http: gzip reader: %w", gzErr)
|
||||
}
|
||||
defer gr.Close()
|
||||
body = gr
|
||||
case "br":
|
||||
body = io.NopCloser(brotli.NewReader(resp.Body))
|
||||
}
|
||||
|
||||
raw, err := io.ReadAll(body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("http: read body: %w", err)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user