Compare commits

...

2 Commits
v1.0.8 ... v2

Author SHA1 Message Date
Admin
1a49cb5e75 fix(scraper): add Brotli decompression to HTTP client
Some checks failed
CI / Scraper / Lint (push) Failing after 29s
CI / Scraper / Lint (pull_request) Failing after 29s
CI / Scraper / Test (push) Failing after 38s
CI / Scraper / Docker Push (push) Has been skipped
CI / UI / Build (pull_request) Successful in 47s
CI / UI / Docker Push (pull_request) Has been skipped
CI / Scraper / Test (pull_request) Successful in 54s
CI / Scraper / Docker Push (pull_request) Has been skipped
iOS CI / Build (pull_request) Successful in 3m35s
iOS CI / Test (pull_request) Successful in 5m47s
novelfire.net responds with Content-Encoding: br when the scraper
advertises 'gzip, deflate, br'. The client only handled gzip, so
Brotli-compressed bytes were fed raw into the HTML parser producing
garbage — empty titles, zero chapters, and selector failures.

Added github.com/andybalholm/brotli and wired it into GetContent
alongside the existing gzip path.
2026-03-20 11:20:50 +05:00
Admin
1642434a79 feat(scraper): harden browser headers and add proxy support
All checks were successful
CI / Scraper / Lint (push) Successful in 8s
CI / Scraper / Lint (pull_request) Successful in 12s
CI / Scraper / Test (pull_request) Successful in 8s
CI / Scraper / Test (push) Successful in 16s
Release / Scraper / Test (push) Successful in 8s
Release / UI / Build (push) Successful in 24s
CI / Scraper / Docker Push (pull_request) Has been skipped
CI / UI / Build (pull_request) Successful in 30s
CI / Scraper / Docker Push (push) Successful in 39s
CI / UI / Docker Push (pull_request) Has been skipped
Release / UI / Docker (push) Successful in 40s
Release / Scraper / Docker (push) Successful in 50s
iOS CI / Build (pull_request) Successful in 3m7s
iOS CI / Test (pull_request) Successful in 5m23s
Upgrade DirectHTTPClient to send a full Chrome 124 header set
(Sec-Fetch-*, Accept-Encoding with gzip decompression, Referer,
Cache-Control) to reduce bot-detection false positives on WAFs.

Add SCRAPER_PROXY env var to route all outbound scrape requests
through a configurable proxy (residential or otherwise); falls back
to the standard HTTP_PROXY / HTTPS_PROXY env vars.
2026-03-14 18:40:32 +05:00
4 changed files with 84 additions and 8 deletions

View File

@@ -12,6 +12,10 @@
//
// SCRAPER_WORKERS Chapter goroutine count (default: NumCPU)
// SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
// SCRAPER_PROXY Outbound proxy for all scraper requests, e.g.
// http://user:pass@proxy-host:3128 — use a
// residential proxy to bypass datacenter IP blocks.
// Falls back to HTTP_PROXY / HTTPS_PROXY if unset.
// KOKORO_URL Kokoro-FastAPI base URL (default: "")
// KOKORO_VOICE Default TTS voice (default: af_bella)
// POCKETBASE_URL PocketBase API base URL (default: http://localhost:8090)
@@ -493,6 +497,7 @@ Environment variables:
SCRAPER_WORKERS Chapter goroutines (default: NumCPU = %d)
SCRAPER_HTTP_ADDR HTTP listen address (default: :8080)
SCRAPER_TIMEOUT HTTP request timeout sec (default: 90)
SCRAPER_PROXY Outbound proxy URL (default: "", falls back to HTTP_PROXY/HTTPS_PROXY)
KOKORO_URL Kokoro-FastAPI base URL (default: "", TTS disabled)
KOKORO_VOICE Default TTS voice (default: af_bella)
POCKETBASE_URL PocketBase base URL (default: http://localhost:8090)

View File

@@ -10,6 +10,7 @@ require (
require (
github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c // indirect
github.com/andybalholm/brotli v1.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/go-ini/ini v1.67.0 // indirect

View File

@@ -1,5 +1,7 @@
github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c h1:pxW6RcqyfI9/kWtOwnv/G+AzdKuy2ZrqINhenH4HyNs=
github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=

View File

@@ -1,11 +1,17 @@
package browser
import (
"compress/gzip"
"context"
"fmt"
"io"
"net/http"
"net/url"
"os"
"strings"
"time"
"github.com/andybalholm/brotli"
)
type httpClient struct {
@@ -18,11 +24,41 @@ func NewDirectHTTPClient(cfg Config) BrowserClient {
if cfg.Timeout == 0 {
cfg.Timeout = 30 * time.Second
}
return &httpClient{
cfg: cfg,
http: &http.Client{Timeout: cfg.Timeout},
sem: makeSem(cfg.MaxConcurrent),
transport := http.DefaultTransport.(*http.Transport).Clone()
// Wire in proxy from environment (HTTP_PROXY / HTTPS_PROXY / NO_PROXY).
// This lets operators route traffic through a residential proxy by simply
// setting HTTPS_PROXY=http://user:pass@proxy-host:port without any code
// changes — the standard approach for bypassing datacenter IP blocks.
if proxyURL := proxyFromEnv(); proxyURL != nil {
// NOTE(review): http.ProxyURL proxies every request unconditionally — unlike
// http.ProxyFromEnvironment it does not honour NO_PROXY. If this client is
// ever used for internal endpoints, SCRAPER_PROXY would route those through
// the proxy too — confirm it is only used for external scrape targets.
transport.Proxy = http.ProxyURL(proxyURL)
} else {
transport.Proxy = http.ProxyFromEnvironment
}
return &httpClient{
cfg: cfg,
http: &http.Client{
// Client-level timeout bounds the whole request (dial + headers + body).
Timeout: cfg.Timeout,
Transport: transport,
},
sem: makeSem(cfg.MaxConcurrent),
}
}
// proxyFromEnv returns an explicit proxy URL if SCRAPER_PROXY is set, otherwise
// nil (and http.ProxyFromEnvironment handles the standard HTTP_PROXY / HTTPS_PROXY).
//
// An unparsable or host-less SCRAPER_PROXY value is reported on stderr and then
// ignored (falling back to the environment proxies) rather than silently
// dropped — otherwise a typo in the proxy URL would quietly send every scrape
// request from the datacenter IP the operator was trying to avoid.
func proxyFromEnv() *url.URL {
	raw := os.Getenv("SCRAPER_PROXY")
	if raw == "" {
		return nil
	}
	u, err := url.Parse(raw)
	if err != nil || u.Host == "" {
		// Warn instead of failing hard: the scraper can still run without
		// the proxy, but the operator must be able to see why it is ignored.
		fmt.Fprintf(os.Stderr, "scraper: ignoring invalid SCRAPER_PROXY %q (parse error: %v)\n", raw, err)
		return nil
	}
	return u
}
func (c *httpClient) Strategy() Strategy { return StrategyDirect }
@@ -37,9 +73,25 @@ func (c *httpClient) GetContent(ctx context.Context, req ContentRequest) (string
if err != nil {
return "", fmt.Errorf("http: build request: %w", err)
}
httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
httpReq.Header.Set("Accept-Language", "en-US,en;q=0.5")
// Mimic a real Chrome browser request to reduce bot-detection likelihood.
// These headers match what Chrome 124 sends for a top-level navigation.
httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
httpReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
// NOTE(review): we advertise "deflate" here but the decompression switch
// below only handles gzip and br — either handle deflate or drop it from
// this header so servers never pick it.
httpReq.Header.Set("Accept-Encoding", "gzip, deflate, br")
httpReq.Header.Set("Connection", "keep-alive")
httpReq.Header.Set("Upgrade-Insecure-Requests", "1")
httpReq.Header.Set("Sec-Fetch-Dest", "document")
httpReq.Header.Set("Sec-Fetch-Mode", "navigate")
// NOTE(review): "none" signals a user-typed (address-bar) navigation, yet a
// Referer is also set below for non-root paths — presumably real Chrome
// would send "same-origin"/"cross-site" alongside a Referer, so a WAF that
// cross-checks header consistency could flag this combination. Verify
// against captured Chrome 124 traffic.
httpReq.Header.Set("Sec-Fetch-Site", "none")
httpReq.Header.Set("Sec-Fetch-User", "?1")
httpReq.Header.Set("Cache-Control", "max-age=0")
// Set Referer for subsequent page requests (anything that is not the root).
if parsed, pErr := url.Parse(req.URL); pErr == nil && parsed.Path != "" && parsed.Path != "/" {
httpReq.Header.Set("Referer", parsed.Scheme+"://"+parsed.Host+"/")
}
resp, err := c.http.Do(httpReq)
if err != nil {
@@ -52,7 +104,23 @@ func (c *httpClient) GetContent(ctx context.Context, req ContentRequest) (string
return "", fmt.Errorf("http: unexpected status %d: %s", resp.StatusCode, b)
}
raw, err := io.ReadAll(resp.Body)
// Decompress gzip/br responses when the server honours Accept-Encoding.
// net/http decompresses gzip automatically only when it sets the header
// itself; since we set Accept-Encoding explicitly we must do it ourselves.
//
// NOTE(review): the request advertises "gzip, deflate, br" but there is no
// "deflate" case below — a server replying Content-Encoding: deflate would
// feed raw compressed bytes into the HTML parser, the exact failure mode
// this commit fixes for br. Add a compress/zlib (or flate) branch, or stop
// advertising deflate. Also note Content-Encoding may carry a list
// ("br, gzip"); an exact-match switch treats that as identity encoding.
body := resp.Body
switch strings.ToLower(resp.Header.Get("Content-Encoding")) {
case "gzip":
gr, gzErr := gzip.NewReader(resp.Body)
if gzErr != nil {
return "", fmt.Errorf("http: gzip reader: %w", gzErr)
}
// Close the gzip reader at function exit to release its state; the
// underlying resp.Body is presumably closed by a deferred call added
// earlier in this function (not visible in this hunk) — confirm.
defer gr.Close()
body = gr
case "br":
// brotli.NewReader returns a plain io.Reader; wrap it so body keeps the
// io.ReadCloser type declared above.
body = io.NopCloser(brotli.NewReader(resp.Body))
}
raw, err := io.ReadAll(body)
if err != nil {
return "", fmt.Errorf("http: read body: %w", err)
}