fix(scraper): add Brotli decompression to HTTP client
Some checks failed
CI / Scraper / Lint (push) Failing after 29s
CI / Scraper / Lint (pull_request) Failing after 29s
CI / Scraper / Test (push) Failing after 38s
CI / Scraper / Docker Push (push) Has been skipped
CI / UI / Build (pull_request) Successful in 47s
CI / UI / Docker Push (pull_request) Has been skipped
CI / Scraper / Test (pull_request) Successful in 54s
CI / Scraper / Docker Push (pull_request) Has been skipped
iOS CI / Build (pull_request) Successful in 3m35s
iOS CI / Test (pull_request) Successful in 5m47s
Some checks failed
CI / Scraper / Lint (push) Failing after 29s
CI / Scraper / Lint (pull_request) Failing after 29s
CI / Scraper / Test (push) Failing after 38s
CI / Scraper / Docker Push (push) Has been skipped
CI / UI / Build (pull_request) Successful in 47s
CI / UI / Docker Push (pull_request) Has been skipped
CI / Scraper / Test (pull_request) Successful in 54s
CI / Scraper / Docker Push (pull_request) Has been skipped
iOS CI / Build (pull_request) Successful in 3m35s
iOS CI / Test (pull_request) Successful in 5m47s
novelfire.net responds with Content-Encoding: br when the scraper advertises 'gzip, deflate, br'. The client only handled gzip, so Brotli-compressed bytes were fed raw into the HTML parser, producing garbage: empty titles, zero chapters, and selector failures. Added github.com/andybalholm/brotli and wired it into GetContent alongside the existing gzip path.
This commit is contained in:
@@ -10,6 +10,7 @@ require (
|
||||
|
||||
require (
|
||||
github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c // indirect
|
||||
github.com/andybalholm/brotli v1.2.0
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/go-ini/ini v1.67.0 // indirect
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c h1:pxW6RcqyfI9/kWtOwnv/G+AzdKuy2ZrqINhenH4HyNs=
|
||||
github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
|
||||
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
|
||||
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
|
||||
@@ -10,6 +10,8 @@ import (
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/andybalholm/brotli"
|
||||
)
|
||||
|
||||
type httpClient struct {
|
||||
@@ -106,16 +108,17 @@ func (c *httpClient) GetContent(ctx context.Context, req ContentRequest) (string
|
||||
// net/http decompresses gzip automatically only when it sets the header
|
||||
// itself; since we set Accept-Encoding explicitly we must do it ourselves.
|
||||
body := resp.Body
|
||||
if strings.EqualFold(resp.Header.Get("Content-Encoding"), "gzip") {
|
||||
switch strings.ToLower(resp.Header.Get("Content-Encoding")) {
|
||||
case "gzip":
|
||||
gr, gzErr := gzip.NewReader(resp.Body)
|
||||
if gzErr != nil {
|
||||
return "", fmt.Errorf("http: gzip reader: %w", gzErr)
|
||||
}
|
||||
defer gr.Close()
|
||||
body = gr
|
||||
case "br":
|
||||
body = io.NopCloser(brotli.NewReader(resp.Body))
|
||||
}
|
||||
// br (Brotli) decompression requires an external package; skip for now —
|
||||
// the server will fall back to gzip or plain text for unknown encodings.
|
||||
|
||||
raw, err := io.ReadAll(body)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user