Update all default URLs, port mappings, healthcheck endpoints, Dockerfile ENV defaults, and integration test run instructions to use port 3030.
153 lines
4.9 KiB
Go
153 lines
4.9 KiB
Go
//go:build integration
|
|
|
|
// Integration tests for the Browserless /content API.
|
|
//
|
|
// These tests require a live Browserless instance and are gated behind the
|
|
// "integration" build tag so they never run in normal `go test ./...` passes.
|
|
//
|
|
// Run them with (drop the BROWSERLESS_TOKEN line if auth is disabled):
//
//	BROWSERLESS_URL=http://localhost:3030 \
//	BROWSERLESS_TOKEN=your-token \
//	go test -v -tags integration -timeout 120s \
//	  github.com/libnovel/scraper/internal/browser
|
|
package browser_test
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
"strings"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/libnovel/scraper/internal/browser"
|
|
)
|
|
|
|
// Shared fixture for the integration tests in this file.
const (
	// chapterURL points at the novelfire chapter page that every integration
	// test in this file asks Browserless to render.
	chapterURL = "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1"
)
|
|
|
|
// newIntegrationClient reads BROWSERLESS_URL / BROWSERLESS_TOKEN from the
|
|
// environment and returns a configured contentClient.
|
|
// The test is skipped when BROWSERLESS_URL is not set.
|
|
func newIntegrationClient(t *testing.T) browser.BrowserClient {
|
|
t.Helper()
|
|
baseURL := os.Getenv("BROWSERLESS_URL")
|
|
if baseURL == "" {
|
|
t.Skip("BROWSERLESS_URL not set — skipping integration test")
|
|
}
|
|
return browser.NewContentClient(browser.Config{
|
|
BaseURL: baseURL,
|
|
Token: os.Getenv("BROWSERLESS_TOKEN"),
|
|
// Use a generous per-request HTTP timeout so the wait-for-selector
|
|
// (75 s) doesn't get cut off by the transport layer.
|
|
Timeout: 120 * time.Second,
|
|
MaxConcurrent: 1,
|
|
})
|
|
}
|
|
|
|
// TestIntegration_ChapterContent_ReturnsHTML verifies that a POST /content
|
|
// request with the production wait-for-selector settings succeeds and that the
|
|
// returned HTML contains the #content div expected on novelfire chapter pages.
|
|
func TestIntegration_ChapterContent_ReturnsHTML(t *testing.T) {
|
|
client := newIntegrationClient(t)
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
|
|
defer cancel()
|
|
|
|
req := browser.ContentRequest{
|
|
URL: chapterURL,
|
|
WaitFor: &browser.WaitForSelector{
|
|
Selector: "#content",
|
|
Timeout: 5000,
|
|
},
|
|
RejectResourceTypes: productionRejectTypes(),
|
|
}
|
|
|
|
html, err := client.GetContent(ctx, req)
|
|
if err != nil {
|
|
t.Fatalf("GetContent failed: %v", err)
|
|
}
|
|
|
|
// The #content div must not be empty; presence of <p> tags inside it is a
|
|
// reliable indicator that chapter paragraphs were rendered.
|
|
contentIdx := strings.Index(html, `id="content"`)
|
|
if contentIdx == -1 {
|
|
t.Fatalf("id=\"content\" not found in response (%d bytes)", len(html))
|
|
}
|
|
|
|
// Look for <p> tags after the #content marker — the chapter text lives there.
|
|
afterContent := html[contentIdx:]
|
|
if !strings.Contains(afterContent, "<p") {
|
|
t.Errorf("#content section contains no <p> tags; JS rendering may have failed.\nSection preview:\n%s",
|
|
truncate(afterContent, 1000))
|
|
}
|
|
|
|
t.Logf("chapter content section starts at byte %d (total response: %d bytes)", contentIdx, len(html))
|
|
}
|
|
|
|
// TestIntegration_ChapterContent_TimeoutSurfacedCorrectly verifies that a
|
|
// deliberately too-short timeout returns an error containing "TimeoutError" (the
|
|
// Browserless error string seen in the failing log entry). This ensures our
|
|
// error-classification logic in retryGetContent matches real Browserless output.
|
|
func TestIntegration_ChapterContent_TimeoutSurfacedCorrectly(t *testing.T) {
|
|
client := newIntegrationClient(t)
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 40*time.Second)
|
|
defer cancel()
|
|
|
|
req := browser.ContentRequest{
|
|
URL: chapterURL,
|
|
WaitFor: &browser.WaitForSelector{
|
|
Selector: "#content",
|
|
Timeout: 500, // intentionally too short (500 ms) → Browserless will time out
|
|
},
|
|
RejectResourceTypes: productionRejectTypes(),
|
|
}
|
|
|
|
_, err := client.GetContent(ctx, req)
|
|
if err == nil {
|
|
t.Fatal("expected a timeout error from Browserless, but GetContent succeeded — " +
|
|
"the page may now load very fast; adjust the timeout threshold")
|
|
}
|
|
|
|
t.Logf("got expected error: %v", err)
|
|
|
|
// Browserless wraps navigation timeouts in a 500 response with
|
|
// "TimeoutError: Navigation timeout" in the body — this is the exact
|
|
// error that is triggering retries in production.
|
|
if !strings.Contains(err.Error(), "500") {
|
|
t.Errorf("expected HTTP 500 status in error, got: %v", err)
|
|
}
|
|
}
|
|
|
|
// ── helpers ───────────────────────────────────────────────────────────────────
|
|
|
|
// productionRejectTypes returns the resource-type block-list that the
// integration tests send as RejectResourceTypes. It is intended to mirror the
// list the novelfire scraper uses in production, so the tests exercise the
// same request shape.
//
// Every call returns a freshly allocated slice, so callers may modify it.
func productionRejectTypes() []string {
	blocked := [...]string{
		"cspviolationreport",
		"eventsource",
		"fedcm",
		"font",
		"image",
		"manifest",
		"media",
		"other",
		"ping",
		"signedexchange",
		"stylesheet",
		"texttrack",
		"websocket",
	}
	return blocked[:]
}
|
|
|
|
// truncate shortens s to at most n bytes for use in log and failure output.
//
// If s already fits within n bytes it is returned unchanged. Otherwise the
// result is a prefix of s followed by "…". The cut never lands inside a
// multi-byte UTF-8 sequence: when byte n falls in the middle of a rune, the
// prefix is shortened to the preceding rune boundary so the output stays valid
// UTF-8. A negative n is treated as 0.
func truncate(s string, n int) string {
	if n < 0 {
		n = 0
	}
	if len(s) <= n {
		return s
	}

	// s[cut] is the first byte that would be dropped. While it is a UTF-8
	// continuation byte (bit pattern 10xxxxxx), the prefix s[:cut] would end
	// mid-rune, so move the cut back to the start of that rune.
	cut := n
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut] + "…"
}
|