// libnovel/scraper/internal/browser/integration_test.go

//go:build integration

// Integration tests for the Browserless /content API.
//
// These tests require a live Browserless instance and are gated behind the
// "integration" build tag so they never run in normal `go test ./...` passes.
//
// Run them with (BROWSERLESS_TOKEN may be omitted if auth is disabled):
//
//	BROWSERLESS_URL=http://localhost:3000 \
//	BROWSERLESS_TOKEN=your-token \
//	go test -v -tags integration -timeout 120s \
//	    github.com/libnovel/scraper/internal/browser
package browser_test

import (
	"context"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/libnovel/scraper/internal/browser"
)

// chapterURL is the novelfire chapter page requested by the integration tests
// in this file.
const chapterURL = "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1"

// newIntegrationClient builds the browser client used by the integration
// tests from environment variables: BROWSERLESS_URL supplies the base URL and
// BROWSERLESS_TOKEN the (possibly empty) token.
//
// When BROWSERLESS_URL is unset or empty the calling test is skipped.
func newIntegrationClient(t *testing.T) browser.BrowserClient {
	t.Helper()

	endpoint := os.Getenv("BROWSERLESS_URL")
	if endpoint == "" {
		t.Skip("BROWSERLESS_URL not set — skipping integration test")
	}

	cfg := browser.Config{
		BaseURL: endpoint,
		Token:   os.Getenv("BROWSERLESS_TOKEN"),
		// Generous per-request HTTP timeout so that a long wait-for-selector
		// is not cut off at the transport layer.
		Timeout:       120 * time.Second,
		MaxConcurrent: 1,
	}
	return browser.NewContentClient(cfg)
}

// TestIntegration_ChapterContent_ReturnsHTML fetches chapterURL through
// GetContent while waiting for the #content selector, then checks that the
// returned HTML contains an id="content" attribute followed by at least one
// real <p> opening tag.
//
// The test is skipped by newIntegrationClient when BROWSERLESS_URL is not set.
func TestIntegration_ChapterContent_ReturnsHTML(t *testing.T) {
	client := newIntegrationClient(t)

	// Overall deadline for the request, independent of the client's own
	// HTTP timeout.
	ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
	defer cancel()

	req := browser.ContentRequest{
		URL: chapterURL,
		WaitFor: &browser.WaitForSelector{
			Selector: "#content",
			Timeout:  5000, // presumably milliseconds — confirm against browser.WaitForSelector
		},
		RejectResourceTypes: productionRejectTypes(),
	}

	html, err := client.GetContent(ctx, req)
	if err != nil {
		t.Fatalf("GetContent failed: %v", err)
	}

	// Locate the attribute that marks the chapter container.
	contentIdx := strings.Index(html, `id="content"`)
	if contentIdx == -1 {
		t.Fatalf("id=\"content\" not found in response (%d bytes)", len(html))
	}

	// Paragraph tags after the marker indicate that chapter text was rendered.
	// The search covers everything after the marker, not only the #content
	// element, so this is an indicator rather than a proof. A plain "<p"
	// substring match would also accept <pre>, <path>, <picture>, ... so a
	// stricter tag check is used.
	afterContent := html[contentIdx:]
	if !hasParagraphTag(afterContent) {
		t.Errorf("#content section contains no <p> tags; JS rendering may have failed.\nSection preview:\n%s",
			truncate(afterContent, 1000))
	}

	t.Logf("chapter content section starts at byte %d (total response: %d bytes)", contentIdx, len(html))
}

// hasParagraphTag reports whether s contains an opening <p> tag, i.e. "<p"
// immediately followed by '>', '/', or ASCII whitespace. Tags that merely
// start with the letter p (such as <pre> or <path>) do not count. The match
// is case-sensitive.
func hasParagraphTag(s string) bool {
	for {
		i := strings.Index(s, "<p")
		if i == -1 {
			return false
		}
		// Continue scanning just past this candidate.
		s = s[i+len("<p"):]
		if s == "" {
			return false
		}
		switch s[0] {
		case '>', '/', ' ', '\t', '\n', '\r':
			return true
		}
	}
}

// TestIntegration_ChapterContent_TimeoutSurfacedCorrectly requests chapterURL
// with a deliberately short wait-for-selector timeout and expects GetContent
// to fail with an error whose text mentions "500".
//
// NOTE(review): the original intent was to confirm that real Browserless
// timeout output matches the error classification in retryGetContent; only
// the "500" substring is asserted here, not "TimeoutError" itself.
//
// The test is skipped by newIntegrationClient when BROWSERLESS_URL is not set.
func TestIntegration_ChapterContent_TimeoutSurfacedCorrectly(t *testing.T) {
	bl := newIntegrationClient(t)

	deadlineCtx, stop := context.WithTimeout(context.Background(), 40*time.Second)
	defer stop()

	// Intentionally too short (500 ms) so that Browserless gives up waiting.
	shortWait := browser.WaitForSelector{
		Selector: "#content",
		Timeout:  500,
	}

	_, err := bl.GetContent(deadlineCtx, browser.ContentRequest{
		URL:                 chapterURL,
		WaitFor:             &shortWait,
		RejectResourceTypes: productionRejectTypes(),
	})
	if err == nil {
		t.Fatal("expected a timeout error from Browserless, but GetContent succeeded — the page may now load very fast; adjust the timeout threshold")
	}

	t.Logf("got expected error: %v", err)

	// Browserless is reported to answer navigation timeouts with a 500
	// response carrying "TimeoutError: Navigation timeout" in the body; the
	// status code is the part checked below.
	if msg := err.Error(); !strings.Contains(msg, "500") {
		t.Errorf("expected HTTP 500 status in error, got: %v", err)
	}
}

// ── helpers ───────────────────────────────────────────────────────────────────

// productionRejectTypes returns a freshly allocated list of the resource
// types the integration tests pass as RejectResourceTypes. It is meant to
// mirror the block-list of the production novelfire scraper so the tests send
// the same request shape.
func productionRejectTypes() []string {
	blocked := [...]string{
		"cspviolationreport", "eventsource", "fedcm",
		"font", "image", "manifest", "media",
		"other", "ping", "signedexchange",
		"stylesheet", "texttrack", "websocket",
	}
	return blocked[:]
}

// truncate returns s unchanged when it is at most n bytes long; otherwise it
// returns a prefix of at most n bytes followed by "…".
//
// The cut point is moved back to the nearest rune boundary so a multi-byte
// UTF-8 character is never split, which would put invalid UTF-8 into the test
// output. A negative n is treated as 0 instead of panicking on the slice
// expression.
func truncate(s string, n int) string {
	if n < 0 {
		n = 0
	}
	if len(s) <= n {
		return s
	}

	// Ranging over a string yields the byte index at which each rune starts;
	// keep the last such index that does not exceed n.
	cut := 0
	for i := range s {
		if i > n {
			break
		}
		cut = i
	}
	return s[:cut] + "…"
}