Files
libnovel/scraper/internal/browser/integration_test.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Add cover extraction fix (figure.cover > img rather than bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

153 lines
4.9 KiB
Go

//go:build integration
// Integration tests for the Browserless /content API.
//
// These tests require a live Browserless instance and are gated behind the
// "integration" build tag so they never run in normal `go test ./...` passes.
//
// Run them with (drop the BROWSERLESS_TOKEN line if auth is disabled):
//
//	BROWSERLESS_URL=http://localhost:3000 \
//	BROWSERLESS_TOKEN=your-token \
//	go test -v -tags integration -timeout 120s \
//	  github.com/libnovel/scraper/internal/browser
package browser_test
import (
"context"
"os"
"strings"
"testing"
"time"
"github.com/libnovel/scraper/internal/browser"
)
// chapterURL points at the novelfire chapter page that every integration
// sub-test in this file requests.
const (
	chapterURL = "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1"
)
// newIntegrationClient builds the Browserless content client used by the
// integration tests in this file.
//
// The endpoint comes from the BROWSERLESS_URL environment variable and the
// optional auth token from BROWSERLESS_TOKEN. When BROWSERLESS_URL is empty
// or unset the calling test is skipped instead of failing.
func newIntegrationClient(t *testing.T) browser.BrowserClient {
	t.Helper()

	endpoint := os.Getenv("BROWSERLESS_URL")
	if len(endpoint) == 0 {
		t.Skip("BROWSERLESS_URL not set — skipping integration test")
	}

	cfg := browser.Config{
		BaseURL: endpoint,
		Token:   os.Getenv("BROWSERLESS_TOKEN"),
		// Keep the per-request HTTP timeout generous so a slow render or
		// selector wait on the Browserless side is not cut off by the
		// transport layer before Browserless itself answers.
		Timeout:       120 * time.Second,
		MaxConcurrent: 1,
	}
	return browser.NewContentClient(cfg)
}
// TestIntegration_ChapterContent_ReturnsHTML fetches chapterURL through
// GetContent, waiting up to 5000 ms for the "#content" selector, and checks
// that the returned HTML contains an id="content" attribute followed
// somewhere by a "<p" tag opener.
func TestIntegration_ChapterContent_ReturnsHTML(t *testing.T) {
	// Attribute text that marks the chapter container in the rendered page.
	const marker = `id="content"`

	bc := newIntegrationClient(t)

	ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
	defer cancel()

	page, err := bc.GetContent(ctx, browser.ContentRequest{
		URL: chapterURL,
		WaitFor: &browser.WaitForSelector{
			Selector: "#content",
			Timeout:  5000,
		},
		RejectResourceTypes: productionRejectTypes(),
	})
	if err != nil {
		t.Fatalf("GetContent failed: %v", err)
	}

	// Without the marker there is nothing further to inspect.
	start := strings.Index(page, marker)
	if start < 0 {
		t.Fatalf("id=\"content\" not found in response (%d bytes)", len(page))
	}

	// Chapter paragraphs are expected after the marker; a missing "<p" is
	// taken as a sign that the chapter text was never rendered.
	tail := page[start:]
	if hasParagraph := strings.Contains(tail, "<p"); !hasParagraph {
		t.Errorf("#content section contains no <p> tags; JS rendering may have failed.\nSection preview:\n%s",
			truncate(tail, 1000))
	}

	t.Logf("chapter content section starts at byte %d (total response: %d bytes)", start, len(page))
}
// TestIntegration_ChapterContent_TimeoutSurfacedCorrectly verifies that a
// deliberately too-short wait-for-selector timeout makes GetContent fail and
// that the resulting error mentions HTTP status 500, the status seen in the
// failing production log entry that triggers retries.
//
// The wait timeout is 250 ms rather than 500 ms on purpose: the status check
// below is a plain substring match on "500", so a timeout message that echoes
// a 500 ms duration could satisfy it regardless of the actual status code.
// NOTE(review): whether the client error embeds the Browserless response body
// is not visible from this file — confirm against a live instance.
func TestIntegration_ChapterContent_TimeoutSurfacedCorrectly(t *testing.T) {
	client := newIntegrationClient(t)

	ctx, cancel := context.WithTimeout(context.Background(), 40*time.Second)
	defer cancel()

	req := browser.ContentRequest{
		URL: chapterURL,
		WaitFor: &browser.WaitForSelector{
			Selector: "#content",
			// Intentionally too short (milliseconds) so Browserless times
			// out, and chosen so its digits cannot be mistaken for "500".
			Timeout: 250,
		},
		RejectResourceTypes: productionRejectTypes(),
	}

	_, err := client.GetContent(ctx, req)
	if err == nil {
		t.Fatal("expected a timeout error from Browserless, but GetContent succeeded — " +
			"the page may now load very fast; adjust the timeout threshold")
	}
	t.Logf("got expected error: %v", err)

	// Per the production log, Browserless answers such timeouts with a 500
	// response whose body carries a "TimeoutError: ..." message; only the
	// status code is asserted here.
	if !strings.Contains(err.Error(), "500") {
		t.Errorf("expected HTTP 500 status in error, got: %v", err)
	}
}
// ── helpers ───────────────────────────────────────────────────────────────────

// productionRejectTypes lists the resource types the integration requests ask
// Browserless to reject. It is meant to mirror the block-list of the
// production novelfire scraper so the tests send the same request shape.
// Every call returns a newly allocated slice, so callers may modify it.
func productionRejectTypes() []string {
	blocked := [...]string{
		"cspviolationreport",
		"eventsource",
		"fedcm",
		"font",
		"image",
		"manifest",
		"media",
		"other",
		"ping",
		"signedexchange",
		"stylesheet",
		"texttrack",
		"websocket",
	}
	return blocked[:]
}
// truncate shortens s to at most n bytes for use in log previews.
//
// A string that already fits is returned unchanged. Otherwise the result is
// the longest prefix of s that is no longer than n bytes and ends on a UTF-8
// rune boundary, followed by "…". Cutting on a rune boundary keeps a
// multi-byte character from being split in half. A negative n is treated as
// 0 rather than panicking on the slice expression.
func truncate(s string, n int) string {
	if n < 0 {
		n = 0
	}
	if len(s) <= n {
		return s
	}
	// Ranging over a string yields the byte offset at which each rune starts,
	// so the last offset that is <= n is the safe place to cut.
	cut := 0
	for i := range s {
		if i > n {
			break
		}
		cut = i
	}
	return s[:cut] + "…"
}