Files
libnovel/scraper/internal/browser/http.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Fix cover extraction (select figure.cover > img rather than bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

69 lines
1.9 KiB
Go

package browser
import (
"context"
"fmt"
"io"
"net/http"
"time"
)
// httpClient implements BrowserClient using a plain net/http client,
// with no headless browser involved. It supports GetContent only;
// ScrapePage and CDPSession return errors.
type httpClient struct {
	cfg  Config        // client configuration (timeout, concurrency limit)
	http *http.Client  // underlying HTTP client; Timeout taken from cfg
	sem  chan struct{} // concurrency semaphore sized by cfg.MaxConcurrent
}
// NewDirectHTTPClient returns a BrowserClient that fetches pages over
// plain HTTP rather than through a headless browser. A zero cfg.Timeout
// defaults to 30 seconds.
func NewDirectHTTPClient(cfg Config) BrowserClient {
	timeout := cfg.Timeout
	if timeout == 0 {
		timeout = 30 * time.Second
	}
	cfg.Timeout = timeout

	client := &httpClient{
		cfg:  cfg,
		http: &http.Client{Timeout: timeout},
		sem:  makeSem(cfg.MaxConcurrent),
	}
	return client
}
// Strategy reports the fetch strategy this client implements.
func (c *httpClient) Strategy() Strategy {
	return StrategyDirect
}
// GetContent fetches req.URL with a plain HTTP GET and returns the raw
// response body as a string. It blocks on the client's concurrency
// semaphore first, sends browser-like headers to reduce the chance of
// being served a bot-detection page, and treats any status other than
// 200 OK as an error.
func (c *httpClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
	if err := acquire(ctx, c.sem); err != nil {
		return "", fmt.Errorf("http: semaphore: %w", err)
	}
	defer release(c.sem)

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, req.URL, nil)
	if err != nil {
		return "", fmt.Errorf("http: build request: %w", err)
	}
	// Mimic a desktop Chrome browser so origin servers return the normal
	// HTML page rather than a bot challenge.
	httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
	httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	httpReq.Header.Set("Accept-Language", "en-US,en;q=0.5")

	resp, err := c.http.Do(httpReq)
	if err != nil {
		return "", fmt.Errorf("http: do request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Include at most 2 KiB of the body in the error so a large error
		// page cannot balloon the error message (the previous code read
		// the entire body unbounded).
		const maxErrBody = 2 << 10
		b, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrBody))
		return "", fmt.Errorf("http: unexpected status %d: %s", resp.StatusCode, b)
	}

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("http: read body: %w", err)
	}
	return string(raw), nil
}
// ScrapePage is not supported by the direct HTTP client; structured
// scraping requires the browserless backend.
func (c *httpClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeResponse, error) {
	var zero ScrapeResponse
	return zero, fmt.Errorf("http client does not support ScrapePage; use browserless")
}
// CDPSession is not supported by the direct HTTP client; Chrome DevTools
// Protocol sessions require the browserless backend.
func (c *httpClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
	err := fmt.Errorf("http client does not support CDP; use browserless")
	return err
}