- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text
- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes) with local-library annotation and one-click scrape buttons
- Add StrategyDirect (plain HTTP client) as a new browser strategy; the default strategy is now 'direct' for chapter fetching and 'content' for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)
- Fix chapter numbering bug: numbers are now derived from the URL path (/chapter-N) rather than list position, correcting newest-first ordering
- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved source_url without knowing the original URL
- Extend NovelScraper interface with RankingProvider (ScrapeRanking)
- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions timeout set to 60 s, content/scrape client defaults raised to 90 s
- Fix cover extraction (figure.cover > img rather than bare img.cover)
- Add AGENTS.md and .aiignore for AI tooling context
- Add integration tests for browser client and novelfire scraper (build tag: integration) and unit tests for chapterNumberFromURL and pagination
196 lines
5.5 KiB
Go
package browser
|
|
|
|
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"time"
)
|
|
|
|
// Config holds the connection parameters for a Browserless instance.
type Config struct {
	// BaseURL is the HTTP base URL, e.g. "http://localhost:3000".
	BaseURL string

	// Token is the optional API token (BROWSERLESS_TOKEN env var).
	// When set, clients append it to request URLs as a ?token= query parameter.
	Token string

	// Timeout is the per-request HTTP timeout. The constructors
	// (NewContentClient, NewScrapeClient) default it to 90 s when zero.
	Timeout time.Duration

	// MaxConcurrent caps the number of simultaneous in-flight requests sent to
	// Browserless. When all slots are occupied new calls block until one
	// completes (or ctx is cancelled). 0 means no limit.
	MaxConcurrent int
}
|
|
|
|
// makeSem builds the counting semaphore that backs a client's concurrency
// cap. A non-positive n yields a nil channel, which acquire and release
// both treat as "unlimited" (no-ops).
func makeSem(n int) chan struct{} {
	var sem chan struct{}
	if n > 0 {
		sem = make(chan struct{}, n)
	}
	return sem
}
|
|
|
|
// acquire takes one slot from sem. It returns an error if ctx is cancelled
|
|
// before a slot becomes available. If sem is nil it returns immediately.
|
|
func acquire(ctx context.Context, sem chan struct{}) error {
|
|
if sem == nil {
|
|
return nil
|
|
}
|
|
select {
|
|
case sem <- struct{}{}:
|
|
return nil
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
}
|
|
}
|
|
|
|
// release returns the slot taken by a successful acquire. With a nil sem
// (no concurrency limit) it does nothing.
func release(sem chan struct{}) {
	if sem == nil {
		return
	}
	<-sem
}
|
|
|
|
// contentClient implements BrowserClient using the /content endpoint.
type contentClient struct {
	cfg  Config        // connection parameters (base URL, token, timeout, concurrency cap)
	http *http.Client  // reused across requests; Timeout copied from cfg.Timeout
	sem  chan struct{} // counting semaphore; nil when cfg.MaxConcurrent <= 0 (no limit)
}
|
|
|
|
// NewContentClient returns a BrowserClient that uses POST /content.
|
|
func NewContentClient(cfg Config) BrowserClient {
|
|
if cfg.Timeout == 0 {
|
|
cfg.Timeout = 90 * time.Second
|
|
}
|
|
return &contentClient{
|
|
cfg: cfg,
|
|
http: &http.Client{Timeout: cfg.Timeout},
|
|
sem: makeSem(cfg.MaxConcurrent),
|
|
}
|
|
}
|
|
|
|
// Strategy reports the fetch strategy this client implements (StrategyContent).
func (c *contentClient) Strategy() Strategy { return StrategyContent }
|
|
|
|
func (c *contentClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
|
|
if err := acquire(ctx, c.sem); err != nil {
|
|
return "", fmt.Errorf("content: semaphore: %w", err)
|
|
}
|
|
defer release(c.sem)
|
|
|
|
body, err := json.Marshal(req)
|
|
if err != nil {
|
|
return "", fmt.Errorf("content: marshal request: %w", err)
|
|
}
|
|
|
|
url := c.cfg.BaseURL + "/content"
|
|
if c.cfg.Token != "" {
|
|
url += "?token=" + c.cfg.Token
|
|
}
|
|
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
|
if err != nil {
|
|
return "", fmt.Errorf("content: build request: %w", err)
|
|
}
|
|
httpReq.Header.Set("Content-Type", "application/json")
|
|
|
|
resp, err := c.http.Do(httpReq)
|
|
if err != nil {
|
|
return "", fmt.Errorf("content: do request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
b, _ := io.ReadAll(resp.Body)
|
|
return "", fmt.Errorf("content: unexpected status %d: %s", resp.StatusCode, b)
|
|
}
|
|
|
|
raw, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", fmt.Errorf("content: read body: %w", err)
|
|
}
|
|
return string(raw), nil
|
|
}
|
|
|
|
func (c *contentClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeResponse, error) {
|
|
return ScrapeResponse{}, fmt.Errorf("content client does not support /scrape; use NewScrapeClient")
|
|
}
|
|
|
|
func (c *contentClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
|
|
return fmt.Errorf("content client does not support CDP; use NewCDPClient")
|
|
}
|
|
|
|
// ─── /scrape client ───────────────────────────────────────────────────────────

// scrapeClient implements BrowserClient using the /scrape endpoint.
type scrapeClient struct {
	cfg  Config        // connection parameters (base URL, token, timeout, concurrency cap)
	http *http.Client  // reused across requests; Timeout copied from cfg.Timeout
	sem  chan struct{} // counting semaphore; nil when cfg.MaxConcurrent <= 0 (no limit)
}
|
|
|
|
// NewScrapeClient returns a BrowserClient that uses POST /scrape.
|
|
func NewScrapeClient(cfg Config) BrowserClient {
|
|
if cfg.Timeout == 0 {
|
|
cfg.Timeout = 90 * time.Second
|
|
}
|
|
return &scrapeClient{
|
|
cfg: cfg,
|
|
http: &http.Client{Timeout: cfg.Timeout},
|
|
sem: makeSem(cfg.MaxConcurrent),
|
|
}
|
|
}
|
|
|
|
// Strategy reports the fetch strategy this client implements (StrategyScrape).
func (c *scrapeClient) Strategy() Strategy { return StrategyScrape }
|
|
|
|
func (c *scrapeClient) GetContent(_ context.Context, _ ContentRequest) (string, error) {
|
|
return "", fmt.Errorf("scrape client does not support /content; use NewContentClient")
|
|
}
|
|
|
|
func (c *scrapeClient) ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error) {
|
|
if err := acquire(ctx, c.sem); err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: semaphore: %w", err)
|
|
}
|
|
defer release(c.sem)
|
|
|
|
body, err := json.Marshal(req)
|
|
if err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: marshal request: %w", err)
|
|
}
|
|
|
|
url := c.cfg.BaseURL + "/scrape"
|
|
if c.cfg.Token != "" {
|
|
url += "?token=" + c.cfg.Token
|
|
}
|
|
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
|
if err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: build request: %w", err)
|
|
}
|
|
httpReq.Header.Set("Content-Type", "application/json")
|
|
|
|
resp, err := c.http.Do(httpReq)
|
|
if err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: do request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
b, _ := io.ReadAll(resp.Body)
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: unexpected status %d: %s", resp.StatusCode, b)
|
|
}
|
|
|
|
var result ScrapeResponse
|
|
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("scrape: decode response: %w", err)
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func (c *scrapeClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
|
|
return fmt.Errorf("scrape client does not support CDP; use NewCDPClient")
|
|
}
|