Files
libnovel/scraper/internal/server/server.go
Admin 7879a51fe3 feat: add Kokoro TTS, ranking page, direct HTTP strategy, and chapter-number fix
- Add Kokoro-FastAPI TTS integration to the chapter reader UI:
  - Browser-side MSE streaming with paragraph-level click-to-start
  - Voice selector, speed slider, auto-next with prefetch of the next chapter
  - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text

- Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems
  in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes)
  with local-library annotation and one-click scrape buttons

- Add StrategyDirect (plain HTTP client) as a new browser strategy; the
  default strategy is now 'direct' for chapter fetching and 'content'
  for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY)

- Fix chapter numbering bug: numbers are now derived from the URL path
  (/chapter-N) rather than list position, correcting newest-first ordering

- Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved
  source_url without knowing the original URL

- Extend NovelScraper interface with RankingProvider (ScrapeRanking)

- Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions
  timeout set to 60 s, content/scrape client defaults raised to 90 s

- Add cover extraction fix (figure.cover > img rather than bare img.cover)

- Add AGENTS.md and .aiignore for AI tooling context

- Add integration tests for browser client and novelfire scraper (build
  tag: integration) and unit tests for chapterNumberFromURL and pagination
2026-03-01 12:25:16 +05:00

171 lines
4.9 KiB
Go

// Package server exposes the scraper as an HTTP service.
//
// API endpoints:
//
//	POST /scrape      — enqueue a full catalogue scrape
//	POST /scrape/book — enqueue a single-book scrape (JSON body: {"url":"..."})
//	GET  /health      — liveness probe
//
// It also serves the browser UI (home, book, chapter, and ranking pages),
// scrape controls under /ui/..., and GET /ui/chapter-text/{slug}/{n}, which
// returns a chapter as plain text for browser-side TTS.
package server
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"net/http"
"strconv"
"sync"
"time"
"github.com/libnovel/scraper/internal/orchestrator"
"github.com/libnovel/scraper/internal/scraper"
"github.com/libnovel/scraper/internal/writer"
)
// Server wraps an HTTP mux with the scraping endpoints.
//
// A Server serializes scrape jobs: at most one runs at a time (mu guards
// running; see runAsync). The writer is rooted at oCfg.StaticRoot (see New).
type Server struct {
	addr  string               // listen address for the HTTP server
	oCfg  orchestrator.Config  // template config; copied per scrape request
	novel scraper.NovelScraper // site-specific scraping implementation
	log   *slog.Logger
	writer *writer.Writer // reads/writes scraped content under oCfg.StaticRoot

	mu      sync.Mutex // guards running
	running bool       // true while a scrape job is in flight

	kokoroURL   string // Kokoro-FastAPI base URL, e.g. http://kokoro:8880
	kokoroVoice string // default voice, e.g. af_bella
}
// New constructs a Server listening on addr. The content writer is rooted
// at the orchestrator config's StaticRoot; kokoroURL and kokoroVoice
// configure the browser-side TTS integration.
func New(addr string, oCfg orchestrator.Config, novel scraper.NovelScraper, log *slog.Logger, kokoroURL, kokoroVoice string) *Server {
	s := &Server{
		addr:        addr,
		oCfg:        oCfg,
		novel:       novel,
		log:         log,
		kokoroURL:   kokoroURL,
		kokoroVoice: kokoroVoice,
	}
	s.writer = writer.New(oCfg.StaticRoot)
	return s
}
// ListenAndServe starts the HTTP server and blocks until the provided context
// is cancelled, at which point the server is drained with a 10 s grace period,
// or until the listener fails.
func (s *Server) ListenAndServe(ctx context.Context) error {
	mux := http.NewServeMux()

	// Route table: API endpoints, then browser UI routes.
	for _, rt := range []struct {
		pattern string
		handler http.HandlerFunc
	}{
		{"GET /health", s.handleHealth},
		{"POST /scrape", s.handleScrapeCatalogue},
		{"POST /scrape/book", s.handleScrapeBook},
		// UI routes
		{"GET /", s.handleHome},
		{"GET /ranking", s.handleRanking},
		{"POST /ranking/refresh", s.handleRankingRefresh},
		{"GET /ranking/view", s.handleRankingView},
		{"GET /books/{slug}", s.handleBook},
		{"GET /books/{slug}/chapters/{n}", s.handleChapter},
		{"POST /ui/scrape/book", s.handleUIScrapeBook},
		{"GET /ui/scrape/status", s.handleUIScrapeStatus},
		// Plain-text chapter content for browser-side TTS
		{"GET /ui/chapter-text/{slug}/{n}", s.handleChapterText},
	} {
		mux.HandleFunc(rt.pattern, rt.handler)
	}

	srv := &http.Server{
		Addr:         s.addr,
		Handler:      mux,
		ReadTimeout:  15 * time.Second,
		WriteTimeout: 60 * time.Second,
		IdleTimeout:  60 * time.Second,
	}

	// Run the listener in the background so we can race it against ctx.
	serveErr := make(chan error, 1)
	go func() { serveErr <- srv.ListenAndServe() }()
	s.log.Info("HTTP server listening", "addr", s.addr)

	select {
	case <-ctx.Done():
		shutCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()
		return srv.Shutdown(shutCtx)
	case err := <-serveErr:
		return err
	}
}
// handleHealth implements the liveness probe: always responds 200 with
// the JSON body {"status":"ok"}.
func (s *Server) handleHealth(w http.ResponseWriter, _ *http.Request) {
	w.Header().Set("Content-Type", "application/json")
	payload := map[string]string{"status": "ok"}
	// Encoding a static one-key map cannot realistically fail.
	_ = json.NewEncoder(w).Encode(payload)
}
// handleChapterText returns the plain text of a chapter (markdown stripped)
// for browser-side TTS. The browser POSTs this directly to Kokoro-FastAPI.
// Responds 404 when the chapter number is malformed or the chapter is missing.
func (s *Server) handleChapterText(w http.ResponseWriter, r *http.Request) {
	chapter, err := strconv.Atoi(r.PathValue("n"))
	if err != nil || chapter < 1 {
		http.NotFound(w, r)
		return
	}

	slug := r.PathValue("slug")
	raw, readErr := s.writer.ReadChapter(slug, chapter)
	if readErr != nil {
		http.NotFound(w, r)
		return
	}

	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
	// Chapter text can change after a re-scrape; keep caches out of the way.
	w.Header().Set("Cache-Control", "no-store")
	fmt.Fprint(w, stripMarkdown(raw))
}
// handleScrapeCatalogue enqueues a full-catalogue scrape via runAsync.
func (s *Server) handleScrapeCatalogue(w http.ResponseWriter, _ *http.Request) {
	catalogueCfg := s.oCfg
	catalogueCfg.SingleBookURL = "" // an empty URL selects the full catalogue
	s.runAsync(w, catalogueCfg)
}
// handleScrapeBook enqueues a scrape of a single book.
//
// Request body: JSON {"url":"..."}. Responds 202 on acceptance, 400 when the
// body is malformed or the url field is empty, and 409 (via runAsync) when
// another scrape job is already running.
func (s *Server) handleScrapeBook(w http.ResponseWriter, r *http.Request) {
	var body struct {
		URL string `json:"url"`
	}
	if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.URL == "" {
		// The error payload is JSON, so label it as such; http.Error would
		// have sent it with Content-Type: text/plain.
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusBadRequest)
		_ = json.NewEncoder(w).Encode(map[string]string{
			"error": `request body must be JSON with "url" field`,
		})
		return
	}
	cfg := s.oCfg
	cfg.SingleBookURL = body.URL
	s.runAsync(w, cfg)
}
// runAsync launches an orchestrator in the background and returns 202 Accepted.
// Only one scrape job runs at a time; concurrent requests receive 409 Conflict.
// The background job is bounded by a 24 h timeout so a wedged scrape cannot
// hold the single job slot forever.
func (s *Server) runAsync(w http.ResponseWriter, cfg orchestrator.Config) {
	s.mu.Lock()
	if s.running {
		s.mu.Unlock()
		// The error payload is JSON, so label it as such; http.Error would
		// have sent it with Content-Type: text/plain.
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusConflict)
		_ = json.NewEncoder(w).Encode(map[string]string{
			"error": "a scrape job is already running",
		})
		return
	}
	s.running = true
	s.mu.Unlock()

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusAccepted)
	_ = json.NewEncoder(w).Encode(map[string]string{"status": "accepted"})

	go func() {
		// Release the job slot whatever happens to the orchestrator.
		defer func() {
			s.mu.Lock()
			s.running = false
			s.mu.Unlock()
		}()
		ctx, cancel := context.WithTimeout(context.Background(), 24*time.Hour)
		defer cancel()
		o := orchestrator.New(cfg, s.novel, s.log)
		if err := o.Run(ctx); err != nil {
			// slog renders error values directly; no Sprintf round-trip needed.
			s.log.Error("scrape job failed", "err", err)
		}
	}()
}