- Add Kokoro-FastAPI TTS integration to the chapter reader UI: - Browser-side MSE streaming with paragraph-level click-to-start - Voice selector, speed slider, auto-next with prefetch of the next chapter - New GET /ui/chapter-text endpoint that strips Markdown and serves plain text - Add ranking page (novelfire /ranking scraper, WriteRanking/ReadRankingItems in writer, GET /ranking + POST /ranking/refresh + GET /ranking/view routes) with local-library annotation and one-click scrape buttons - Add StrategyDirect (plain HTTP client) as a new browser strategy; the default strategy is now 'direct' for chapter fetching and 'content' for chapter-list URL retrieval (split via BROWSERLESS_URL_STRATEGY) - Fix chapter numbering bug: numbers are now derived from the URL path (/chapter-N) rather than list position, correcting newest-first ordering - Add 'refresh <slug>' CLI sub-command to re-scrape a book from its saved source_url without knowing the original URL - Extend NovelScraper interface with RankingProvider (ScrapeRanking) - Tune scraper timeouts: wait-for-selector reduced to 5 s, GotoOptions timeout set to 60 s, content/scrape client defaults raised to 90 s - Add cover extraction fix (figure.cover > img rather than bare img.cover) - Add AGENTS.md and .aiignore for AI tooling context - Add integration tests for browser client and novelfire scraper (build tag: integration) and unit tests for chapterNumberFromURL and pagination
171 lines
4.9 KiB
Go
171 lines
4.9 KiB
Go
// Package server exposes the scraper as an HTTP service.
|
|
//
|
|
// Endpoints:
|
|
//
|
|
// POST /scrape — enqueue a full catalogue scrape
|
|
// POST /scrape/book — enqueue a single-book scrape (JSON body: {"url":"..."})
|
|
// GET /health — liveness probe
|
|
package server
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log/slog"
|
|
"net/http"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/libnovel/scraper/internal/orchestrator"
|
|
"github.com/libnovel/scraper/internal/scraper"
|
|
"github.com/libnovel/scraper/internal/writer"
|
|
)
|
|
|
|
// Server wraps an HTTP mux with the scraping endpoints.
type Server struct {
	addr  string              // listen address the HTTP server binds to
	oCfg  orchestrator.Config // base config; handlers copy it per scrape job
	novel scraper.NovelScraper
	log   *slog.Logger

	// writer persists and reads scraped content under oCfg.StaticRoot.
	writer *writer.Writer

	// mu guards running: only one scrape job may run at a time;
	// concurrent requests are rejected with 409 Conflict.
	mu      sync.Mutex
	running bool

	kokoroURL   string // Kokoro-FastAPI base URL, e.g. http://kokoro:8880
	kokoroVoice string // default voice, e.g. af_bella
}
|
|
|
|
// New creates a new Server.
|
|
func New(addr string, oCfg orchestrator.Config, novel scraper.NovelScraper, log *slog.Logger, kokoroURL, kokoroVoice string) *Server {
|
|
return &Server{
|
|
addr: addr,
|
|
oCfg: oCfg,
|
|
novel: novel,
|
|
log: log,
|
|
writer: writer.New(oCfg.StaticRoot),
|
|
kokoroURL: kokoroURL,
|
|
kokoroVoice: kokoroVoice,
|
|
}
|
|
}
|
|
|
|
// ListenAndServe starts the HTTP server and blocks until the provided context
|
|
// is cancelled.
|
|
func (s *Server) ListenAndServe(ctx context.Context) error {
|
|
mux := http.NewServeMux()
|
|
mux.HandleFunc("GET /health", s.handleHealth)
|
|
mux.HandleFunc("POST /scrape", s.handleScrapeCatalogue)
|
|
mux.HandleFunc("POST /scrape/book", s.handleScrapeBook)
|
|
// UI routes
|
|
mux.HandleFunc("GET /", s.handleHome)
|
|
mux.HandleFunc("GET /ranking", s.handleRanking)
|
|
mux.HandleFunc("POST /ranking/refresh", s.handleRankingRefresh)
|
|
mux.HandleFunc("GET /ranking/view", s.handleRankingView)
|
|
mux.HandleFunc("GET /books/{slug}", s.handleBook)
|
|
mux.HandleFunc("GET /books/{slug}/chapters/{n}", s.handleChapter)
|
|
mux.HandleFunc("POST /ui/scrape/book", s.handleUIScrapeBook)
|
|
mux.HandleFunc("GET /ui/scrape/status", s.handleUIScrapeStatus)
|
|
// Plain-text chapter content for browser-side TTS
|
|
mux.HandleFunc("GET /ui/chapter-text/{slug}/{n}", s.handleChapterText)
|
|
|
|
srv := &http.Server{
|
|
Addr: s.addr,
|
|
Handler: mux,
|
|
ReadTimeout: 15 * time.Second,
|
|
WriteTimeout: 60 * time.Second,
|
|
IdleTimeout: 60 * time.Second,
|
|
}
|
|
|
|
errCh := make(chan error, 1)
|
|
go func() { errCh <- srv.ListenAndServe() }()
|
|
|
|
s.log.Info("HTTP server listening", "addr", s.addr)
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
shutCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
return srv.Shutdown(shutCtx)
|
|
case err := <-errCh:
|
|
return err
|
|
}
|
|
}
|
|
|
|
func (s *Server) handleHealth(w http.ResponseWriter, _ *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_ = json.NewEncoder(w).Encode(map[string]string{"status": "ok"})
|
|
}
|
|
|
|
// handleChapterText returns the plain text of a chapter (markdown stripped)
|
|
// for browser-side TTS. The browser POSTs this directly to Kokoro-FastAPI.
|
|
func (s *Server) handleChapterText(w http.ResponseWriter, r *http.Request) {
|
|
slug := r.PathValue("slug")
|
|
n, err := strconv.Atoi(r.PathValue("n"))
|
|
if err != nil || n < 1 {
|
|
http.NotFound(w, r)
|
|
return
|
|
}
|
|
raw, err := s.writer.ReadChapter(slug, n)
|
|
if err != nil {
|
|
http.NotFound(w, r)
|
|
return
|
|
}
|
|
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
|
|
w.Header().Set("Cache-Control", "no-store")
|
|
fmt.Fprint(w, stripMarkdown(raw))
|
|
}
|
|
|
|
func (s *Server) handleScrapeCatalogue(w http.ResponseWriter, r *http.Request) {
|
|
cfg := s.oCfg
|
|
cfg.SingleBookURL = "" // full catalogue
|
|
|
|
s.runAsync(w, cfg)
|
|
}
|
|
|
|
func (s *Server) handleScrapeBook(w http.ResponseWriter, r *http.Request) {
|
|
var body struct {
|
|
URL string `json:"url"`
|
|
}
|
|
if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.URL == "" {
|
|
http.Error(w, `{"error":"request body must be JSON with \"url\" field"}`, http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
cfg := s.oCfg
|
|
cfg.SingleBookURL = body.URL
|
|
|
|
s.runAsync(w, cfg)
|
|
}
|
|
|
|
// runAsync launches an orchestrator in the background and returns 202 Accepted.
|
|
// Only one scrape job runs at a time; concurrent requests receive 409 Conflict.
|
|
func (s *Server) runAsync(w http.ResponseWriter, cfg orchestrator.Config) {
|
|
s.mu.Lock()
|
|
if s.running {
|
|
s.mu.Unlock()
|
|
http.Error(w, `{"error":"a scrape job is already running"}`, http.StatusConflict)
|
|
return
|
|
}
|
|
s.running = true
|
|
s.mu.Unlock()
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(http.StatusAccepted)
|
|
_ = json.NewEncoder(w).Encode(map[string]string{"status": "accepted"})
|
|
|
|
go func() {
|
|
defer func() {
|
|
s.mu.Lock()
|
|
s.running = false
|
|
s.mu.Unlock()
|
|
}()
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 24*time.Hour)
|
|
defer cancel()
|
|
|
|
o := orchestrator.New(cfg, s.novel, s.log)
|
|
if err := o.Run(ctx); err != nil {
|
|
s.log.Error("scrape job failed", "err", fmt.Sprintf("%v", err))
|
|
}
|
|
}()
|
|
}
|