Compare commits

...

2 Commits

Author SHA1 Message Date
Admin
801928aadf fix(scraper): update status and genres selectors for current novelfire.net HTML
Some checks failed
CI / Backend (push) Failing after 11s
CI / UI (push) Successful in 48s
Release / Test backend (push) Successful in 50s
Release / Check ui (push) Successful in 55s
CI / UI (pull_request) Successful in 37s
Release / Docker / caddy (push) Successful in 47s
CI / Backend (pull_request) Successful in 46s
Release / Docker / runner (push) Successful in 2m37s
Release / Docker / ui (push) Successful in 2m42s
Release / Docker / backend (push) Successful in 3m13s
Release / Gitea Release (push) Failing after 2s
novelfire.net changed its book page structure. Old selectors produced empty
status and null genres for every book, causing all Meilisearch filters to
return zero results.

Old → new:
- status:  <span class="status">  →  <strong class="ongoing|completed|hiatus">
  (text lowercased for consistent index values)
- genres:  <div class="genres"> <a>  →  <div class="categories"> <a class="property-item">
  (text lowercased for consistent index values)

Adds TestParseMetadataSelectors to guard against future regressions.
2026-03-28 22:54:35 +05:00
Admin
040072c3f5 docs(d2): update architecture and api-routing diagrams to current state
All checks were successful
CI / Backend (pull_request) Successful in 33s
CI / UI (pull_request) Successful in 40s
architecture.d2:
- Split app into prod VPS (165.22.70.138) and homelab runner (192.168.0.109)
- Add CrowdSec, Dozzle agent, pocket-tts (voice samples)
- Valkey now shown as Asynq job queue in addition to presign cache
- Add caddy-l4 Redis TCP proxy (:6380) to Caddy label
- Add CI/CD node (Gitea Actions) with full job list incl. releases.json bake
- Remove runner from prod app group (it runs on homelab only)
- Watchtower: note runner is label-disabled on prod

api-routing.d2:
- Add /api/presign/* routes to backend (presign_be group)
- Add /api/audio POST + status GET to both sk and be
- Add /api/scrape/book and /api/scrape/book/range to scrape_sk
- Catalogue: annotate Meilisearch vs legacy browse
- Add Meilisearch filter/sort fields to storage node
- Add Asynq queue note to Valkey storage node
- Fix presign proxy: sk routes through be.presign_be, not directly to storage
2026-03-28 22:44:31 +05:00
6 changed files with 393 additions and 291 deletions

View File

@@ -178,12 +178,26 @@ func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (domain.Bo
}
}
status := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "status"})
// Status: novelfire renders <strong class="ongoing">Ongoing</strong> (or
// "completed", "hiatus") inside the .header-stats block. We take the text
// content and lowercase it so the index value is always canonical lowercase.
var status string
for _, cls := range []string{"ongoing", "completed", "hiatus"} {
if v := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "strong", Class: cls}); v != "" {
status = strings.ToLower(strings.TrimSpace(v))
break
}
}
genresNode := htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "genres"})
// Genres: novelfire renders <div class="categories"><ul><li><a class="property-item">Genre</a>
// Each <a class="property-item"> is one genre tag. Lowercase for index consistency.
var genres []string
if genresNode != nil {
genres = htmlutil.ExtractAll(genresNode, scraper.Selector{Tag: "a", Multiple: true})
if categoriesNode := htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "categories"}); categoriesNode != nil {
for _, v := range htmlutil.ExtractAll(categoriesNode, scraper.Selector{Tag: "a", Class: "property-item", Multiple: true}) {
if v != "" {
genres = append(genres, strings.ToLower(strings.TrimSpace(v)))
}
}
}
summary := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "div", Class: "summary"})

View File

@@ -2,6 +2,7 @@ package novelfire
import (
"context"
"log/slog"
"testing"
)
@@ -100,6 +101,56 @@ func TestRetryGet_EventualSuccess(t *testing.T) {
}
}
// TestParseMetadataSelectors verifies that the status and genres selectors
// match the current novelfire.net HTML structure.
func TestParseMetadataSelectors(t *testing.T) {
// Minimal HTML reproducing the relevant novelfire.net book page structure.
const html = `<!DOCTYPE html>
<html><body>
<h1 class="novel-title">Shadow Slave</h1>
<span class="author">Guiltythree</span>
<figure class="cover"><img src="https://cdn.example.com/cover.jpg"></figure>
<div class="header-stats">
<span><strong>123</strong><small>Chapters</small></span>
<span> <strong class="ongoing">Ongoing</strong> <small>Status</small></span>
</div>
<div class="categories">
<h4>Genres</h4>
<ul>
<li><a href="/genre-fantasy/..." class="property-item">Fantasy</a></li>
<li><a href="/genre-action/..." class="property-item">Action</a></li>
<li><a href="/genre-adventure/..." class="property-item">Adventure</a></li>
</ul>
</div>
<span class="chapter-count">123 Chapters</span>
</body></html>`
stub := newStubClient()
stub.setFn("https://novelfire.net/book/shadow-slave", func() (string, error) {
return html, nil
})
s := &Scraper{client: stub, log: slog.Default()}
meta, err := s.ScrapeMetadata(t.Context(), "https://novelfire.net/book/shadow-slave")
if err != nil {
t.Fatalf("ScrapeMetadata: %v", err)
}
if meta.Status != "ongoing" {
t.Errorf("status = %q, want %q", meta.Status, "ongoing")
}
wantGenres := []string{"fantasy", "action", "adventure"}
if len(meta.Genres) != len(wantGenres) {
t.Fatalf("genres = %v, want %v", meta.Genres, wantGenres)
}
for i, g := range meta.Genres {
if g != wantGenres[i] {
t.Errorf("genres[%d] = %q, want %q", i, g, wantGenres[i])
}
}
}
// ── minimal stub client for tests ─────────────────────────────────────────────
type stubClient struct {

View File

@@ -35,11 +35,11 @@ client: Browser / iOS App {
caddy: Caddy :443 {
shape: rectangle
style.fill: "#f1f5f9"
label: "Caddy :443\ncustom build · caddy-ratelimit\nsecurity headers · rate limiting\nstatic error pages"
label: "Caddy :443\ncustom build · caddy-l4 · caddy-ratelimit\nCrowdSec bouncer · security headers\nrate limiting · static error pages\nRedis TCP proxy :6380"
}
# ─── SvelteKit UI ─────────────────────────────────────────────────────────────
# Handles: auth enforcement, session, all /api/* routes that have SK counterparts
# All routes here pass through SvelteKit — auth is enforced server-side.
sk: SvelteKit UI :3000 {
style.fill: "#fef3c7"
@@ -53,7 +53,7 @@ sk: SvelteKit UI :3000 {
catalogue_sk: Catalogue {
style.fill: "#f0fdf4"
style.stroke: "#86efac"
label: "GET /api/catalogue-page\nGET /api/search"
label: "GET /api/catalogue-page (infinite scroll)\nGET /api/search"
}
book_sk: Book {
@@ -65,7 +65,7 @@ sk: SvelteKit UI :3000 {
scrape_sk: Scrape (admin) {
style.fill: "#fff7ed"
style.stroke: "#fdba74"
label: "GET /api/scrape/status\nGET /api/scrape/tasks\nPOST /api/scrape\nPOST /api/scrape/range\nPOST /api/scrape/cancel/{id}"
label: "GET /api/scrape/status\nGET /api/scrape/tasks\nPOST /api/scrape\nPOST /api/scrape/book\nPOST /api/scrape/book/range\nPOST /api/scrape/cancel/{id}"
}
audio_sk: Audio {
@@ -74,7 +74,7 @@ sk: SvelteKit UI :3000 {
label: "POST /api/audio/{slug}/{n}\nGET /api/audio/status/{slug}/{n}\nGET /api/voices"
}
presign_sk: Presigned URLs {
presign_sk: Presigned URLs (public) {
style.fill: "#f0fdf4"
style.stroke: "#86efac"
label: "GET /api/presign/chapter/{slug}/{n}\nGET /api/presign/audio/{slug}/{n}\nGET /api/presign/voice-sample/{voice}"
@@ -106,12 +106,12 @@ sk: SvelteKit UI :3000 {
}
# ─── Go Backend ───────────────────────────────────────────────────────────────
# Caddy proxies these paths directly — no SvelteKit auth layer
# Caddy proxies these paths directly — bypasses SvelteKit entirely.
be: Backend API :8080 {
style.fill: "#eef3ff"
health_be: Health {
health_be: Health / Version {
style.fill: "#f0fdf4"
style.stroke: "#86efac"
label: "GET /health\nGET /api/version"
@@ -126,7 +126,7 @@ be: Backend API :8080 {
catalogue_be: Catalogue {
style.fill: "#f0fdf4"
style.stroke: "#86efac"
label: "GET /api/browse\nGET /api/catalogue\nGET /api/ranking\nGET /api/cover/{domain}/{slug}"
label: "GET /api/catalogue (Meilisearch)\nGET /api/browse (legacy MinIO cache)\nGET /api/ranking\nGET /api/cover/{domain}/{slug}"
}
book_be: Book / Chapter {
@@ -138,7 +138,13 @@ be: Backend API :8080 {
audio_be: Audio {
style.fill: "#f0fdf4"
style.stroke: "#86efac"
label: "GET /api/audio-proxy/{slug}/{n}\nGET /api/voices"
label: "POST /api/audio/{slug}/{n}\nGET /api/audio/status/{slug}/{n}\nGET /api/audio-proxy/{slug}/{n}\nGET /api/voices"
}
presign_be: Presigned URLs {
style.fill: "#f0fdf4"
style.stroke: "#86efac"
label: "GET /api/presign/chapter/{slug}/{n}\nGET /api/presign/audio/{slug}/{n}\nGET /api/presign/voice-sample/{voice}\nGET /api/presign/avatar-upload/{userId}\nGET /api/presign/avatar/{userId}"
}
}
@@ -149,19 +155,19 @@ storage: Storage {
pb: PocketBase :8090 {
shape: cylinder
label: "auth · books · progress\ncomments · library\nscrape_jobs · audio_cache"
label: "auth · books · progress\ncomments · library\nscrape_jobs · audio_cache\nranking"
}
mn: MinIO :9000 {
shape: cylinder
label: "chapters · audio\navatars · browse"
label: "chapters · audio\navatars · catalogue (browse)"
}
ms: Meilisearch :7700 {
shape: cylinder
label: "index: books"
label: "index: books\nfilterable: status · genres\nsortable: rank · rating\n total_chapters · meta_updated"
}
vk: Valkey :6379 {
shape: cylinder
label: "presign URL cache"
label: "presign URL cache (TTL ~55 min)\nAsynq job queue (runner)"
}
}
@@ -169,18 +175,17 @@ storage: Storage {
client -> caddy: HTTPS :443
caddy -> sk: "/* (catch-all)\n→ SvelteKit handles auth"
caddy -> be: "/health /scrape*\n/api/browse /api/book-preview/*\n/api/chapter-text/* /api/chapter-markdown/*\n/api/reindex/* /api/cover/*\n/api/audio-proxy/* /api/catalogue /api/ranking"
caddy -> storage.mn: "/avatars/*\n/audio/*\n/chapters/*\n(presigned MinIO GETs)"
caddy -> sk: "/* (catch-all)\n→ SvelteKit enforces auth"
caddy -> be: "/health /scrape*\n/api/browse /api/catalogue /api/ranking\n/api/version /api/book-preview/*\n/api/chapter-text/* /api/chapter-markdown/*\n/api/reindex/* /api/cover/*\n/api/audio* /api/voices /api/presign/*"
caddy -> storage.mn: "/avatars/* /audio/* /chapters/*\n(presigned MinIO GETs)"
# ─── SvelteKit → Backend (server-side proxy) ──────────────────────────────────
sk.catalogue_sk -> be.catalogue_be: internal proxy
sk.book_sk -> be.book_be: internal proxy
sk.audio_sk -> be.audio_be: internal proxy
sk.presign_sk -> storage.vk: check cache
sk.presign_sk -> storage.mn: generate presign
sk.presign_user -> storage.mn: generate presign
sk.presign_sk -> be.presign_be: internal proxy
sk.presign_user -> be.presign_be: internal proxy
# ─── SvelteKit → Storage (direct) ────────────────────────────────────────────
@@ -192,10 +197,12 @@ sk.comments_sk -> storage.pb
# ─── Backend → Storage ────────────────────────────────────────────────────────
be.catalogue_be -> storage.ms: full-text search
be.catalogue_be -> storage.ms: full-text search + facets
be.catalogue_be -> storage.pb: ranking records
be.catalogue_be -> storage.mn: cover presign
be.book_be -> storage.mn: chapter objects
be.book_be -> storage.pb: book metadata
be.audio_be -> storage.mn: audio presign
be.audio_be -> storage.vk: presign cache
be.presign_be -> storage.vk: check / set presign cache
be.presign_be -> storage.mn: generate presigned URL

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 57 KiB

After

Width:  |  Height:  |  Size: 60 KiB

View File

@@ -5,16 +5,25 @@ direction: right
novelfire: novelfire.net {
shape: cloud
style.fill: "#f0f4ff"
label: "novelfire.net\n(scrape source)"
}
kokoro: Kokoro-FastAPI TTS {
shape: cloud
style.fill: "#f0f4ff"
label: "Kokoro-FastAPI TTS\n(self-hosted · homelab)\nchapter audio"
}
pockettts: pocket-tts {
shape: cloud
style.fill: "#f0f4ff"
label: "pocket-tts\n(self-hosted · homelab)\nvoice sample MP3s"
}
letsencrypt: Let's Encrypt {
shape: cloud
style.fill: "#f0f4ff"
label: "Let's Encrypt\n(ACME TLS-ALPN-01)"
}
browser: Browser / iOS App {
@@ -30,12 +39,12 @@ init: Init containers {
minio-init: minio-init {
shape: rectangle
label: "minio-init\n(mc: create buckets)"
label: "minio-init\n(mc: create buckets\n chapters · audio\n avatars · catalogue)"
}
pb-init: pb-init {
shape: rectangle
label: "pb-init\n(bootstrap collections)"
label: "pb-init\n(bootstrap PocketBase\n collections + schema)"
}
}
@@ -46,109 +55,126 @@ storage: Storage {
minio: MinIO {
shape: cylinder
label: "MinIO :9000\n\nbuckets:\n chapters\n audio\n avatars\n catalogue"
label: "MinIO :9000\nbuckets:\n chapters · audio\n avatars · catalogue"
}
pocketbase: PocketBase {
shape: cylinder
label: "PocketBase :8090\n\ncollections:\n books chapters_idx\n audio_cache progress\n scrape_jobs app_users\n ranking"
label: "PocketBase :8090\ncollections:\n books · chapters_idx\n audio_cache · progress\n scrape_jobs · app_users\n ranking · library\n comments"
}
valkey: Valkey {
shape: cylinder
label: "Valkey :6379\n\n(presign URL cache\nTTL-based, shared)"
label: "Valkey :6379\npresign URL cache (TTL ~55 min)\nAsynq job queue (runner tasks)"
}
meilisearch: Meilisearch {
shape: cylinder
label: "Meilisearch :7700\n\nindices:\n books"
label: "Meilisearch :7700\nindex: books\n(filterable: status · genres\n sortable: rank · rating\n total_chapters · meta_updated)"
}
}
# ─── Application ──────────────────────────────────────────────────────────────
# ─── Application — prod VPS (165.22.70.138) ───────────────────────────────────
app: Application {
app: Application — prod (165.22.70.138) {
style.fill: "#eef3ff"
caddy: caddy {
shape: rectangle
label: "Caddy :443 / :80\ncustom build + caddy-ratelimit\n\nfeatures:\n auto-HTTPS (Let's Encrypt)\n security headers\n rate limiting (per-IP)\n static error pages (502/503/504)"
label: "Caddy :443 / :80 / :6380\ncustom build\n+ caddy-l4 (Redis TCP proxy)\n+ caddy-ratelimit\nauto-HTTPS · security headers\nrate limiting (per-IP)\nstatic error pages (404/502/503/504)\nCrowdSec bouncer"
}
backend: backend {
shape: rectangle
label: "Backend API :8080\n(GoHTTP API server)"
}
runner: runner {
shape: rectangle
label: "Runner :9091\n(Go — background worker\nscraping + TTS jobs\n/metrics endpoint)"
label: "Backend API :8080\n(Go)\nHTTP API server\nffmpeg (audio sample conv.)\nOpenTelemetry tracing\nSentry / GlitchTip errors"
}
ui: ui {
shape: rectangle
label: "SvelteKit UI :3000\n(adapter-node)"
label: "SvelteKit UI :3000\n(adapter-node)\nSSR · session auth\nserver-side API proxy"
}
crowdsec: CrowdSec {
shape: rectangle
label: "CrowdSec :8080\nsecurity engine\nreads Caddy JSON logs\nbouncer integrated in Caddy"
}
dozzle: Dozzle agent {
shape: rectangle
label: "Dozzle agent\n127.0.0.1:7007\nlog relay → homelab dashboard"
}
}
# ─── Runner — homelab (192.168.0.109) ────────────────────────────────────────
homelab: Runner — homelab (192.168.0.109) {
style.fill: "#fef9ec"
runner: runner {
shape: rectangle
label: "Runner :9091\n(Go background worker)\nscrape pipeline\nTTS audio job queue\nPrometheus /metrics\ncron: catalogue refresh\nAsynq worker → Valkey"
}
}
# ─── Ops ──────────────────────────────────────────────────────────────────────
ops: Ops {
style.fill: "#fef9ec"
style.fill: "#f5f5f5"
watchtower: Watchtower {
shape: rectangle
label: "Watchtower\n(containrrr/watchtower)\n\npolls every 5 min\nautopulls + redeploys:\n backend · runner · ui"
label: "Watchtower\n(containrrr/watchtower)\npolls Docker Hub every 5 min\nautopulls + redeploys:\n backend · ui\n(runner: label-disabled on prod)"
}
}
# ─── Init → Storage deps ──────────────────────────────────────────────────────
# ─── CI / CD ──────────────────────────────────────────────────────────────────
cicd: CI / CD {
style.fill: "#f0f9ff"
gitea: Gitea Actions {
shape: rectangle
label: "Gitea Actions\n(homelab runner)\ntag v* trigger:\n test-backend\n check-ui (type-check + build)\n docker-backend\n docker-runner\n docker-ui (bakes releases.json)\n docker-caddy\n → push Docker Hub\n → Gitea Release"
}
}
# ─── Init → Storage ───────────────────────────────────────────────────────────
init.minio-init -> storage.minio: create buckets {style.stroke-dash: 4}
init.pb-init -> storage.pocketbase: bootstrap schema {style.stroke-dash: 4}
# ─── App → Storage ────────────────────────────────────────────────────────────
app.backend -> storage.minio: blobs (chapters, audio,\navatars, browse)
app.backend -> storage.pocketbase: structured records\n(books, progress, jobs…)
app.backend -> storage.valkey: cache presigned URLs\n(SET/GET with TTL)
app.runner -> storage.minio: write chapter markdown\n& audio MP3s
app.runner -> storage.pocketbase: read/update scrape jobs\nwrite book records
app.runner -> storage.meilisearch: index books on\nscrape completion
app.ui -> storage.valkey: read presigned URL cache
app.ui -> storage.pocketbase: auth, progress,\ncomments, settings
# ─── App internal ─────────────────────────────────────────────────────────────
app.ui -> app.backend: REST API calls (server-side)\n/api/catalogue /api/book-preview\n/api/chapter-text /api/audio etc.
app.caddy -> app.ui: "/* (catch-all)\nSvelteKit — auth enforced"
app.caddy -> app.backend: "/health /scrape*\n/api/browse /api/catalogue\n/api/ranking /api/version\n/api/book-preview/*\n/api/chapter-text/*\n/api/chapter-markdown/*\n/api/reindex/* /api/cover/*\n/api/audio-proxy/* /api/voices\n/api/audio* /api/presign/*"
app.caddy -> storage.minio: "/avatars/* /audio/*\n/chapters/*\n(presigned GETs)"
app.caddy -> app.crowdsec: bouncer check (15 s poll)
app.caddy -> letsencrypt: ACME cert (TLS-ALPN-01)
# ─── Caddy routing ────────────────────────────────────────────────────────────
# Routes sent directly to backend (no SvelteKit counterpart):
# /health /scrape*
# /api/browse /api/book-preview/* /api/chapter-text/*
# /api/reindex/* /api/cover/* /api/audio-proxy/*
# Routes sent to MinIO:
# /avatars/*
# Everything else → SvelteKit UI (including /api/scrape/*, /api/chapter-text-preview/*)
app.ui -> app.backend: "internal REST proxy\n(server-side only)"
app.ui -> storage.pocketbase: "auth · sessions\nprogress · library\ncomments"
app.caddy -> app.ui: "/* (catch-all)\n/api/scrape/*\n/api/chapter-text-preview/*\n→ SvelteKit (auth enforced)"
app.caddy -> app.backend: "/health /scrape*\n/api/browse /api/book-preview/*\n/api/chapter-text/*\n/api/reindex/* /api/cover/*\n/api/audio-proxy/*"
app.caddy -> storage.minio: "/avatars/*\n/audio/*\n/chapters/*\n(presigned MinIO GETs)"
app.backend -> storage.minio: "chapter objs · audio MP3s\navatars · browse cache"
app.backend -> storage.pocketbase: "books · scrape_jobs\naudio_cache · ranking"
app.backend -> storage.valkey: "presign URL cache\n(SET/GET TTL ~55 min)"
app.backend -> storage.meilisearch: "catalogue search\nfacets: genres · status"
app.backend -> pockettts: "voice sample gen.\n(on-demand · ffmpeg conv.)"
# ─── External → App ───────────────────────────────────────────────────────────
# ─── Runner → deps ────────────────────────────────────────────────────────────
app.runner -> novelfire: scrape\n(HTTP GET)
app.runner -> kokoro: TTS generation\n(HTTP POST)
app.caddy -> letsencrypt: ACME certificate\n(TLS-ALPN-01)
homelab.runner -> novelfire: "HTTP scrape\nHTML → Markdown"
homelab.runner -> kokoro: "TTS generation\ntext → MP3"
homelab.runner -> storage.minio: "write chapters\n& audio MP3s"
homelab.runner -> storage.pocketbase: "read/update scrape_jobs\nwrite book records"
homelab.runner -> storage.meilisearch: "index books\n(on scrape completion)"
homelab.runner -> storage.valkey: "Asynq job queue\n(task consume)"
# ─── Ops → Docker socket ──────────────────────────────────────────────────────
ops.watchtower -> app.backend: watch (label-enabled)
ops.watchtower -> app.runner: watch (label-enabled)
ops.watchtower -> app.ui: watch (label-enabled)
# ─── Browser ──────────────────────────────────────────────────────────────────
# ─── Client ───────────────────────────────────────────────────────────────────
browser -> app.caddy: HTTPS :443\n(single entry point)
# ─── Ops / CI ─────────────────────────────────────────────────────────────────
ops.watchtower -> app.backend: watch (label-enabled)
ops.watchtower -> app.ui: watch (label-enabled)
cicd.gitea -> ops.watchtower: push to Docker Hub\n→ Watchtower detects new tag

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 58 KiB

After

Width:  |  Height:  |  Size: 65 KiB