Compare commits
245 Commits
41 .env.example
@@ -1,41 +0,0 @@
```
# libnovel scraper — environment overrides
# Copy to .env and adjust values; do NOT commit this file with real secrets.

# Browserless API token (leave empty to disable auth)
BROWSERLESS_TOKEN=

# Number of concurrent browser sessions in Browserless
BROWSERLESS_CONCURRENT=10

# Queue depth before Browserless returns 429
BROWSERLESS_QUEUED=100

# Per-session timeout in ms
BROWSERLESS_TIMEOUT=60000

# Optional webhook URL for Browserless error alerts (leave empty to disable)
ERROR_ALERT_URL=

# Which Browserless strategy the scraper uses: content | scrape | cdp | direct
BROWSERLESS_STRATEGY=direct

# Strategy for URL retrieval (chapter list). Uses browserless content strategy by default.
# Set to direct to use plain HTTP, or content/scrape/cdp for browserless.
BROWSERLESS_URL_STRATEGY=content

# Chapter worker goroutines (0 = NumCPU inside the container)
SCRAPER_WORKERS=0

# Host path to mount as the static output directory
STATIC_ROOT=./static/books

# ── Kokoro-FastAPI TTS ────────────────────────────────────────────────────────
# Base URL for the Kokoro-FastAPI service. When running via docker-compose the
# default (http://kokoro:8880) is wired in automatically; override here only if
# you are pointing at an external or GPU instance.
KOKORO_URL=http://kokoro:8880

# Default voice used for chapter narration.
# Single voices: af_bella, af_sky, af_heart, am_adam, …
# Mixed voices: af_bella+af_sky or af_bella(2)+af_sky(1) (weighted blend)
KOKORO_VOICE=af_bella
```
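The deleted `.env.example` above documents `SCRAPER_WORKERS=0` as "NumCPU inside the container", which implies a small resolution step at scraper startup. That step is not part of this diff; a minimal sketch of how such a default is typically resolved in Go (the helper name is hypothetical):

```go
package main

import (
	"os"
	"runtime"
	"strconv"
)

// workerCount is a hypothetical helper: it reads SCRAPER_WORKERS and treats
// 0 (or an unset/unparsable value) as "one worker per CPU visible to the
// process", which is what runtime.NumCPU reports inside the container.
func workerCount() int {
	n, err := strconv.Atoi(os.Getenv("SCRAPER_WORKERS"))
	if err != nil || n <= 0 {
		return runtime.NumCPU()
	}
	return n
}
```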
191 .gitea/workflows/ci-v3.yaml (Normal file)
@@ -0,0 +1,191 @@
```yaml
name: CI / v3

on:
  push:
    branches: ["main", "master"]
    paths:
      - "backend/**"
      - "ui/**"
      - "caddy/**"
      - "docker-compose.yml"
      - ".gitea/workflows/ci-v3.yaml"
  pull_request:
    branches: ["main", "master"]
    paths:
      - "backend/**"
      - "ui/**"
      - "caddy/**"
      - "docker-compose.yml"
      - ".gitea/workflows/ci-v3.yaml"

concurrency:
  group: ${{ gitea.workflow }}-${{ gitea.ref }}
  cancel-in-progress: true

jobs:
  # ── backend: vet & test ─────────────────────────────────────────────────────
  test-backend:
    name: Test backend
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-go@v5
        with:
          go-version-file: backend/go.mod
          cache-dependency-path: backend/go.sum

      - name: go vet
        working-directory: backend
        run: go vet ./...

      - name: Run tests
        working-directory: backend
        run: go test -short -race -count=1 -timeout=60s ./...

  # ── ui: type-check & build ──────────────────────────────────────────────────
  check-ui:
    name: Check ui
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: ui
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: npm
          cache-dependency-path: ui/package-lock.json

      - name: Install dependencies
        run: npm ci

      - name: Type check
        run: npm run check

      - name: Build
        run: npm run build

  # ── docker: backend ─────────────────────────────────────────────────────────
  docker-backend:
    name: Docker / backend
    runs-on: ubuntu-latest
    needs: [test-backend]
    if: gitea.event_name == 'push'
    steps:
      - uses: actions/checkout@v4

      - uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USER }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: backend
          target: backend
          push: true
          tags: |
            ${{ secrets.DOCKER_USER }}/libnovel-backend:latest
            ${{ secrets.DOCKER_USER }}/libnovel-backend:${{ gitea.sha }}
          build-args: |
            VERSION=${{ gitea.sha }}
            COMMIT=${{ gitea.sha }}
          cache-from: type=registry,ref=${{ secrets.DOCKER_USER }}/libnovel-backend:latest
          cache-to: type=inline

  # ── docker: runner ──────────────────────────────────────────────────────────
  docker-runner:
    name: Docker / runner
    runs-on: ubuntu-latest
    needs: [test-backend]
    if: gitea.event_name == 'push'
    steps:
      - uses: actions/checkout@v4

      - uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USER }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: backend
          target: runner
          push: true
          tags: |
            ${{ secrets.DOCKER_USER }}/libnovel-runner:latest
            ${{ secrets.DOCKER_USER }}/libnovel-runner:${{ gitea.sha }}
          build-args: |
            VERSION=${{ gitea.sha }}
            COMMIT=${{ gitea.sha }}
          cache-from: type=registry,ref=${{ secrets.DOCKER_USER }}/libnovel-runner:latest
          cache-to: type=inline

  # ── docker: ui ──────────────────────────────────────────────────────────────
  docker-ui:
    name: Docker / ui
    runs-on: ubuntu-latest
    needs: [check-ui]
    if: gitea.event_name == 'push'
    steps:
      - uses: actions/checkout@v4

      - uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USER }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: ui
          push: true
          tags: |
            ${{ secrets.DOCKER_USER }}/libnovel-ui:latest
            ${{ secrets.DOCKER_USER }}/libnovel-ui:${{ gitea.sha }}
          build-args: |
            BUILD_VERSION=${{ gitea.sha }}
            BUILD_COMMIT=${{ gitea.sha }}
          cache-from: type=registry,ref=${{ secrets.DOCKER_USER }}/libnovel-ui:latest
          cache-to: type=inline

  # ── docker: caddy ───────────────────────────────────────────────────────────
  docker-caddy:
    name: Docker / caddy
    runs-on: ubuntu-latest
    if: gitea.event_name == 'push'
    steps:
      - uses: actions/checkout@v4

      - uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USER }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: caddy
          push: true
          tags: |
            ${{ secrets.DOCKER_USER }}/libnovel-caddy:latest
            ${{ secrets.DOCKER_USER }}/libnovel-caddy:${{ gitea.sha }}
          cache-from: type=registry,ref=${{ secrets.DOCKER_USER }}/libnovel-caddy:latest
          cache-to: type=inline
```
@@ -1,104 +0,0 @@
```yaml
name: CI

on:
  push:
    branches: ["main", "master"]
    paths:
      - "scraper/**"
      - ".gitea/workflows/**"
  pull_request:
    branches: ["main", "master"]
    paths:
      - "scraper/**"
      - ".gitea/workflows/**"

defaults:
  run:
    working-directory: scraper

jobs:
  # ── lint & vet ──────────────────────────────────────────────────────────────
  lint:
    name: Lint
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-go@v5
        with:
          go-version-file: scraper/go.mod
          cache-dependency-path: scraper/go.sum

      - name: go vet
        run: go vet ./...

      - name: staticcheck
        run: go tool staticcheck ./...

  # ── tests ───────────────────────────────────────────────────────────────────
  test:
    name: Test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-go@v5
        with:
          go-version-file: scraper/go.mod
          cache-dependency-path: scraper/go.sum

      - name: Run tests
        run: go test -race -count=1 -timeout=60s ./...

  # ── build binary ────────────────────────────────────────────────────────────
  build:
    name: Build
    runs-on: ubuntu-latest
    needs: [lint, test]
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-go@v5
        with:
          go-version-file: scraper/go.mod
          cache-dependency-path: scraper/go.sum

      - name: Build binary
        run: |
          CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \
            go build -ldflags="-s -w" -o bin/scraper ./cmd/scraper

      - name: Upload binary artifact
        uses: actions/upload-artifact@v4
        with:
          name: scraper-linux-amd64
          path: scraper/bin/scraper
          retention-days: 7

  # ── docker build (& push) ───────────────────────────────────────────────────
  # Uncomment once the runner has Docker available and a registry is configured.
  #
  # docker:
  #   name: Docker
  #   runs-on: ubuntu-latest
  #   needs: [lint, test]
  #   # Only push images on commits to the default branch, not on PRs.
  #   # if: github.event_name == 'push'
  #   steps:
  #     - uses: actions/checkout@v4
  #
  #     - name: Log in to Gitea registry
  #       uses: docker/login-action@v3
  #       with:
  #         registry: gitea.kalekber.cc
  #         username: ${{ secrets.REGISTRY_USER }}
  #         password: ${{ secrets.REGISTRY_TOKEN }}
  #
  #     - name: Build and push
  #       uses: docker/build-push-action@v5
  #       with:
  #         context: ./scraper
  #         push: true
  #         tags: |
  #           gitea.kalekber.cc/kamil/libnovel:latest
  #           gitea.kalekber.cc/kamil/libnovel:${{ gitea.sha }}
```
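The removed workflow runs staticcheck via `go tool`, which on Go 1.24+ requires the tool to be declared as a module tool dependency. A sketch of the `go.mod` entry this assumes (module path and placement are illustrative, not taken from this diff):

```
// go.mod (excerpt; hypothetical module path)
module example.com/scraper

go 1.24

tool honnef.co/go/tools/cmd/staticcheck
```

Such an entry is normally added with `go get -tool honnef.co/go/tools/cmd/staticcheck@latest`, after which `go tool staticcheck ./...` works without a separate install step in CI.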
226 .gitea/workflows/release-v3.yaml (Normal file)
@@ -0,0 +1,226 @@
```yaml
name: Release / v3

on:
  push:
    tags:
      - "v*" # e.g. v1.0.0, v1.2.3

concurrency:
  group: ${{ gitea.workflow }}-${{ gitea.ref }}
  cancel-in-progress: true

jobs:
  # ── backend: vet & test ─────────────────────────────────────────────────────
  test-backend:
    name: Test backend
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-go@v5
        with:
          go-version-file: backend/go.mod
          cache-dependency-path: backend/go.sum

      - name: go vet
        working-directory: backend
        run: go vet ./...

      - name: Run tests
        working-directory: backend
        run: go test -short -race -count=1 -timeout=60s ./...

  # ── ui: type-check & build ──────────────────────────────────────────────────
  check-ui:
    name: Check ui
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: ui
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: npm
          cache-dependency-path: ui/package-lock.json

      - name: Install dependencies
        run: npm ci

      - name: Type check
        run: npm run check

      - name: Build
        run: npm run build

  # ── docker: backend ─────────────────────────────────────────────────────────
  docker-backend:
    name: Docker / backend
    runs-on: ubuntu-latest
    needs: [test-backend]
    steps:
      - uses: actions/checkout@v4

      - uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USER }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ secrets.DOCKER_USER }}/libnovel-backend
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=raw,value=latest

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: backend
          target: backend
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          build-args: |
            VERSION=${{ steps.meta.outputs.version }}
            COMMIT=${{ gitea.sha }}
          cache-from: type=registry,ref=${{ secrets.DOCKER_USER }}/libnovel-backend:latest
          cache-to: type=inline

  # ── docker: runner ──────────────────────────────────────────────────────────
  docker-runner:
    name: Docker / runner
    runs-on: ubuntu-latest
    needs: [test-backend]
    steps:
      - uses: actions/checkout@v4

      - uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USER }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ secrets.DOCKER_USER }}/libnovel-runner
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=raw,value=latest

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: backend
          target: runner
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          build-args: |
            VERSION=${{ steps.meta.outputs.version }}
            COMMIT=${{ gitea.sha }}
          cache-from: type=registry,ref=${{ secrets.DOCKER_USER }}/libnovel-runner:latest
          cache-to: type=inline

  # ── docker: ui ──────────────────────────────────────────────────────────────
  docker-ui:
    name: Docker / ui
    runs-on: ubuntu-latest
    needs: [check-ui]
    steps:
      - uses: actions/checkout@v4

      - uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USER }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ secrets.DOCKER_USER }}/libnovel-ui
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=raw,value=latest

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: ui
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          build-args: |
            BUILD_VERSION=${{ steps.meta.outputs.version }}
            BUILD_COMMIT=${{ gitea.sha }}
          cache-from: type=registry,ref=${{ secrets.DOCKER_USER }}/libnovel-ui:latest
          cache-to: type=inline

  # ── docker: caddy ───────────────────────────────────────────────────────────
  docker-caddy:
    name: Docker / caddy
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USER }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ secrets.DOCKER_USER }}/libnovel-caddy
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=raw,value=latest

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: caddy
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=registry,ref=${{ secrets.DOCKER_USER }}/libnovel-caddy:latest
          cache-to: type=inline

  # ── Gitea release ───────────────────────────────────────────────────────────
  release:
    name: Gitea Release
    runs-on: ubuntu-latest
    needs: [docker-backend, docker-runner, docker-ui, docker-caddy]
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Create release
        uses: actions/gitea-release-action@v1
        with:
          token: ${{ secrets.GITEA_TOKEN }}
          generate_release_notes: true
```
11 .gitignore (vendored)
@@ -5,15 +5,16 @@
```
/dist/

# ── Compiled binaries ──────────────────────────────────────────────────────────
scraper/bin/
backend/bin/

# ── Scraped output (large, machine-generated) ──────────────────────────────────

/static/books
# ── Environment & secrets ──────────────────────────────────────────────────────
# Secrets are managed by Doppler — never commit .env files.
.env
.env.*
!.env.example
.env.local

# ── CrowdSec — generated bouncer API key ──────────────────────────────────────
crowdsec/.crowdsec.env

# ── OS artefacts ───────────────────────────────────────────────────────────────
.DS_Store
```
156 .opencode/skills/ios-ux/SKILL.md (Normal file)
@@ -0,0 +1,156 @@
````markdown
---
name: ios-ux
description: iOS/SwiftUI UI & UX review and implementation guidelines for LibNovel. Enforces Apple HIG, iOS 17+ APIs, spring animations, haptics, accessibility, performance, and offline handling. Load this skill for any iOS view work.
compatibility: opencode
---

# iOS UI/UX Skill — LibNovel

Load this skill whenever working on SwiftUI views in `ios/`. It defines design standards, the review process for screenshots, and implementation rules.

---

## Screenshot Review Process

When the user provides a screenshot of the app:

1. **Analyze first** — identify specific UI/UX issues across these categories:
   - Visual hierarchy and spacing
   - Typography (size, weight, contrast)
   - Color and material usage
   - Animation and interactivity gaps
   - Accessibility problems
   - Deprecated or non-native patterns
2. **Present a numbered list** of suggested improvements with a brief rationale for each.
3. **Ask for confirmation** before writing any code: "Should I apply all of these, or only specific ones?"
4. Apply only what the user confirms.

---

## Design System

### Colors & Materials

- **Accent**: `Color.amber` (project-defined). Use for active state, selection indicators, progress fills, and CTAs.
- **Backgrounds**: Prefer `.regularMaterial`, `.ultraThinMaterial`, or `.thinMaterial` over hard-coded `Color.black.opacity(x)` or `Color(.systemBackground)`.
- **Dark overlays** (e.g. full-screen players): Use a blurred `KFImage` background + `Color.black.opacity(0.5–0.6)` overlay. Never use a flat solid black background.
- **Semantic colors**: Use `.primary`, `.secondary`, `.tertiary` foreground styles. Avoid hard-coded `Color.white` except in dark material contexts (full-screen player).
- **No hardcoded color literals** — use `Color+App.swift` extensions or system semantic colors.

### Typography

- Use the SF Pro system font via `.font(.title)`, `.font(.body)`, etc. — never hardcode font names except for intentional stylistic accents (e.g. "Snell Roundhand" for the voice watermark).
- Apply `.fontWeight()` and `.fontDesign()` modifiers rather than custom font families.
- Support Dynamic Type — never hardcode a fixed font size as the sole option without a `.minimumScaleFactor` or system font size modifier.
- Hierarchy: `title3.bold` for primary labels, `subheadline` for secondary, `caption`/`caption2` for metadata.

### Spacing & Layout

- Minimum touch target: **44×44 pt**. Use `.frame(minWidth: 44, minHeight: 44)` or `.contentShape(Rectangle())` on small icons.
- Prefer 16–20 pt horizontal padding on full-width containers; 12 pt for compact inner elements.
- Use `VStack(spacing:)` and `HStack(spacing:)` explicitly — never rely on default spacing for production UI.
- Corner radii: 12–14 pt for cards/chips, 10 pt for small badges, 20–24 pt for large cover art.

---

## Animation Rules

### Spring Animations (default for all interactive transitions)

- Use `.spring(response:dampingFraction:)` for state-driven layout changes, selection feedback, and appear/disappear transitions.
- Recommended defaults:
  - Interactive elements: `response: 0.3, dampingFraction: 0.7`
  - Entrance animations: `response: 0.45–0.5, dampingFraction: 0.7`
  - Quick snappy feedback: `response: 0.2, dampingFraction: 0.6`
- Reserve `.easeInOut` only for non-interactive, ambient animations (e.g. opacity pulses, generating overlays).

### SF Symbol Transitions

- Always use `contentTransition(.symbolEffect(.replace.downUp))` when a symbol name changes based on state (play/pause, checkmark/circle, etc.).
- Use `.symbolEffect(.variableColor.cumulative)` for continuous animations (waveform, loading indicators).
- Use `.symbolEffect(.bounce)` for one-shot entrance emphasis (e.g. a completion checkmark appearing).
- Use `.symbolEffect(.pulse)` for error/warning states that need attention.

### Repeating Animations

- Use `phaseAnimator` for any looping animation that previously used manual `@State` + `withAnimation` chains.
- Do not use `Timer` publishers for UI animation — prefer `phaseAnimator` or `TimelineView`.

---

## Haptic Feedback

Add `UIImpactFeedbackGenerator` to every user-initiated interactive control:

- `.light` — toggle switches, selection chips, secondary actions, slider drag start.
- `.medium` — primary transport buttons (play/pause, chapter skip), significant confirmations.
- `.heavy` — destructive actions (only if no confirmation dialog).

Pattern:

```swift
Button {
    UIImpactFeedbackGenerator(style: .light).impactOccurred()
    // action
} label: { ... }
```

Do **not** add haptics to:

- Programmatic state changes not directly triggered by a tap.
- Buttons inside `List` rows that already use swipe actions.
- Scroll events.

---

## iOS 17+ API Usage

Flag and replace any of the following deprecated patterns:

| Deprecated | Replace with |
|---|---|
| `NavigationView` | `NavigationStack` |
| `@StateObject` / `ObservableObject` (new types only) | `@Observable` macro |
| `DispatchQueue.main.async` | `await MainActor.run` or `@MainActor` |
| Manual `@State` animation chains for repeating loops | `phaseAnimator` |
| `.animation(_:)` without `value:` | `.animation(_:value:)` |
| `AnyView` wrapping for conditional content | `@ViewBuilder` + `Group` |

Do **not** refactor existing `ObservableObject` types to `@Observable` unless explicitly asked — only apply `@Observable` to new types.

---

## Accessibility

Every view must:

- Support VoiceOver: add `.accessibilityLabel()` to icon-only buttons and image views.
- Support Dynamic Type: test that text doesn't truncate at xxxLarge without a layout adjustment.
- Meet contrast ratios: text on tinted backgrounds must be legible — avoid `.opacity(0.25)` or lower for any user-readable text.
- Keep touch targets ≥ 44 pt (see Spacing above).
- Give interactive controls `.accessibilityAddTraits(.isButton)` if not using `Button`.
- Not rely solely on color to convey state — pair color with an icon or label.

---

## Performance

- **Isolate high-frequency observers**: Any view that observes a `PlaybackProgress` (timer-tick updates) must be a separate sub-view that `@ObservedObject`-observes only the progress object, not the parent view. This prevents the entire parent from re-rendering every 0.5 seconds.
- **Avoid `id()` overuse**: Only use `.id()` to force view recreation when necessary (e.g. background image on track change). Prefer `onChange(of:)` for side effects.
- **Lazy containers**: Use `LazyVStack` / `LazyHStack` inside `ScrollView` for lists of 20+ items. `List` is inherently lazy and does not need this.
- **Image loading**: Always use `KFImage` (Kingfisher) with `.placeholder` for remote images. Never use `AsyncImage` for cover art — it has no disk cache.
- **Avoid `AnyView`**: It breaks structural identity and hurts diffing. Use `@ViewBuilder` or `Group { }` instead.

---

## Offline & Error States

Every view that makes network calls must:

1. Wrap the body in a `VStack` with `OfflineBanner` at the top, gated on `networkMonitor.isConnected`.
2. Suppress network errors silently when offline via `ErrorAlertModifier` — do not show an alert when the device is offline.
3. Gate `.task` / `.onAppear` network calls: `guard networkMonitor.isConnected else { return }`.
4. Show a non-blocking inline empty state (not a full-screen error) for failed loads when online.

---

## Component Checklist (before submitting any view change)

- [ ] All interactive elements ≥ 44 pt touch target
- [ ] SF Symbol state changes use `contentTransition(.symbolEffect(...))`
- [ ] State-driven layout transitions use `.spring(response:dampingFraction:)`
- [ ] Tappable controls have haptic feedback
- [ ] No `NavigationView`, no `DispatchQueue.main.async`, no `.animation(_:)` without `value:`
- [ ] High-frequency observers are isolated sub-views
- [ ] Offline state handled with `OfflineBanner` + `NetworkMonitor`
- [ ] VoiceOver labels on icon-only buttons
- [ ] No hardcoded `Color.black` / `Color.white` / `Color(.systemBackground)` where a material applies
````
89 AGENTS.md
@@ -1,89 +0,0 @@
````markdown
# libnovel Project

Go web scraper for novelfire.net with TTS support via Kokoro-FastAPI.

## Architecture

```
scraper/
├── cmd/scraper/main.go              # Entry point: 'run' (one-shot) and 'serve' (HTTP server)
├── internal/
│   ├── orchestrator/orchestrator.go # Coordinates catalogue walk, metadata extraction, chapter scraping
│   ├── browser/                     # Browser client (content/scrape/cdp strategies) via Browserless
│   ├── novelfire/scraper.go         # novelfire.net specific scraping logic
│   ├── server/server.go             # HTTP API (POST /scrape, POST /scrape/book)
│   ├── writer/writer.go             # File writer (metadata.yaml, chapter .md files)
│   └── scraper/interfaces.go        # NovelScraper interface definition
└── static/books/                    # Output directory for scraped content
```

## Key Concepts

- **Orchestrator**: Manages concurrency - catalogue streaming → per-book metadata goroutines → chapter worker pool
- **Browser Client**: 3 strategies (content/scrape/cdp) via Browserless Chrome container
- **Writer**: Writes metadata.yaml and chapter markdown files to `static/books/{slug}/vol-0/1-50/`
- **Server**: HTTP API with async scrape jobs, UI for browsing books/chapters, chapter-text endpoint for TTS

## Commands

```bash
# Build
cd scraper && go build -o bin/scraper ./cmd/scraper

# One-shot scrape (full catalogue)
./bin/scraper run

# Single book
./bin/scraper run --url https://novelfire.net/book/xxx

# HTTP server
./bin/scraper serve

# Tests
cd scraper && go test ./...
```

## Environment Variables

| Variable | Description | Default |
|----------|-------------|---------|
| BROWSERLESS_URL | Browserless Chrome endpoint | http://localhost:3030 |
| BROWSERLESS_STRATEGY | content \| scrape \| cdp | content |
| SCRAPER_WORKERS | Chapter goroutines | NumCPU |
| SCRAPER_STATIC_ROOT | Output directory | ./static/books |
| SCRAPER_HTTP_ADDR | HTTP listen address | :8080 |
| KOKORO_URL | Kokoro TTS endpoint | http://localhost:8880 |
| KOKORO_VOICE | Default TTS voice | af_bella |
| LOG_LEVEL | debug \| info \| warn \| error | info |

## Docker

```bash
docker-compose up -d # Starts browserless, kokoro, scraper
```

## Code Patterns

- Uses `log/slog` for structured logging
- Context-based cancellation throughout
- Worker pool pattern in orchestrator (channel + goroutines)
- Mutex for single async job (409 on concurrent scrape requests)

## AI Context Tips

- Primary files to modify: `orchestrator.go`, `server.go`, `scraper.go`, `browser/*.go`
- To add a new source: implement the `NovelScraper` interface from `internal/scraper/interfaces.go`
- Skip the `static/` directory - generated content, not source

## Speed Up AI Sessions (Optional)

For faster AI context loading, use **Context7** (free, local indexing):

```bash
# Install and index once
npx @context7/cli@latest index --path . --ignore .aiignore

# After first run, AI tools will query the index instead of re-scanning files
```

VSCode extension: https://marketplace.visualstudio.com/items?itemName=context7.context7
````
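The "Code Patterns" bullets in the deleted AGENTS.md name two concurrency patterns without showing them. A minimal standard-library sketch of both, with hypothetical names (the actual orchestrator and server code are not part of this diff): a channel-fed chapter worker pool, and a mutex guard that answers concurrent scrape requests with HTTP 409.

```go
package main

import (
	"net/http"
	"sync"
)

// scrapeChapters fans chapter IDs out to a fixed pool of goroutines;
// this is the channel+goroutine shape the orchestrator bullet describes.
func scrapeChapters(ids []int, workers int, scrape func(int)) {
	jobs := make(chan int)
	var wg sync.WaitGroup
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for id := range jobs {
				scrape(id)
			}
		}()
	}
	for _, id := range ids {
		jobs <- id
	}
	close(jobs)
	wg.Wait()
}

// jobMu serializes the single async scrape job. A second request while one
// is running gets 409 Conflict, matching the "Mutex for single async job" bullet.
var jobMu sync.Mutex

func handleScrape(w http.ResponseWriter, r *http.Request) {
	if !jobMu.TryLock() { // TryLock: Go 1.18+
		http.Error(w, "scrape already in progress", http.StatusConflict)
		return
	}
	go func() {
		defer jobMu.Unlock()
		// ... run the scrape job ...
	}()
	w.WriteHeader(http.StatusAccepted)
}
```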
240 Caddyfile (Normal file)
@@ -0,0 +1,240 @@
```
# v3/Caddyfile
#
# Caddy reverse proxy for LibNovel v3.
# Custom build includes github.com/mholt/caddy-ratelimit.
#
# Environment variables consumed (set in docker-compose.yml):
#   DOMAIN           — public hostname, e.g. libnovel.example.com
#                      Use "localhost" for local dev (no TLS cert attempted).
#   CADDY_ACME_EMAIL — Let's Encrypt notification email (empty = no email)
#
# Routing rules (main domain):
#   /health                 → backend:8080 (liveness probe)
#   /scrape*                → backend:8080 (Go admin scrape endpoints)
#   /api/book-preview/*     → backend:8080 (live scrape, no store write)
#   /api/chapter-text/*     → backend:8080 (chapter markdown from MinIO)
#   /api/chapter-markdown/* → backend:8080 (chapter markdown from MinIO)
#   /api/reindex/*          → backend:8080 (rebuild chapter index)
#   /api/cover/*            → backend:8080 (proxy cover image)
#   /api/audio-proxy/*      → backend:8080 (proxy generated audio)
#   /avatars/*              → minio:9000 (presigned avatar GETs)
#   /audio/*                → minio:9000 (presigned audio GETs)
#   /chapters/*             → minio:9000 (presigned chapter GETs)
#   /* (everything else)    → ui:3000 (SvelteKit — handles all
#                             remaining /api/* routes)
#
# Subdomain routing:
#   feedback.libnovel.cc  → fider:3000 (user feedback / feature requests)
#   errors.libnovel.cc    → glitchtip-web:8000 (error tracking)
#   analytics.libnovel.cc → umami:3000 (page analytics)
#   logs.libnovel.cc      → dozzle:8080 (Docker log viewer)
#   uptime.libnovel.cc    → uptime-kuma:3001 (uptime monitoring)
#   push.libnovel.cc      → gotify:80 (push notifications)
#
# Routes intentionally removed from direct-to-backend:
#   /api/scrape/*               — SvelteKit has /api/scrape/ counterparts
#                                 that enforce auth; routing directly would
#                                 bypass SK middleware.
#   /api/chapter-text-preview/* — Same: SvelteKit owns
#                                 /api/chapter-text-preview/[slug]/[n].
#   /api/browse                 — Endpoint removed; browse snapshot system
#                                 was deleted.
{
	# Email for Let's Encrypt ACME account registration.
	# When CADDY_ACME_EMAIL is set this expands to e.g. "email you@example.com".
	# When unset it expands to an empty string and is silently ignored.
	{$CADDY_ACME_EMAIL:}

	# CrowdSec bouncer — streams decisions from the CrowdSec LAPI every 15s.
	# CROWDSEC_API_KEY is injected at runtime via crowdsec/.crowdsec.env.
	# The default "disabled" placeholder makes the bouncer fail-open (warn,
	# pass traffic) when no key is configured — Caddy still starts cleanly.
	crowdsec {
		api_url http://crowdsec:8080
		api_key {$CROWDSEC_API_KEY:disabled}
		ticker_interval 15s
	}
}

(security_headers) {
	header {
		# Prevent clickjacking
		X-Frame-Options "SAMEORIGIN"
		# Prevent MIME-type sniffing
		X-Content-Type-Options "nosniff"
		# Minimal referrer info for cross-origin requests
		Referrer-Policy "strict-origin-when-cross-origin"
		# Restrict powerful browser features
		Permissions-Policy "camera=(), microphone=(), geolocation=(), payment=()"
		# Enforce HTTPS for 1 year (includeSubDomains)
		Strict-Transport-Security "max-age=31536000; includeSubDomains"
		# Enable XSS filter in older browsers
		X-XSS-Protection "1; mode=block"
		# Remove server identity header
		-Server
	}
}

{$DOMAIN:localhost} {
	import security_headers

	# ── CrowdSec bouncer ──────────────────────────────────────────────────────
	# Checks every incoming request against CrowdSec decisions.
	# Banned IPs receive a 403; all others pass through unchanged.
	route {
		crowdsec
	}

	# ── Rate limiting ─────────────────────────────────────────────────────────
	# Auth endpoints: strict — 10 req/min per IP
	rate_limit {
		zone auth_zone {
			match {
				path /api/auth/login /api/auth/register /api/auth/change-password
			}
			key {remote_host}
			window 1m
			events 10
		}
	}

	# Admin scrape endpoints: moderate — 20 req/min per IP
	rate_limit {
		zone scrape_zone {
			match {
				path /scrape*
			}
			key {remote_host}
			window 1m
			events 20
		}
	}

	# Global: 300 req/min per IP (covers everything)
	rate_limit {
		zone global_zone {
			key {remote_host}
			window 1m
			events 300
		}
	}

	# ── Liveness probe ────────────────────────────────────────────────────────
	handle /health {
		reverse_proxy backend:8080
	}

	# ── Scrape task creation (Go backend only) ────────────────────────────────
	handle /scrape* {
		reverse_proxy backend:8080
	}

	# ── Backend-only API paths ────────────────────────────────────────────────
	# These paths are served exclusively by the Go backend and have no
	# SvelteKit counterpart. Routing them here skips SK intentionally.
	handle /api/book-preview/* {
		reverse_proxy backend:8080
	}
	handle /api/chapter-text/* {
		reverse_proxy backend:8080
	}
	handle /api/chapter-markdown/* {
		reverse_proxy backend:8080
	}
	handle /api/reindex/* {
		reverse_proxy backend:8080
	}
	handle /api/cover/* {
		reverse_proxy backend:8080
	}
	handle /api/audio-proxy/* {
		reverse_proxy backend:8080
	}

	# ── MinIO bucket paths (presigned URLs) ───────────────────────────────────
	# MinIO path-style presigned URLs include the bucket name as the first
	# path segment. MINIO_PUBLIC_ENDPOINT points here, so Caddy must proxy
	# these paths directly to MinIO — no auth layer needed (the presigned
	# signature itself enforces access and expiry).
	handle /avatars/* {
		reverse_proxy minio:9000
	}
	handle /audio/* {
		reverse_proxy minio:9000
	}
	handle /chapters/* {
		reverse_proxy minio:9000
	}

	# ── SvelteKit UI (catch-all — includes all remaining /api/* routes) ───────
	handle {
		reverse_proxy ui:3000
	}

	# ── Caddy-level error pages ───────────────────────────────────────────────
	# These fire when the upstream (backend or ui) is completely unreachable.
	# SvelteKit's own +error.svelte handles application-level errors (404, 500).
	handle_errors 502 {
		root * /srv/errors
		rewrite * /502.html
		file_server
	}
	handle_errors 503 {
		root * /srv/errors
		rewrite * /503.html
		file_server
	}
	handle_errors 504 {
		root * /srv/errors
		rewrite * /504.html
		file_server
	}

	# ── Logging ───────────────────────────────────────────────────────────────
	# JSON log file read by CrowdSec for threat detection.
	log {
		output file /var/log/caddy/access.log {
			roll_size 100MiB
			roll_keep 5
			roll_keep_for 720h
		}
		format json
	}
}

# ── Fider: user feedback & feature requests ───────────────────────────────────
feedback.libnovel.cc {
	import security_headers
	reverse_proxy fider:3000
}

# ── GlitchTip: error tracking ─────────────────────────────────────────────────
errors.libnovel.cc {
	import security_headers
	reverse_proxy glitchtip-web:8000
}

# ── Umami: page analytics ─────────────────────────────────────────────────────
analytics.libnovel.cc {
	import security_headers
	reverse_proxy umami:3000
}

# ── Dozzle: Docker log viewer ─────────────────────────────────────────────────
logs.libnovel.cc {
	import security_headers
	reverse_proxy dozzle:8080
}

# ── Uptime Kuma: uptime monitoring ────────────────────────────────────────────
uptime.libnovel.cc {
	import security_headers
	reverse_proxy uptime-kuma:3001
}

# ── Gotify: push notifications ────────────────────────────────────────────────
push.libnovel.cc {
	import security_headers
	reverse_proxy gotify:80
}
```
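The presigned-URL comment above is why the `/avatars/*`, `/audio/*`, and `/chapters/*` handles can proxy straight to MinIO with no auth layer: the signature in the query string enforces access and expiry. A minimal sketch of how the Go backend could mint such a URL with minio-go (endpoint, bucket, and object names here are illustrative, not taken from this diff):

```go
package main

import (
	"context"
	"fmt"
	"net/url"
	"time"

	"github.com/minio/minio-go/v7"
	"github.com/minio/minio-go/v7/pkg/credentials"
)

func main() {
	// Client pointed at the public endpoint so the signature matches the
	// host the browser will actually request (Caddy forwards to minio:9000).
	mc, err := minio.New("libnovel.example.com", &minio.Options{
		Creds:  credentials.NewStaticV4("ACCESS_KEY", "SECRET_KEY", ""),
		Secure: true,
	})
	if err != nil {
		panic(err)
	}

	// Path-style URL: /<bucket>/<object>?X-Amz-Signature=...; the first
	// path segment is the bucket, which is what the Caddy handles match on.
	u, err := mc.PresignedGetObject(context.Background(),
		"chapters", "some-book/vol-0/1.md", 15*time.Minute, url.Values{})
	if err != nil {
		panic(err)
	}
	fmt.Println(u) // valid for 15 minutes, then the signature check fails
}
```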
38 README.md (Normal file)
@@ -0,0 +1,38 @@
````markdown
# LibNovel

Self-hosted audiobook platform. Go backend + SvelteKit UI + MinIO/PocketBase/Meilisearch.

## Requirements

- Docker + Docker Compose
- [just](https://github.com/casey/just)
- [Doppler CLI](https://docs.doppler.com/docs/install-cli)

## Setup

```sh
doppler login
doppler setup # project=libnovel, config=prd
```

## Usage

```sh
just up          # start everything
just down        # stop
just logs        # tail all logs
just log backend # tail one service
just build       # rebuild images
just restart     # down + up
just secrets     # view/edit secrets
```

## Secrets

Managed via Doppler (`project=libnovel`, `config=prd`). No `.env` files.

To add or update a secret:

```sh
doppler secrets set MY_SECRET=value
```
````
13 backend/.dockerignore (Normal file)
@@ -0,0 +1,13 @@
```
# Exclude compiled binaries
bin/

# Exclude test binaries produced by `go test -c`
*.test

# Git history is not needed inside the image
.git/

# Editor/OS noise
.DS_Store
*.swp
*.swo
```
42 backend/Dockerfile (Normal file)
@@ -0,0 +1,42 @@
```dockerfile
# syntax=docker/dockerfile:1
FROM golang:1.26.1-alpine AS builder
WORKDIR /app

# Download modules into the BuildKit cache so they survive across builds.
# This layer is only invalidated when go.mod or go.sum changes.
COPY go.mod go.sum ./
RUN --mount=type=cache,target=/root/go/pkg/mod \
    go mod download

COPY . .

ARG VERSION=dev
ARG COMMIT=unknown

# Build all three binaries in a single layer so the Go compiler can reuse
# intermediate object files. Both cache mounts are preserved between builds:
#   /root/go/pkg/mod      — downloaded module source
#   /root/.cache/go-build — compiled package objects (incremental recompile)
RUN --mount=type=cache,target=/root/go/pkg/mod \
    --mount=type=cache,target=/root/.cache/go-build \
    CGO_ENABLED=0 GOOS=linux go build \
      -ldflags="-s -w -X main.version=${VERSION} -X main.commit=${COMMIT}" \
      -o /out/backend ./cmd/backend && \
    CGO_ENABLED=0 GOOS=linux go build \
      -ldflags="-s -w -X main.version=${VERSION} -X main.commit=${COMMIT}" \
      -o /out/runner ./cmd/runner && \
    CGO_ENABLED=0 GOOS=linux go build \
      -ldflags="-s -w" \
      -o /out/healthcheck ./cmd/healthcheck

# ── backend service ──────────────────────────────────────────────────────────
FROM gcr.io/distroless/static:nonroot AS backend
COPY --from=builder /out/healthcheck /healthcheck
COPY --from=builder /out/backend /backend
ENTRYPOINT ["/backend"]

# ── runner service ───────────────────────────────────────────────────────────
FROM gcr.io/distroless/static:nonroot AS runner
COPY --from=builder /out/healthcheck /healthcheck
COPY --from=builder /out/runner /runner
ENTRYPOINT ["/runner"]
```
153 backend/cmd/backend/main.go (Normal file)
@@ -0,0 +1,153 @@
```go
// Command backend is the LibNovel HTTP API server.
//
// It exposes all endpoints consumed by the SvelteKit UI: book/chapter reads,
// scrape-task creation, presigned MinIO URLs, audio-task creation, reading
// progress, live novelfire.net search, and Kokoro voice list.
//
// All heavy lifting (scraping, TTS generation) is delegated to the runner
// binary via PocketBase task records. The backend never scrapes directly.
//
// Usage:
//
//	backend # start HTTP server (blocks until SIGINT/SIGTERM)
package main

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"os/signal"
	"syscall"
	"time"

	"github.com/getsentry/sentry-go"
	"github.com/libnovel/backend/internal/backend"
	"github.com/libnovel/backend/internal/config"
	"github.com/libnovel/backend/internal/kokoro"
	"github.com/libnovel/backend/internal/meili"
	"github.com/libnovel/backend/internal/storage"
)

// version and commit are set at build time via -ldflags.
var (
	version = "dev"
	commit  = "unknown"
)

func main() {
	if err := run(); err != nil {
		fmt.Fprintf(os.Stderr, "backend: fatal: %v\n", err)
		os.Exit(1)
	}
}

func run() error {
	cfg := config.Load()

	// ── Sentry / GlitchTip error tracking ────────────────────────────────────
	if dsn := os.Getenv("GLITCHTIP_DSN"); dsn != "" {
		if err := sentry.Init(sentry.ClientOptions{
			Dsn:              dsn,
			Release:          version + "@" + commit,
			TracesSampleRate: 0.1,
		}); err != nil {
			fmt.Fprintf(os.Stderr, "backend: sentry init warning: %v\n", err)
		} else {
			defer sentry.Flush(2 * time.Second)
		}
	}

	// ── Logger ───────────────────────────────────────────────────────────────
	log := buildLogger(cfg.LogLevel)
	log.Info("backend starting",
		"version", version,
		"commit", commit,
		"addr", cfg.HTTP.Addr,
	)

	// ── Context: cancel on SIGINT / SIGTERM ──────────────────────────────────
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()

	// ── Storage ──────────────────────────────────────────────────────────────
	store, err := storage.NewStore(ctx, cfg, log)
	if err != nil {
		return fmt.Errorf("init storage: %w", err)
	}

	// ── Kokoro (voice list only; audio generation is done by the runner) ─────
	var kokoroClient kokoro.Client
	if cfg.Kokoro.URL != "" {
		kokoroClient = kokoro.New(cfg.Kokoro.URL)
		log.Info("kokoro voices enabled", "url", cfg.Kokoro.URL)
	} else {
		log.Info("KOKORO_URL not set — voice list will use built-in fallback")
		kokoroClient = &noopKokoro{}
	}

	// ── Meilisearch (search reads only; indexing is the runner's job) ────────
	var searchIndex meili.Client
	if cfg.Meilisearch.URL != "" {
		searchIndex = meili.New(cfg.Meilisearch.URL, cfg.Meilisearch.APIKey)
		log.Info("meilisearch search enabled", "url", cfg.Meilisearch.URL)
	} else {
		log.Info("MEILI_URL not set — search will use PocketBase substring fallback")
		searchIndex = meili.NoopClient{}
	}

	// ── Backend server ───────────────────────────────────────────────────────
	srv := backend.New(
		backend.Config{
			Addr:         cfg.HTTP.Addr,
			DefaultVoice: cfg.Kokoro.DefaultVoice,
			Version:      version,
			Commit:       commit,
		},
		backend.Dependencies{
			BookReader:    store,
			RankingStore:  store,
			AudioStore:    store,
			PresignStore:  store,
			ProgressStore: store,
			CoverStore:    store,
			Producer:      store,
			TaskReader:    store,
			SearchIndex:   searchIndex,
			Kokoro:        kokoroClient,
			Log:           log,
		},
	)

	return srv.ListenAndServe(ctx)
}

// ── Helpers ───────────────────────────────────────────────────────────────────

func buildLogger(level string) *slog.Logger {
	var lvl slog.Level
	switch level {
	case "debug":
		lvl = slog.LevelDebug
	case "warn":
		lvl = slog.LevelWarn
	case "error":
		lvl = slog.LevelError
	default:
		lvl = slog.LevelInfo
	}
	return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: lvl}))
}

// noopKokoro is a no-op implementation used when KOKORO_URL is not set.
// The backend only uses Kokoro for the voice list; audio generation is the
// runner's responsibility. With no URL the built-in fallback list is served.
type noopKokoro struct{}

func (n *noopKokoro) GenerateAudio(_ context.Context, _, _ string) ([]byte, error) {
	return nil, fmt.Errorf("kokoro not configured (KOKORO_URL is empty)")
}

func (n *noopKokoro) ListVoices(_ context.Context) ([]string, error) {
	return nil, nil
}
```
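The doc comment above says the backend delegates all heavy work to the runner via PocketBase task records, and the wiring passes the store as a `Producer` dependency. The interface itself lives in `internal/backend`, which is not shown in this diff; the sketch below is a guess at its shape, with hypothetical method names, included only to make the delegation contract concrete:

```go
package backend // hypothetical placement; the real definition may differ

import "context"

// Producer is a guessed shape for the task-enqueue dependency wired in
// main.go. It captures the contract the doc comment describes: the backend
// only records pending tasks, and the runner polls and executes them.
type Producer interface {
	// CreateScrapeTask records a pending scrape task for the runner.
	CreateScrapeTask(ctx context.Context, bookURL string) (taskID string, err error)
	// CreateAudioTask records a pending TTS task for one chapter.
	CreateAudioTask(ctx context.Context, bookSlug string, chapter int, voice string) (taskID string, err error)
}
```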
57 backend/cmd/backend/main_test.go (Normal file)
@@ -0,0 +1,57 @@
```go
package main

import (
	"os"
	"testing"
)

// TestBuildLogger verifies that buildLogger returns a non-nil logger for each
// supported log level string and for unknown values.
func TestBuildLogger(t *testing.T) {
	for _, level := range []string{"debug", "info", "warn", "error", "unknown", ""} {
		l := buildLogger(level)
		if l == nil {
			t.Errorf("buildLogger(%q) returned nil", level)
		}
	}
}

// TestNoopKokoro verifies that the no-op Kokoro stub returns the expected
// sentinel error from GenerateAudio and nil, nil from ListVoices.
func TestNoopKokoro(t *testing.T) {
	noop := &noopKokoro{}

	_, err := noop.GenerateAudio(t.Context(), "text", "af_bella")
	if err == nil {
		t.Fatal("noopKokoro.GenerateAudio: expected error, got nil")
	}

	voices, err := noop.ListVoices(t.Context())
	if err != nil {
		t.Fatalf("noopKokoro.ListVoices: unexpected error: %v", err)
	}
	if voices != nil {
		t.Fatalf("noopKokoro.ListVoices: expected nil slice, got %v", voices)
	}
}

// TestRunStorageUnreachable verifies that run() fails fast and returns a
// descriptive error when PocketBase is unreachable.
func TestRunStorageUnreachable(t *testing.T) {
	// Point at an address nothing is listening on.
	t.Setenv("POCKETBASE_URL", "http://127.0.0.1:19999")
	// Use a fast listen address so we don't accidentally start a real server.
	t.Setenv("BACKEND_HTTP_ADDR", "127.0.0.1:0")

	err := run()
	if err == nil {
		t.Fatal("run() should have returned an error when storage is unreachable")
	}

	t.Logf("got expected error: %v", err)
}

// TestMain runs the test suite. No special setup required.
func TestMain(m *testing.M) {
	os.Exit(m.Run())
}
```
89
backend/cmd/healthcheck/main.go
Normal file
89
backend/cmd/healthcheck/main.go
Normal file
@@ -0,0 +1,89 @@
|
||||
// healthcheck is a static binary used by Docker HEALTHCHECK CMD in distroless
// images (which have no shell, wget, or curl).
//
// Two modes:
//
//  1. HTTP mode (default):
//     /healthcheck <url>
//     Performs GET <url>; exits 0 if HTTP 2xx/3xx, 1 otherwise.
//     Example: /healthcheck http://localhost:8080/health
//
//  2. File-liveness mode:
//     /healthcheck file <path> <max_age_seconds>
//     Reads <path>, parses its content as an RFC3339 timestamp, and exits 1 if
//     the timestamp is older than <max_age_seconds>; when the content is not a
//     valid timestamp, the file's mtime is used instead. Used by the runner
//     service, which writes /tmp/runner.alive on every successful poll.
//     Example: /healthcheck file /tmp/runner.alive 120
package main

import (
	"fmt"
	"net/http"
	"os"
	"strconv"
	"time"
)

func main() {
	if len(os.Args) > 1 && os.Args[1] == "file" {
		checkFile()
		return
	}
	checkHTTP()
}

// checkHTTP performs a GET request and exits 0 on success, 1 on failure.
func checkHTTP() {
	url := "http://localhost:8080/health"
	if len(os.Args) > 1 {
		url = os.Args[1]
	}
	resp, err := http.Get(url) //nolint:gosec,noctx
	if err != nil {
		fmt.Fprintf(os.Stderr, "healthcheck: %v\n", err)
		os.Exit(1)
	}
	resp.Body.Close()
	if resp.StatusCode >= 400 {
		fmt.Fprintf(os.Stderr, "healthcheck: status %d\n", resp.StatusCode)
		os.Exit(1)
	}
}

// checkFile reads a timestamp from a file and exits 1 if it is older than the
// given max age. Usage: /healthcheck file <path> <max_age_seconds>
func checkFile() {
	if len(os.Args) < 4 {
		fmt.Fprintln(os.Stderr, "healthcheck file: usage: /healthcheck file <path> <max_age_seconds>")
		os.Exit(1)
	}
	path := os.Args[2]
	maxAgeSec, err := strconv.ParseInt(os.Args[3], 10, 64)
	if err != nil {
		fmt.Fprintf(os.Stderr, "healthcheck file: invalid max_age_seconds %q: %v\n", os.Args[3], err)
		os.Exit(1)
	}

	data, err := os.ReadFile(path)
	if err != nil {
		fmt.Fprintf(os.Stderr, "healthcheck file: cannot read %s: %v\n", path, err)
		os.Exit(1)
	}

	ts, err := time.Parse(time.RFC3339, string(data))
	if err != nil {
		// Fallback: use file mtime if content is not a valid timestamp.
		info, statErr := os.Stat(path)
		if statErr != nil {
			fmt.Fprintf(os.Stderr, "healthcheck file: cannot stat %s: %v\n", path, statErr)
			os.Exit(1)
		}
		ts = info.ModTime()
	}

	age := time.Since(ts)
	if age > time.Duration(maxAgeSec)*time.Second {
		fmt.Fprintf(os.Stderr, "healthcheck file: %s is %.0fs old (max %ds)\n", path, age.Seconds(), maxAgeSec)
		os.Exit(1)
	}
}
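
// Illustrative Dockerfile wiring (a sketch; the intervals, timeouts, and retry
// counts below are assumptions, not taken from this repository):
//
//	HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
//	  CMD ["/healthcheck", "http://localhost:8080/health"]
//
// and, for the runner's file-liveness mode:
//
//	HEALTHCHECK --interval=60s CMD ["/healthcheck", "file", "/tmp/runner.alive", "120"]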
173
backend/cmd/runner/main.go
Normal file
@@ -0,0 +1,173 @@
// Command runner is the homelab worker binary.
//
// It polls PocketBase for pending scrape and audio tasks, executes them, and
// writes results back. It connects directly to PocketBase and MinIO using
// admin credentials loaded from environment variables.
//
// Usage:
//
//	runner    # start polling loop (blocks until SIGINT/SIGTERM)
package main

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"os/signal"
	"runtime"
	"syscall"
	"time"

	"github.com/getsentry/sentry-go"
	"github.com/libnovel/backend/internal/browser"
	"github.com/libnovel/backend/internal/config"
	"github.com/libnovel/backend/internal/kokoro"
	"github.com/libnovel/backend/internal/meili"
	"github.com/libnovel/backend/internal/novelfire"
	"github.com/libnovel/backend/internal/runner"
	"github.com/libnovel/backend/internal/storage"
)

// version and commit are set at build time via -ldflags.
var (
	version = "dev"
	commit  = "unknown"
)

func main() {
	if err := run(); err != nil {
		fmt.Fprintf(os.Stderr, "runner: fatal: %v\n", err)
		os.Exit(1)
	}
}

func run() error {
	cfg := config.Load()

	// ── Sentry / GlitchTip error tracking ────────────────────────────────────
	if dsn := os.Getenv("GLITCHTIP_DSN"); dsn != "" {
		if err := sentry.Init(sentry.ClientOptions{
			Dsn:              dsn,
			Release:          version + "@" + commit,
			TracesSampleRate: 0.1,
		}); err != nil {
			fmt.Fprintf(os.Stderr, "runner: sentry init warning: %v\n", err)
		} else {
			defer sentry.Flush(2 * time.Second)
		}
	}

	// ── Logger ──────────────────────────────────────────────────────────────
	log := buildLogger(cfg.LogLevel)
	log.Info("runner starting",
		"version", version,
		"commit", commit,
		"worker_id", cfg.Runner.WorkerID,
	)

	// ── Context: cancel on SIGINT / SIGTERM ─────────────────────────────────
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()

	// ── Storage ─────────────────────────────────────────────────────────────
	store, err := storage.NewStore(ctx, cfg, log)
	if err != nil {
		return fmt.Errorf("init storage: %w", err)
	}

	// ── Browser / Scraper ───────────────────────────────────────────────────
	workers := cfg.Runner.Workers
	if workers <= 0 {
		workers = runtime.NumCPU()
	}
	timeout := cfg.Runner.Timeout
	if timeout <= 0 {
		timeout = 90 * time.Second
	}

	browserClient := browser.NewDirectClient(browser.Config{
		MaxConcurrent: workers,
		Timeout:       timeout,
	})
	novel := novelfire.New(browserClient, log)

	// ── Kokoro ──────────────────────────────────────────────────────────────
	var kokoroClient kokoro.Client
	if cfg.Kokoro.URL != "" {
		kokoroClient = kokoro.New(cfg.Kokoro.URL)
		log.Info("kokoro TTS enabled", "url", cfg.Kokoro.URL)
	} else {
		log.Warn("KOKORO_URL not set — audio tasks will fail")
		kokoroClient = &noopKokoro{}
	}

	// ── Meilisearch ─────────────────────────────────────────────────────────
	var searchIndex meili.Client
	if cfg.Meilisearch.URL != "" {
		if err := meili.Configure(cfg.Meilisearch.URL, cfg.Meilisearch.APIKey); err != nil {
			log.Warn("meilisearch configure failed — search indexing disabled", "err", err)
			searchIndex = meili.NoopClient{}
		} else {
			searchIndex = meili.New(cfg.Meilisearch.URL, cfg.Meilisearch.APIKey)
			log.Info("meilisearch enabled", "url", cfg.Meilisearch.URL)
		}
	} else {
		log.Info("MEILI_URL not set — search indexing disabled")
		searchIndex = meili.NoopClient{}
	}

	// ── Runner ──────────────────────────────────────────────────────────────
	rCfg := runner.Config{
		WorkerID:                    cfg.Runner.WorkerID,
		PollInterval:                cfg.Runner.PollInterval,
		MaxConcurrentScrape:         cfg.Runner.MaxConcurrentScrape,
		MaxConcurrentAudio:          cfg.Runner.MaxConcurrentAudio,
		OrchestratorWorkers:         workers,
		MetricsAddr:                 cfg.Runner.MetricsAddr,
		CatalogueRefreshInterval:    cfg.Runner.CatalogueRefreshInterval,
		SkipInitialCatalogueRefresh: cfg.Runner.SkipInitialCatalogueRefresh,
	}
	deps := runner.Dependencies{
		Consumer:    store,
		BookWriter:  store,
		BookReader:  store,
		AudioStore:  store,
		CoverStore:  store,
		SearchIndex: searchIndex,
		Novel:       novel,
		Kokoro:      kokoroClient,
		Log:         log,
	}
	r := runner.New(rCfg, deps)

	return r.Run(ctx)
}

// ── Helpers ───────────────────────────────────────────────────────────────────

func buildLogger(level string) *slog.Logger {
	var lvl slog.Level
	switch level {
	case "debug":
		lvl = slog.LevelDebug
	case "warn":
		lvl = slog.LevelWarn
	case "error":
		lvl = slog.LevelError
	default:
		lvl = slog.LevelInfo
	}
	return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: lvl}))
}

// noopKokoro is a no-op implementation used when KOKORO_URL is not set.
type noopKokoro struct{}

func (n *noopKokoro) GenerateAudio(_ context.Context, _, _ string) ([]byte, error) {
	return nil, fmt.Errorf("kokoro not configured (KOKORO_URL is empty)")
}

func (n *noopKokoro) ListVoices(_ context.Context) ([]string, error) {
	return nil, nil
}
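
// Compile-time interface check (an added sketch, not in the original file):
// keeps noopKokoro in sync with kokoro.Client if the interface ever grows.
var _ kokoro.Client = (*noopKokoro)(nil)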
37
backend/go.mod
Normal file
@@ -0,0 +1,37 @@
module github.com/libnovel/backend

go 1.26.1

require (
	github.com/minio/minio-go/v7 v7.0.98
	golang.org/x/net v0.51.0
)

require (
	github.com/andybalholm/brotli v1.1.1 // indirect
	github.com/cespare/xxhash/v2 v2.3.0 // indirect
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
	github.com/dustin/go-humanize v1.0.1 // indirect
	github.com/getsentry/sentry-go v0.43.0 // indirect
	github.com/go-ini/ini v1.67.0 // indirect
	github.com/golang-jwt/jwt/v5 v5.3.1 // indirect
	github.com/google/uuid v1.6.0 // indirect
	github.com/klauspost/compress v1.18.2 // indirect
	github.com/klauspost/cpuid/v2 v2.2.11 // indirect
	github.com/klauspost/crc32 v1.3.0 // indirect
	github.com/meilisearch/meilisearch-go v0.36.1 // indirect
	github.com/minio/crc64nvme v1.1.1 // indirect
	github.com/minio/md5-simd v1.1.2 // indirect
	github.com/philhofer/fwd v1.2.0 // indirect
	github.com/pmezard/go-difflib v1.0.0 // indirect
	github.com/redis/go-redis/v9 v9.18.0 // indirect
	github.com/rs/xid v1.6.0 // indirect
	github.com/tinylib/msgp v1.6.1 // indirect
	go.uber.org/atomic v1.11.0 // indirect
	go.yaml.in/yaml/v3 v3.0.4 // indirect
	golang.org/x/crypto v0.48.0 // indirect
	golang.org/x/sys v0.41.0 // indirect
	golang.org/x/text v0.34.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
)
64
backend/go.sum
Normal file
@@ -0,0 +1,64 @@
github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA=
github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/getsentry/sentry-go v0.43.0 h1:XbXLpFicpo8HmBDaInk7dum18G9KSLcjZiyUKS+hLW4=
github.com/getsentry/sentry-go v0.43.0/go.mod h1:XDotiNZbgf5U8bPDUAfvcFmOnMQQceESxyKaObSssW0=
github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A=
github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8=
github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY=
github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk=
github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/klauspost/cpuid/v2 v2.0.1/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.11 h1:0OwqZRYI2rFrjS4kvkDnqJkKHdHaRnCm68/DY4OxRzU=
github.com/klauspost/cpuid/v2 v2.2.11/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
github.com/klauspost/crc32 v1.3.0 h1:sSmTt3gUt81RP655XGZPElI0PelVTZ6YwCRnPSupoFM=
github.com/klauspost/crc32 v1.3.0/go.mod h1:D7kQaZhnkX/Y0tstFGf8VUzv2UofNGqCjnC3zdHB0Hw=
github.com/meilisearch/meilisearch-go v0.36.1 h1:mJTCJE5g7tRvaqKco6DfqOuJEjX+rRltDEnkEC02Y0M=
github.com/meilisearch/meilisearch-go v0.36.1/go.mod h1:hWcR0MuWLSzHfbz9GGzIr3s9rnXLm1jqkmHkJPbUSvM=
github.com/minio/crc64nvme v1.1.1 h1:8dwx/Pz49suywbO+auHCBpCtlW1OfpcLN7wYgVR6wAI=
github.com/minio/crc64nvme v1.1.1/go.mod h1:eVfm2fAzLlxMdUGc0EEBGSMmPwmXD5XiNRpnu9J3bvg=
github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=
github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM=
github.com/minio/minio-go/v7 v7.0.98 h1:MeAVKjLVz+XJ28zFcuYyImNSAh8Mq725uNW4beRisi0=
github.com/minio/minio-go/v7 v7.0.98/go.mod h1:cY0Y+W7yozf0mdIclrttzo1Iiu7mEf9y7nk2uXqMOvM=
github.com/philhofer/fwd v1.2.0 h1:e6DnBTl7vGY+Gz322/ASL4Gyp1FspeMvx1RNDoToZuM=
github.com/philhofer/fwd v1.2.0/go.mod h1:RqIHx9QI14HlwKwm98g9Re5prTQ6LdeRQn+gXJFxsJM=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/redis/go-redis/v9 v9.18.0 h1:pMkxYPkEbMPwRdenAzUNyFNrDgHx9U+DrBabWNfSRQs=
github.com/redis/go-redis/v9 v9.18.0/go.mod h1:k3ufPphLU5YXwNTUcCRXGxUoF1fqxnhFQmscfkCoDA0=
github.com/rs/xid v1.6.0 h1:fV591PaemRlL6JfRxGDEPl69wICngIQ3shQtzfy2gxU=
github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/tinylib/msgp v1.6.1 h1:ESRv8eL3u+DNHUoSAAQRE50Hm162zqAnBoGv9PzScPY=
github.com/tinylib/msgp v1.6.1/go.mod h1:RSp0LW9oSxFut3KzESt5Voq4GVWyS+PSulT77roAqEA=
github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk=
golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
1178
backend/internal/backend/handlers.go
Normal file
File diff suppressed because it is too large
303
backend/internal/backend/server.go
Normal file
@@ -0,0 +1,303 @@
// Package backend implements the HTTP API server for the LibNovel backend.
//
// The server exposes all endpoints consumed by the SvelteKit UI:
// - Book/chapter reads from PocketBase/MinIO via bookstore interfaces
// - Task creation (scrape + audio) via taskqueue.Producer — the runner binary
//   picks up and executes those tasks asynchronously
// - Presigned MinIO URLs for media playback/upload
// - Session-scoped reading progress
// - Live novelfire.net search (no scraper interface needed; direct HTTP)
// - Kokoro voice list
//
// The backend never scrapes directly. All scraping (metadata, chapter list,
// chapter text, audio TTS) is delegated to the runner binary via PocketBase
// task records. GET /api/book-preview enqueues a task when the book is absent.
//
// All external dependencies are injected as interfaces; concrete types live in
// internal/storage and are wired by cmd/backend/main.go.
package backend

import (
	"context"
	"crypto/rand"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"log/slog"
	"net/http"
	"sync"
	"time"

	sentryhttp "github.com/getsentry/sentry-go/http"
	"github.com/libnovel/backend/internal/bookstore"
	"github.com/libnovel/backend/internal/kokoro"
	"github.com/libnovel/backend/internal/meili"
	"github.com/libnovel/backend/internal/taskqueue"
)

// Dependencies holds all external services the backend server depends on.
// Every field is an interface so test doubles can be injected freely.
type Dependencies struct {
	// BookReader reads book metadata and chapter text from PocketBase/MinIO.
	BookReader bookstore.BookReader
	// RankingStore reads ranking data from PocketBase.
	RankingStore bookstore.RankingStore
	// AudioStore checks audio object existence and computes MinIO keys.
	AudioStore bookstore.AudioStore
	// PresignStore generates short-lived MinIO URLs.
	PresignStore bookstore.PresignStore
	// ProgressStore reads/writes per-session reading progress.
	ProgressStore bookstore.ProgressStore
	// CoverStore reads and writes book cover images from MinIO.
	// If nil, the cover endpoint falls back to a CDN redirect.
	CoverStore bookstore.CoverStore
	// Producer creates scrape/audio tasks in PocketBase.
	Producer taskqueue.Producer
	// TaskReader reads scrape/audio task records from PocketBase.
	TaskReader taskqueue.Reader
	// SearchIndex provides full-text book search via Meilisearch.
	// If nil, the local-only fallback search is used.
	SearchIndex meili.Client
	// Kokoro is the TTS client (used for voice list only in the backend;
	// audio generation is done by the runner).
	Kokoro kokoro.Client
	// Log is the structured logger.
	Log *slog.Logger
}

// Config holds HTTP server tuning parameters.
type Config struct {
	// Addr is the listen address, e.g. ":8080".
	Addr string
	// DefaultVoice is used when no voice is specified in audio requests.
	DefaultVoice string
	// Version and Commit are embedded in /health and /api/version responses.
	Version string
	Commit  string
}

// Server is the HTTP API server.
type Server struct {
	cfg  Config
	deps Dependencies

	// voiceMu guards cachedVoices. Populated lazily on first GET /api/voices.
	voiceMu      sync.RWMutex
	cachedVoices []string
}

// New creates a Server from cfg and deps.
func New(cfg Config, deps Dependencies) *Server {
	if cfg.DefaultVoice == "" {
		cfg.DefaultVoice = "af_bella"
	}
	if deps.Log == nil {
		deps.Log = slog.Default()
	}
	if deps.SearchIndex == nil {
		deps.SearchIndex = meili.NoopClient{}
	}
	return &Server{cfg: cfg, deps: deps}
}

// ListenAndServe registers all routes and starts the HTTP server.
// It blocks until ctx is cancelled, then performs a graceful shutdown.
func (s *Server) ListenAndServe(ctx context.Context) error {
	mux := http.NewServeMux()

	// Health / version
	mux.HandleFunc("GET /health", s.handleHealth)
	mux.HandleFunc("GET /api/version", s.handleVersion)

	// Scrape task creation (202 Accepted — runner executes asynchronously)
	mux.HandleFunc("POST /scrape", s.handleScrapeCatalogue)
	mux.HandleFunc("POST /scrape/book", s.handleScrapeBook)
	mux.HandleFunc("POST /scrape/book/range", s.handleScrapeBookRange)

	// Scrape task status / history
	mux.HandleFunc("GET /api/scrape/status", s.handleScrapeStatus)
	mux.HandleFunc("GET /api/scrape/tasks", s.handleScrapeTasks)

	// Cancel a pending task (scrape or audio)
	mux.HandleFunc("POST /api/cancel-task/{id}", s.handleCancelTask)

	// Browse & search
	mux.HandleFunc("GET /api/search", s.handleSearch)

	// Catalogue (Meilisearch-backed browse + search — preferred path for UI)
	mux.HandleFunc("GET /api/catalogue", s.handleCatalogue)

	// Ranking (from PocketBase)
	mux.HandleFunc("GET /api/ranking", s.handleGetRanking)

	// Cover proxy (live URL redirect)
	mux.HandleFunc("GET /api/cover/{domain}/{slug}", s.handleGetCover)

	// Book preview (enqueues scrape task if not in library; returns stored data if already scraped)
	mux.HandleFunc("GET /api/book-preview/{slug}", s.handleBookPreview)

	// Chapter text (served from MinIO via PocketBase index)
	mux.HandleFunc("GET /api/chapter-text/{slug}/{n}", s.handleChapterText)
	// Raw markdown chapter content — served directly from MinIO by the backend.
	// Use this instead of presign+fetch to avoid SvelteKit→MinIO network path.
	mux.HandleFunc("GET /api/chapter-markdown/{slug}/{n}", s.handleChapterMarkdown)

	// Chapter text preview — live scrape from novelfire.net, no store writes.
	// Used when the chapter is not yet in the library (preview mode).
	mux.HandleFunc("GET /api/chapter-text-preview/{slug}/{n}", s.handleChapterTextPreview)

	// Reindex chapters_idx from MinIO
	mux.HandleFunc("POST /api/reindex/{slug}", s.handleReindex)

	// Audio task creation (backend creates task; runner executes)
	mux.HandleFunc("POST /api/audio/{slug}/{n}", s.handleAudioGenerate)
	mux.HandleFunc("GET /api/audio/status/{slug}/{n}", s.handleAudioStatus)
	mux.HandleFunc("GET /api/audio-proxy/{slug}/{n}", s.handleAudioProxy)

	// Voices list
	mux.HandleFunc("GET /api/voices", s.handleVoices)

	// Presigned URLs
	mux.HandleFunc("GET /api/presign/chapter/{slug}/{n}", s.handlePresignChapter)
	mux.HandleFunc("GET /api/presign/audio/{slug}/{n}", s.handlePresignAudio)
	mux.HandleFunc("GET /api/presign/voice-sample/{voice}", s.handlePresignVoiceSample)
	mux.HandleFunc("GET /api/presign/avatar-upload/{userId}", s.handlePresignAvatarUpload)
	mux.HandleFunc("GET /api/presign/avatar/{userId}", s.handlePresignAvatar)
	mux.HandleFunc("PUT /api/avatar-upload/{userId}", s.handleAvatarUpload)

	// Reading progress
	mux.HandleFunc("GET /api/progress", s.handleGetProgress)
	mux.HandleFunc("POST /api/progress/{slug}", s.handleSetProgress)
	mux.HandleFunc("DELETE /api/progress/{slug}", s.handleDeleteProgress)

	srv := &http.Server{
		Addr:         s.cfg.Addr,
		Handler:      sentryhttp.New(sentryhttp.Options{Repanic: true}).Handle(mux),
		ReadTimeout:  15 * time.Second,
		WriteTimeout: 60 * time.Second,
		IdleTimeout:  60 * time.Second,
	}

	errCh := make(chan error, 1)
	go func() { errCh <- srv.ListenAndServe() }()
	s.deps.Log.Info("backend: HTTP server listening", "addr", s.cfg.Addr)

	select {
	case <-ctx.Done():
		s.deps.Log.Info("backend: context cancelled, starting graceful shutdown")
		shutCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()
		if err := srv.Shutdown(shutCtx); err != nil {
			s.deps.Log.Error("backend: graceful shutdown failed", "err", err)
			return err
		}
		s.deps.Log.Info("backend: shutdown complete")
		return nil
	case err := <-errCh:
		return err
	}
}

// ── Session cookie helpers ─────────────────────────────────────────────────────

const sessionCookieName = "libnovel_session"

func sessionID(r *http.Request) string {
	c, err := r.Cookie(sessionCookieName)
	if err != nil {
		return ""
	}
	return c.Value
}

func newSessionID() (string, error) {
	b := make([]byte, 16)
	if _, err := rand.Read(b); err != nil {
		return "", err
	}
	return hex.EncodeToString(b), nil
}

func ensureSession(w http.ResponseWriter, r *http.Request) string {
	if id := sessionID(r); id != "" {
		return id
	}
	id, err := newSessionID()
	if err != nil {
		id = fmt.Sprintf("fallback-%d", time.Now().UnixNano())
	}
	http.SetCookie(w, &http.Cookie{
		Name:     sessionCookieName,
		Value:    id,
		Path:     "/",
		HttpOnly: true,
		SameSite: http.SameSiteLaxMode,
		MaxAge:   365 * 24 * 60 * 60,
	})
	return id
}

// ── Utility helpers ────────────────────────────────────────────────────────────

// writeJSON writes v as a JSON response with status code. Status 0 → 200.
func writeJSON(w http.ResponseWriter, status int, v any) {
	w.Header().Set("Content-Type", "application/json")
	if status != 0 {
		w.WriteHeader(status)
	}
	_ = json.NewEncoder(w).Encode(v)
}

// jsonError writes a JSON error body and the given status code.
func jsonError(w http.ResponseWriter, status int, msg string) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	_ = json.NewEncoder(w).Encode(map[string]string{"error": msg})
}

// voices returns the list of available Kokoro voices. On the first call it
// fetches from the Kokoro service and caches the result. Falls back to the
// hardcoded list on error.
func (s *Server) voices(ctx context.Context) []string {
	s.voiceMu.RLock()
	cached := s.cachedVoices
	s.voiceMu.RUnlock()
	if len(cached) > 0 {
		return cached
	}

	if s.deps.Kokoro == nil {
		return kokoroVoices
	}

	fetchCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()
	list, err := s.deps.Kokoro.ListVoices(fetchCtx)
	if err != nil || len(list) == 0 {
		s.deps.Log.Warn("backend: could not fetch kokoro voices, using built-in list", "err", err)
		return kokoroVoices
	}

	s.voiceMu.Lock()
	s.cachedVoices = list
	s.voiceMu.Unlock()
	s.deps.Log.Info("backend: fetched kokoro voices", "count", len(list))
	return list
}

// handleHealth handles GET /health.
func (s *Server) handleHealth(w http.ResponseWriter, _ *http.Request) {
	writeJSON(w, 0, map[string]string{
		"status":  "ok",
		"version": s.cfg.Version,
		"commit":  s.cfg.Commit,
	})
}

// handleVersion handles GET /api/version.
func (s *Server) handleVersion(w http.ResponseWriter, _ *http.Request) {
	writeJSON(w, 0, map[string]string{
		"version": s.cfg.Version,
		"commit":  s.cfg.Commit,
	})
}
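
// Wiring sketch (illustrative only; the variable names store, queue, and log
// are assumptions, and the real wiring lives in cmd/backend/main.go):
//
//	srv := backend.New(
//	    backend.Config{Addr: ":8080", Version: version, Commit: commit},
//	    backend.Dependencies{BookReader: store, Producer: queue, Log: log},
//	)
//	return srv.ListenAndServe(ctx)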
143
backend/internal/bookstore/bookstore.go
Normal file
@@ -0,0 +1,143 @@
// Package bookstore defines the segregated read/write interfaces for book,
// chapter, ranking, progress, audio, and presign data.
//
// Interface segregation:
// - BookWriter — used by the runner to persist scraped data.
// - BookReader — used by the backend to serve book/chapter data.
// - RankingStore — used by both runner (write) and backend (read).
// - PresignStore — used only by the backend for URL signing.
// - AudioStore — used by the runner to store audio; backend for presign.
// - ProgressStore — used only by the backend for reading progress.
// - CoverStore — written by the runner during catalogue refresh; read by the backend.
//
// Concrete implementations live in internal/storage.
package bookstore

import (
	"context"
	"time"

	"github.com/libnovel/backend/internal/domain"
)

// BookWriter is the write side used by the runner after scraping a book.
type BookWriter interface {
	// WriteMetadata upserts all bibliographic fields for a book.
	WriteMetadata(ctx context.Context, meta domain.BookMeta) error

	// WriteChapter stores a fully-scraped chapter's text in MinIO and
	// updates the chapters_idx record in PocketBase.
	WriteChapter(ctx context.Context, slug string, chapter domain.Chapter) error

	// WriteChapterRefs persists chapter metadata (number + title) into
	// chapters_idx without fetching or storing chapter text.
	WriteChapterRefs(ctx context.Context, slug string, refs []domain.ChapterRef) error

	// ChapterExists returns true if the markdown object for ref already exists.
	ChapterExists(ctx context.Context, slug string, ref domain.ChapterRef) bool
}

// BookReader is the read side used by the backend to serve content.
type BookReader interface {
	// ReadMetadata returns the metadata for slug.
	// Returns (zero, false, nil) when not found.
	ReadMetadata(ctx context.Context, slug string) (domain.BookMeta, bool, error)

	// ListBooks returns all books sorted alphabetically by title.
	ListBooks(ctx context.Context) ([]domain.BookMeta, error)

	// LocalSlugs returns the set of slugs that have metadata stored.
	LocalSlugs(ctx context.Context) (map[string]bool, error)

	// MetadataMtime returns the Unix-second mtime of the metadata record, or 0.
	MetadataMtime(ctx context.Context, slug string) int64

	// ReadChapter returns the raw markdown for chapter number n.
	ReadChapter(ctx context.Context, slug string, n int) (string, error)

	// ListChapters returns all stored chapters for slug, sorted by number.
	ListChapters(ctx context.Context, slug string) ([]domain.ChapterInfo, error)

	// CountChapters returns the count of stored chapters.
	CountChapters(ctx context.Context, slug string) int

	// ReindexChapters rebuilds chapters_idx from MinIO objects for slug.
	ReindexChapters(ctx context.Context, slug string) (int, error)
}

// RankingStore covers ranking reads and writes.
type RankingStore interface {
	// WriteRankingItem upserts a single ranking entry (keyed on Slug).
	WriteRankingItem(ctx context.Context, item domain.RankingItem) error

	// ReadRankingItems returns all ranking items sorted by rank ascending.
	ReadRankingItems(ctx context.Context) ([]domain.RankingItem, error)

	// RankingFreshEnough returns true when ranking rows exist and the most
	// recent Updated timestamp is within maxAge.
	RankingFreshEnough(ctx context.Context, maxAge time.Duration) (bool, error)
}

// AudioStore covers audio object storage (runner writes; backend reads).
type AudioStore interface {
	// AudioObjectKey returns the MinIO object key for a cached audio file.
	AudioObjectKey(slug string, n int, voice string) string

	// AudioExists returns true when the audio object is present in MinIO.
	AudioExists(ctx context.Context, key string) bool

	// PutAudio stores raw audio bytes under the given MinIO object key.
	PutAudio(ctx context.Context, key string, data []byte) error
}

// PresignStore generates short-lived URLs — used exclusively by the backend.
type PresignStore interface {
	// PresignChapter returns a presigned GET URL for a chapter markdown object.
	PresignChapter(ctx context.Context, slug string, n int, expires time.Duration) (string, error)

	// PresignAudio returns a presigned GET URL for an audio object.
	PresignAudio(ctx context.Context, key string, expires time.Duration) (string, error)

	// PresignAvatarUpload returns a short-lived presigned PUT URL for uploading
	// an avatar image. ext should be "jpg", "png", or "webp".
	PresignAvatarUpload(ctx context.Context, userID, ext string) (uploadURL, key string, err error)

	// PresignAvatarURL returns a presigned GET URL for a user's avatar.
	// Returns ("", false, nil) when no avatar exists.
	PresignAvatarURL(ctx context.Context, userID string) (string, bool, error)

	// PutAvatar stores raw image bytes for a user avatar directly in MinIO.
	// ext should be "jpg", "png", or "webp". Returns the object key.
	PutAvatar(ctx context.Context, userID, ext, contentType string, data []byte) (key string, err error)

	// DeleteAvatar removes all avatar objects for a user.
	DeleteAvatar(ctx context.Context, userID string) error
}

// ProgressStore covers per-session reading progress — backend only.
type ProgressStore interface {
	// GetProgress returns the reading progress for the given session + slug.
	GetProgress(ctx context.Context, sessionID, slug string) (domain.ReadingProgress, bool)

	// SetProgress saves or updates reading progress.
	SetProgress(ctx context.Context, sessionID string, p domain.ReadingProgress) error

	// AllProgress returns all progress entries for a session.
	AllProgress(ctx context.Context, sessionID string) ([]domain.ReadingProgress, error)

	// DeleteProgress removes progress for a specific slug.
	DeleteProgress(ctx context.Context, sessionID, slug string) error
}

// CoverStore covers book cover image storage in MinIO.
// The runner writes covers during catalogue refresh; the backend reads them.
type CoverStore interface {
	// PutCover stores a raw cover image for a book identified by slug.
	PutCover(ctx context.Context, slug string, data []byte, contentType string) error

	// GetCover retrieves the cover image for a book. Returns (nil, false, nil)
	// when no cover exists for the given slug.
	GetCover(ctx context.Context, slug string) ([]byte, string, bool, error)

	// CoverExists returns true when a cover image is stored for slug.
	CoverExists(ctx context.Context, slug string) bool
}
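
// Usage sketch (illustrative; w, refs, and the surrounding scrape loop are
// assumptions, not code from this repository):
//
//	for _, ref := range refs {
//	    if w.ChapterExists(ctx, slug, ref) {
//	        continue // markdown object already stored; skip re-scraping
//	    }
//	    // ...fetch the chapter text, then w.WriteChapter(ctx, slug, chapter)
//	}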
141
backend/internal/bookstore/bookstore_test.go
Normal file
@@ -0,0 +1,141 @@
package bookstore_test

import (
	"context"
	"testing"
	"time"

	"github.com/libnovel/backend/internal/bookstore"
	"github.com/libnovel/backend/internal/domain"
)

// ── Mock that satisfies all bookstore interfaces ──────────────────────────────

type mockStore struct{}

// BookWriter
func (m *mockStore) WriteMetadata(_ context.Context, _ domain.BookMeta) error         { return nil }
func (m *mockStore) WriteChapter(_ context.Context, _ string, _ domain.Chapter) error { return nil }
func (m *mockStore) WriteChapterRefs(_ context.Context, _ string, _ []domain.ChapterRef) error {
	return nil
}
func (m *mockStore) ChapterExists(_ context.Context, _ string, _ domain.ChapterRef) bool {
	return false
}

// BookReader
func (m *mockStore) ReadMetadata(_ context.Context, _ string) (domain.BookMeta, bool, error) {
	return domain.BookMeta{}, false, nil
}
func (m *mockStore) ListBooks(_ context.Context) ([]domain.BookMeta, error) { return nil, nil }
func (m *mockStore) LocalSlugs(_ context.Context) (map[string]bool, error) {
	return map[string]bool{}, nil
}
func (m *mockStore) MetadataMtime(_ context.Context, _ string) int64 { return 0 }
func (m *mockStore) ReadChapter(_ context.Context, _ string, _ int) (string, error) {
	return "", nil
}
func (m *mockStore) ListChapters(_ context.Context, _ string) ([]domain.ChapterInfo, error) {
	return nil, nil
}
func (m *mockStore) CountChapters(_ context.Context, _ string) int            { return 0 }
func (m *mockStore) ReindexChapters(_ context.Context, _ string) (int, error) { return 0, nil }

// RankingStore
func (m *mockStore) WriteRankingItem(_ context.Context, _ domain.RankingItem) error { return nil }
func (m *mockStore) ReadRankingItems(_ context.Context) ([]domain.RankingItem, error) {
	return nil, nil
}
func (m *mockStore) RankingFreshEnough(_ context.Context, _ time.Duration) (bool, error) {
	return false, nil
}

// AudioStore
func (m *mockStore) AudioObjectKey(_ string, _ int, _ string) string      { return "" }
func (m *mockStore) AudioExists(_ context.Context, _ string) bool         { return false }
func (m *mockStore) PutAudio(_ context.Context, _ string, _ []byte) error { return nil }

// PresignStore
func (m *mockStore) PresignChapter(_ context.Context, _ string, _ int, _ time.Duration) (string, error) {
	return "", nil
}
func (m *mockStore) PresignAudio(_ context.Context, _ string, _ time.Duration) (string, error) {
	return "", nil
}
func (m *mockStore) PresignAvatarUpload(_ context.Context, _, _ string) (string, string, error) {
	return "", "", nil
}
func (m *mockStore) PresignAvatarURL(_ context.Context, _ string) (string, bool, error) {
	return "", false, nil
}
func (m *mockStore) PutAvatar(_ context.Context, _, _, _ string, _ []byte) (string, error) {
	return "", nil
}
func (m *mockStore) DeleteAvatar(_ context.Context, _ string) error { return nil }

// ProgressStore
func (m *mockStore) GetProgress(_ context.Context, _, _ string) (domain.ReadingProgress, bool) {
	return domain.ReadingProgress{}, false
}
func (m *mockStore) SetProgress(_ context.Context, _ string, _ domain.ReadingProgress) error {
	return nil
}
func (m *mockStore) AllProgress(_ context.Context, _ string) ([]domain.ReadingProgress, error) {
	return nil, nil
}
func (m *mockStore) DeleteProgress(_ context.Context, _, _ string) error { return nil }

// ── Compile-time interface satisfaction ───────────────────────────────────────

var _ bookstore.BookWriter = (*mockStore)(nil)
var _ bookstore.BookReader = (*mockStore)(nil)
var _ bookstore.RankingStore = (*mockStore)(nil)
var _ bookstore.AudioStore = (*mockStore)(nil)
var _ bookstore.PresignStore = (*mockStore)(nil)
var _ bookstore.ProgressStore = (*mockStore)(nil)

// ── Behavioural tests ─────────────────────────────────────────────────────────

func TestBookWriter_WriteMetadata_ReturnsNilError(t *testing.T) {
	var w bookstore.BookWriter = &mockStore{}
	if err := w.WriteMetadata(context.Background(), domain.BookMeta{Slug: "test"}); err != nil {
		t.Errorf("unexpected error: %v", err)
	}
}

func TestBookReader_ReadMetadata_NotFound(t *testing.T) {
	var r bookstore.BookReader = &mockStore{}
	_, found, err := r.ReadMetadata(context.Background(), "unknown")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if found {
		t.Error("expected not found")
	}
}

func TestRankingStore_RankingFreshEnough_ReturnsFalse(t *testing.T) {
	var s bookstore.RankingStore = &mockStore{}
	fresh, err := s.RankingFreshEnough(context.Background(), time.Hour)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if fresh {
		t.Error("expected false")
	}
}

func TestAudioStore_AudioExists_ReturnsFalse(t *testing.T) {
	var s bookstore.AudioStore = &mockStore{}
	if s.AudioExists(context.Background(), "audio/slug/1/af_bella.mp3") {
		t.Error("expected false")
	}
}

func TestProgressStore_GetProgress_NotFound(t *testing.T) {
	var s bookstore.ProgressStore = &mockStore{}
	_, found := s.GetProgress(context.Background(), "session-1", "slug")
	if found {
		t.Error("expected not found")
	}
}
191
backend/internal/browser/browser.go
Normal file
@@ -0,0 +1,191 @@
// Package browser provides a rate-limited HTTP client for web scraping.
package browser

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strconv"
	"sync"
	"time"
)

// ErrRateLimit is the sentinel matched (via errors.Is) when the server
// responds with 429. The concrete *RateLimitError returned by GetContent
// carries the suggested retry delay (from the Retry-After header, or a default).
var ErrRateLimit = errors.New("rate limited (429)")

// RateLimitError wraps ErrRateLimit and carries the suggested wait duration.
type RateLimitError struct {
	// RetryAfter is how long the caller should wait before retrying.
	// Derived from the Retry-After response header when present; otherwise a default.
	RetryAfter time.Duration
}

func (e *RateLimitError) Error() string {
	return fmt.Sprintf("rate limited (429): retry after %s", e.RetryAfter)
}

func (e *RateLimitError) Is(target error) bool { return target == ErrRateLimit }

// defaultRateLimitDelay is used when the server returns 429 with no Retry-After header.
const defaultRateLimitDelay = 60 * time.Second
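
// Caller sketch (an added example, not part of the original file): match the
// sentinel with errors.Is, then read the delay via errors.As.
//
//	body, err := client.GetContent(ctx, url)
//	if errors.Is(err, ErrRateLimit) {
//	    var rl *RateLimitError
//	    if errors.As(err, &rl) {
//	        time.Sleep(rl.RetryAfter) // back off before retrying
//	    }
//	}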
// Client is the interface used by scrapers to fetch raw page HTML.
// Implementations must be safe for concurrent use.
type Client interface {
	// GetContent fetches the URL and returns the full response body as a string.
	// It should respect the provided context for cancellation and timeouts.
	GetContent(ctx context.Context, pageURL string) (string, error)
}

// Config holds tunable parameters for the direct HTTP client.
type Config struct {
	// MaxConcurrent limits the number of simultaneous in-flight requests.
	// Defaults to 5 when 0.
	MaxConcurrent int
	// Timeout is the per-request deadline. Defaults to 90s when 0.
	Timeout time.Duration
}

// DirectClient is a plain net/http-based Client with a concurrency semaphore.
type DirectClient struct {
	http      *http.Client
	semaphore chan struct{}
}

// NewDirectClient returns a DirectClient configured by cfg.
func NewDirectClient(cfg Config) *DirectClient {
	if cfg.MaxConcurrent <= 0 {
		cfg.MaxConcurrent = 5
	}
	if cfg.Timeout <= 0 {
		cfg.Timeout = 90 * time.Second
	}

	transport := &http.Transport{
		MaxIdleConnsPerHost: cfg.MaxConcurrent * 2,
		DisableCompression:  false,
	}

	return &DirectClient{
		http: &http.Client{
			Transport: transport,
			Timeout:   cfg.Timeout,
		},
		semaphore: make(chan struct{}, cfg.MaxConcurrent),
	}
}

// GetContent fetches pageURL respecting the concurrency limit.
func (c *DirectClient) GetContent(ctx context.Context, pageURL string) (string, error) {
	// Acquire semaphore slot.
	select {
	case c.semaphore <- struct{}{}:
	case <-ctx.Done():
		return "", ctx.Err()
	}
	defer func() { <-c.semaphore }()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, pageURL, nil)
	if err != nil {
		return "", fmt.Errorf("browser: build request %s: %w", pageURL, err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; libnovel-runner/2)")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept-Language", "en-US,en;q=0.5")

	resp, err := c.http.Do(req)
	if err != nil {
		return "", fmt.Errorf("browser: GET %s: %w", pageURL, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusTooManyRequests {
		delay := defaultRateLimitDelay
		if ra := resp.Header.Get("Retry-After"); ra != "" {
			if secs, err := strconv.Atoi(ra); err == nil && secs > 0 {
				delay = time.Duration(secs) * time.Second
			}
		}
		return "", &RateLimitError{RetryAfter: delay}
	}

	if resp.StatusCode >= 400 {
		return "", fmt.Errorf("browser: GET %s returned %d", pageURL, resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("browser: read body %s: %w", pageURL, err)
	}
	return string(body), nil
}

// Do implements httputil.Client so DirectClient can be passed to RetryGet.
func (c *DirectClient) Do(req *http.Request) (*http.Response, error) {
	select {
	case c.semaphore <- struct{}{}:
	case <-req.Context().Done():
		return nil, req.Context().Err()
	}
	defer func() { <-c.semaphore }()
	return c.http.Do(req)
}

// ── Stub for testing ──────────────────────────────────────────────────────────

// StubClient is a test double for Client. It returns pre-configured responses
// keyed on URL. Calls to unknown URLs return an error.
type StubClient struct {
	mu      sync.Mutex
	pages   map[string]string
	errors  map[string]error
	callLog []string
}

// NewStub creates a StubClient with no pages pre-loaded.
func NewStub() *StubClient {
	return &StubClient{
		pages:  make(map[string]string),
		errors: make(map[string]error),
	}
}

// SetPage registers a URL → HTML body mapping.
func (s *StubClient) SetPage(u, html string) {
	s.mu.Lock()
	s.pages[u] = html
	s.mu.Unlock()
}

// SetError registers a URL → error mapping (returned instead of a body).
func (s *StubClient) SetError(u string, err error) {
	s.mu.Lock()
	s.errors[u] = err
	s.mu.Unlock()
}

// CallLog returns the ordered list of URLs that were requested.
func (s *StubClient) CallLog() []string {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]string, len(s.callLog))
	copy(out, s.callLog)
	return out
}

// GetContent returns the registered page or an error for the URL.
func (s *StubClient) GetContent(_ context.Context, pageURL string) (string, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.callLog = append(s.callLog, pageURL)
	if err, ok := s.errors[pageURL]; ok {
		return "", err
	}
	if html, ok := s.pages[pageURL]; ok {
		return html, nil
	}
	return "", fmt.Errorf("stub: no page registered for %q", pageURL)
}
141
backend/internal/browser/browser_test.go
Normal file
@@ -0,0 +1,141 @@
package browser_test

import (
	"context"
	"errors"
	"net/http"
	"net/http/httptest"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/libnovel/backend/internal/browser"
)

func TestDirectClient_GetContent_Success(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("<html>hello</html>"))
	}))
	defer srv.Close()

	c := browser.NewDirectClient(browser.Config{MaxConcurrent: 2, Timeout: 5 * time.Second})
	body, err := c.GetContent(context.Background(), srv.URL)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if body != "<html>hello</html>" {
		t.Errorf("want <html>hello</html>, got %q", body)
	}
}

func TestDirectClient_GetContent_4xxReturnsError(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusNotFound)
	}))
	defer srv.Close()

	c := browser.NewDirectClient(browser.Config{})
	_, err := c.GetContent(context.Background(), srv.URL)
	if err == nil {
		t.Fatal("expected error for 404")
	}
}

func TestDirectClient_SemaphoreBlocksConcurrency(t *testing.T) {
	const maxConcurrent = 2
	var inflight atomic.Int32
	var peak atomic.Int32

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		n := inflight.Add(1)
		// Record the high-water mark with a CAS loop; a plain load-compare-store
		// could let a concurrent handler overwrite a higher peak with a lower one.
		for {
			p := peak.Load()
			if n <= p || peak.CompareAndSwap(p, n) {
				break
			}
		}
		time.Sleep(20 * time.Millisecond)
		inflight.Add(-1)
		w.Write([]byte("ok"))
	}))
	defer srv.Close()

	c := browser.NewDirectClient(browser.Config{MaxConcurrent: maxConcurrent, Timeout: 5 * time.Second})

	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			c.GetContent(context.Background(), srv.URL)
		}()
	}
	wg.Wait()

	if int(peak.Load()) > maxConcurrent {
		t.Errorf("concurrent requests exceeded limit: peak=%d, limit=%d", peak.Load(), maxConcurrent)
	}
}

func TestDirectClient_ContextCancel(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		time.Sleep(200 * time.Millisecond)
		w.Write([]byte("ok"))
	}))
	defer srv.Close()

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel before making the request

	c := browser.NewDirectClient(browser.Config{})
	_, err := c.GetContent(ctx, srv.URL)
	if err == nil {
		t.Fatal("expected context cancellation error")
	}
}

// ── StubClient ────────────────────────────────────────────────────────────────

func TestStubClient_ReturnsRegisteredPage(t *testing.T) {
	stub := browser.NewStub()
	stub.SetPage("http://example.com/page1", "<html>page1</html>")

	body, err := stub.GetContent(context.Background(), "http://example.com/page1")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if body != "<html>page1</html>" {
		t.Errorf("want page1 html, got %q", body)
	}
}

func TestStubClient_ReturnsRegisteredError(t *testing.T) {
	stub := browser.NewStub()
	want := errors.New("network failure")
	stub.SetError("http://example.com/bad", want)

	_, err := stub.GetContent(context.Background(), "http://example.com/bad")
	if !errors.Is(err, want) {
		t.Fatalf("expected %v, got %v", want, err)
	}
}

func TestStubClient_UnknownURLReturnsError(t *testing.T) {
	stub := browser.NewStub()
	_, err := stub.GetContent(context.Background(), "http://unknown.example.com/")
	if err == nil {
		t.Fatal("expected error for unknown URL")
	}
}

func TestStubClient_CallLog(t *testing.T) {
	stub := browser.NewStub()
	stub.SetPage("http://example.com/a", "a")
	stub.SetPage("http://example.com/b", "b")

	stub.GetContent(context.Background(), "http://example.com/a")
	stub.GetContent(context.Background(), "http://example.com/b")

	log := stub.CallLog()
	if len(log) != 2 || log[0] != "http://example.com/a" || log[1] != "http://example.com/b" {
		t.Errorf("unexpected call log: %v", log)
	}
}
225
backend/internal/config/config.go
Normal file
@@ -0,0 +1,225 @@
// Package config loads all service configuration from environment variables.
// Both the runner and backend binaries call config.Load() at startup; each
// uses only the sub-struct relevant to it.
//
// Every field has a documented default so the service starts sensibly without
// any environment configuration (useful for local development).
package config

import (
	"os"
	"strconv"
	"strings"
	"time"
)

// PocketBase holds connection settings for the remote PocketBase instance.
type PocketBase struct {
	// URL is the base URL of the PocketBase instance, e.g. https://pb.libnovel.cc
	URL string
	// AdminEmail is the admin account email used for API authentication.
	AdminEmail string
	// AdminPassword is the admin account password.
	AdminPassword string
}

// MinIO holds connection settings for the remote MinIO / S3-compatible store.
type MinIO struct {
	// Endpoint is the host:port of the MinIO S3 API, e.g. storage.libnovel.cc:443
	Endpoint string
	// PublicEndpoint is the browser-visible endpoint used for presigned URLs.
	// Falls back to Endpoint when empty.
	PublicEndpoint string
	// AccessKey is the MinIO access key.
	AccessKey string
	// SecretKey is the MinIO secret key.
	SecretKey string
	// UseSSL enables TLS for the internal MinIO connection.
	UseSSL bool
	// PublicUseSSL enables TLS for presigned URL generation.
	PublicUseSSL bool
	// BucketChapters is the bucket that holds chapter markdown objects.
	BucketChapters string
	// BucketAudio is the bucket that holds generated audio MP3 objects.
	BucketAudio string
	// BucketAvatars is the bucket that holds user avatar images.
	BucketAvatars string
	// BucketBrowse is the bucket that holds cached browse page snapshots (JSON).
	BucketBrowse string
}

// Kokoro holds connection settings for the Kokoro-FastAPI TTS service.
type Kokoro struct {
	// URL is the base URL of the Kokoro service, e.g. https://kokoro.libnovel.cc
	// An empty string disables TTS generation.
	URL string
	// DefaultVoice is the voice used when none is specified.
	DefaultVoice string
}

// HTTP holds settings for the HTTP server (backend only).
type HTTP struct {
	// Addr is the listen address, e.g. ":8080"
	Addr string
}

// Meilisearch holds connection settings for the Meilisearch full-text search service.
type Meilisearch struct {
	// URL is the base URL of the Meilisearch instance, e.g. http://localhost:7700
	// An empty string disables Meilisearch indexing and search.
	URL string
	// APIKey is the Meilisearch master/search API key.
	APIKey string
}

// Valkey holds connection settings for the Valkey/Redis presign URL cache.
type Valkey struct {
	// Addr is the host:port of the Valkey instance, e.g. localhost:6379
	// An empty string disables the Valkey cache (falls through to MinIO directly).
	Addr string
}

// Runner holds settings specific to the runner/worker binary.
type Runner struct {
	// PollInterval is how often the runner checks PocketBase for pending tasks.
	PollInterval time.Duration
	// MaxConcurrentScrape limits simultaneous book-scrape goroutines.
	MaxConcurrentScrape int
	// MaxConcurrentAudio limits simultaneous audio-generation goroutines.
	MaxConcurrentAudio int
	// WorkerID is a unique identifier for this runner instance.
	// Defaults to the system hostname.
	WorkerID string
	// Workers is the number of chapter-scraping goroutines per book.
	Workers int
	// Timeout is the per-request HTTP timeout for scraping.
	Timeout time.Duration
	// MetricsAddr is the listen address for the runner /metrics HTTP endpoint.
	// Defaults to ":9091". Set to "" to disable.
	MetricsAddr string
	// CatalogueRefreshInterval is how often the runner walks the full catalogue,
	// scrapes per-book metadata, downloads covers, and re-indexes in Meilisearch.
	// Defaults to 24h. Set to 0 to use the default.
	CatalogueRefreshInterval time.Duration
	// SkipInitialCatalogueRefresh prevents the runner from running a full
	// catalogue walk on startup. Useful for quick restarts where the catalogue
	// is already indexed and a 24h walk would be wasteful.
	// Controlled by RUNNER_SKIP_INITIAL_CATALOGUE_REFRESH=true.
	SkipInitialCatalogueRefresh bool
}

// Config is the top-level configuration struct consumed by both binaries.
type Config struct {
	PocketBase  PocketBase
	MinIO       MinIO
	Kokoro      Kokoro
	HTTP        HTTP
	Runner      Runner
	Meilisearch Meilisearch
	Valkey      Valkey
	// LogLevel is one of "debug", "info", "warn", "error".
	LogLevel string
}

// Load reads all configuration from environment variables and returns a
// populated Config. Missing variables fall back to documented defaults.
func Load() Config {
	workerID, _ := os.Hostname()
	if workerID == "" {
		workerID = "runner-default"
	}

	return Config{
		LogLevel: envOr("LOG_LEVEL", "info"),

		PocketBase: PocketBase{
			URL:           envOr("POCKETBASE_URL", "http://localhost:8090"),
			AdminEmail:    envOr("POCKETBASE_ADMIN_EMAIL", "admin@libnovel.local"),
			AdminPassword: envOr("POCKETBASE_ADMIN_PASSWORD", "changeme123"),
		},

		MinIO: MinIO{
			Endpoint:       envOr("MINIO_ENDPOINT", "localhost:9000"),
			PublicEndpoint: envOr("MINIO_PUBLIC_ENDPOINT", ""),
			AccessKey:      envOr("MINIO_ACCESS_KEY", "admin"),
			SecretKey:      envOr("MINIO_SECRET_KEY", "changeme123"),
			UseSSL:         envBool("MINIO_USE_SSL", false),
			PublicUseSSL:   envBool("MINIO_PUBLIC_USE_SSL", true),
			BucketChapters: envOr("MINIO_BUCKET_CHAPTERS", "chapters"),
			BucketAudio:    envOr("MINIO_BUCKET_AUDIO", "audio"),
			BucketAvatars:  envOr("MINIO_BUCKET_AVATARS", "avatars"),
			BucketBrowse:   envOr("MINIO_BUCKET_BROWSE", "catalogue"),
		},

		Kokoro: Kokoro{
			URL:          envOr("KOKORO_URL", ""),
			DefaultVoice: envOr("KOKORO_VOICE", "af_bella"),
		},

		HTTP: HTTP{
			Addr: envOr("BACKEND_HTTP_ADDR", ":8080"),
		},

		Runner: Runner{
			PollInterval:                envDuration("RUNNER_POLL_INTERVAL", 30*time.Second),
			MaxConcurrentScrape:         envInt("RUNNER_MAX_CONCURRENT_SCRAPE", 1),
			MaxConcurrentAudio:          envInt("RUNNER_MAX_CONCURRENT_AUDIO", 1),
			WorkerID:                    envOr("RUNNER_WORKER_ID", workerID),
			Workers:                     envInt("RUNNER_WORKERS", 0), // 0 → runtime.NumCPU()
			Timeout:                     envDuration("RUNNER_TIMEOUT", 90*time.Second),
			MetricsAddr:                 envOr("RUNNER_METRICS_ADDR", ":9091"),
			CatalogueRefreshInterval:    envDuration("RUNNER_CATALOGUE_REFRESH_INTERVAL", 0),
			SkipInitialCatalogueRefresh: envBool("RUNNER_SKIP_INITIAL_CATALOGUE_REFRESH", false),
		},

		Meilisearch: Meilisearch{
			URL:    envOr("MEILI_URL", ""),
			APIKey: envOr("MEILI_API_KEY", ""),
		},

		Valkey: Valkey{
			Addr: envOr("VALKEY_ADDR", ""),
		},
	}
}
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
func envOr(key, fallback string) string {
|
||||
if v := os.Getenv(key); v != "" {
|
||||
return v
|
||||
}
|
||||
return fallback
|
||||
}
|
||||
|
||||
func envBool(key string, fallback bool) bool {
|
||||
v := os.Getenv(key)
|
||||
if v == "" {
|
||||
return fallback
|
||||
}
|
||||
return strings.ToLower(v) == "true"
|
||||
}
|
||||
|
||||
func envInt(key string, fallback int) int {
|
||||
v := os.Getenv(key)
|
||||
if v == "" {
|
||||
return fallback
|
||||
}
|
||||
n, err := strconv.Atoi(v)
|
||||
if err != nil || n < 0 {
|
||||
return fallback
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func envDuration(key string, fallback time.Duration) time.Duration {
|
||||
v := os.Getenv(key)
|
||||
if v == "" {
|
||||
return fallback
|
||||
}
|
||||
d, err := time.ParseDuration(v)
|
||||
if err != nil {
|
||||
return fallback
|
||||
}
|
||||
return d
|
||||
}
|
||||
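A minimal usage sketch (not part of the diff; the env values are illustrative placeholders, not real deployment settings): both binaries call config.Load once at startup and hand sub-structs to their dependencies.

package main

import (
	"fmt"
	"os"

	"github.com/libnovel/backend/internal/config"
)

func main() {
	// Placeholder values for illustration only.
	os.Setenv("POCKETBASE_URL", "http://pb:8090")
	os.Setenv("RUNNER_POLL_INTERVAL", "15s") // parsed via time.ParseDuration in envDuration

	cfg := config.Load()
	fmt.Println(cfg.PocketBase.URL)      // http://pb:8090
	fmt.Println(cfg.Runner.PollInterval) // 15s
	fmt.Println(cfg.Runner.MetricsAddr)  // :9091 (documented default)
}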
backend/internal/config/config_test.go (new file)
@@ -0,0 +1,127 @@
package config_test

import (
	"os"
	"testing"
	"time"

	"github.com/libnovel/backend/internal/config"
)

func TestLoad_Defaults(t *testing.T) {
	// Unset all relevant vars so we test pure defaults.
	unset := []string{
		"LOG_LEVEL",
		"POCKETBASE_URL", "POCKETBASE_ADMIN_EMAIL", "POCKETBASE_ADMIN_PASSWORD",
		"MINIO_ENDPOINT", "MINIO_PUBLIC_ENDPOINT", "MINIO_ACCESS_KEY", "MINIO_SECRET_KEY",
		"MINIO_USE_SSL", "MINIO_PUBLIC_USE_SSL",
		"MINIO_BUCKET_CHAPTERS", "MINIO_BUCKET_AUDIO", "MINIO_BUCKET_AVATARS",
		"KOKORO_URL", "KOKORO_VOICE",
		"BACKEND_HTTP_ADDR",
		"RUNNER_POLL_INTERVAL", "RUNNER_MAX_CONCURRENT_SCRAPE", "RUNNER_MAX_CONCURRENT_AUDIO",
		"RUNNER_WORKER_ID", "RUNNER_WORKERS", "RUNNER_TIMEOUT",
	}
	for _, k := range unset {
		t.Setenv(k, "")
	}

	cfg := config.Load()

	if cfg.LogLevel != "info" {
		t.Errorf("LogLevel: want info, got %q", cfg.LogLevel)
	}
	if cfg.PocketBase.URL != "http://localhost:8090" {
		t.Errorf("PocketBase.URL: want http://localhost:8090, got %q", cfg.PocketBase.URL)
	}
	if cfg.MinIO.BucketChapters != "chapters" {
		t.Errorf("MinIO.BucketChapters: want chapters, got %q", cfg.MinIO.BucketChapters)
	}
	if cfg.MinIO.UseSSL != false {
		t.Errorf("MinIO.UseSSL: want false, got %v", cfg.MinIO.UseSSL)
	}
	if cfg.MinIO.PublicUseSSL != true {
		t.Errorf("MinIO.PublicUseSSL: want true, got %v", cfg.MinIO.PublicUseSSL)
	}
	if cfg.Kokoro.DefaultVoice != "af_bella" {
		t.Errorf("Kokoro.DefaultVoice: want af_bella, got %q", cfg.Kokoro.DefaultVoice)
	}
	if cfg.HTTP.Addr != ":8080" {
		t.Errorf("HTTP.Addr: want :8080, got %q", cfg.HTTP.Addr)
	}
	if cfg.Runner.PollInterval != 30*time.Second {
		t.Errorf("Runner.PollInterval: want 30s, got %v", cfg.Runner.PollInterval)
	}
	if cfg.Runner.MaxConcurrentScrape != 1 {
		t.Errorf("Runner.MaxConcurrentScrape: want 1, got %d", cfg.Runner.MaxConcurrentScrape)
	}
	if cfg.Runner.MaxConcurrentAudio != 1 {
		t.Errorf("Runner.MaxConcurrentAudio: want 1, got %d", cfg.Runner.MaxConcurrentAudio)
	}
}

func TestLoad_EnvOverride(t *testing.T) {
	t.Setenv("LOG_LEVEL", "debug")
	t.Setenv("POCKETBASE_URL", "https://pb.libnovel.cc")
	t.Setenv("MINIO_USE_SSL", "true")
	t.Setenv("MINIO_PUBLIC_USE_SSL", "false")
	t.Setenv("RUNNER_POLL_INTERVAL", "1m")
	t.Setenv("RUNNER_MAX_CONCURRENT_SCRAPE", "5")
	t.Setenv("RUNNER_WORKER_ID", "homelab-01")
	t.Setenv("BACKEND_HTTP_ADDR", ":9090")
	t.Setenv("KOKORO_URL", "https://kokoro.libnovel.cc")

	cfg := config.Load()

	if cfg.LogLevel != "debug" {
		t.Errorf("LogLevel: want debug, got %q", cfg.LogLevel)
	}
	if cfg.PocketBase.URL != "https://pb.libnovel.cc" {
		t.Errorf("PocketBase.URL: want https://pb.libnovel.cc, got %q", cfg.PocketBase.URL)
	}
	if !cfg.MinIO.UseSSL {
		t.Error("MinIO.UseSSL: want true")
	}
	if cfg.MinIO.PublicUseSSL {
		t.Error("MinIO.PublicUseSSL: want false")
	}
	if cfg.Runner.PollInterval != time.Minute {
		t.Errorf("Runner.PollInterval: want 1m, got %v", cfg.Runner.PollInterval)
	}
	if cfg.Runner.MaxConcurrentScrape != 5 {
		t.Errorf("Runner.MaxConcurrentScrape: want 5, got %d", cfg.Runner.MaxConcurrentScrape)
	}
	if cfg.Runner.WorkerID != "homelab-01" {
		t.Errorf("Runner.WorkerID: want homelab-01, got %q", cfg.Runner.WorkerID)
	}
	if cfg.HTTP.Addr != ":9090" {
		t.Errorf("HTTP.Addr: want :9090, got %q", cfg.HTTP.Addr)
	}
	if cfg.Kokoro.URL != "https://kokoro.libnovel.cc" {
		t.Errorf("Kokoro.URL: want https://kokoro.libnovel.cc, got %q", cfg.Kokoro.URL)
	}
}

func TestLoad_InvalidInt_FallsToDefault(t *testing.T) {
	t.Setenv("RUNNER_MAX_CONCURRENT_SCRAPE", "notanumber")
	cfg := config.Load()
	if cfg.Runner.MaxConcurrentScrape != 1 {
		t.Errorf("want default 1, got %d", cfg.Runner.MaxConcurrentScrape)
	}
}

func TestLoad_InvalidDuration_FallsToDefault(t *testing.T) {
	t.Setenv("RUNNER_POLL_INTERVAL", "notaduration")
	cfg := config.Load()
	if cfg.Runner.PollInterval != 30*time.Second {
		t.Errorf("want default 30s, got %v", cfg.Runner.PollInterval)
	}
}

func TestLoad_WorkerID_FallsToHostname(t *testing.T) {
	t.Setenv("RUNNER_WORKER_ID", "")
	cfg := config.Load()
	host, _ := os.Hostname()
	if host != "" && cfg.Runner.WorkerID != host {
		t.Errorf("want hostname %q, got %q", host, cfg.Runner.WorkerID)
	}
}
backend/internal/domain/domain.go (new file)
@@ -0,0 +1,137 @@
// Package domain contains the core value types shared across all packages
// in this module. It has zero internal imports — only the standard library.
// Every other package imports domain; domain imports nothing from this module.
package domain

import "time"

// ── Book types ────────────────────────────────────────────────────────────────

// BookMeta carries all bibliographic information about a novel.
type BookMeta struct {
	Slug          string   `json:"slug"`
	Title         string   `json:"title"`
	Author        string   `json:"author"`
	Cover         string   `json:"cover,omitempty"`
	Status        string   `json:"status,omitempty"`
	Genres        []string `json:"genres,omitempty"`
	Summary       string   `json:"summary,omitempty"`
	TotalChapters int      `json:"total_chapters,omitempty"`
	SourceURL     string   `json:"source_url"`
	Ranking       int      `json:"ranking,omitempty"`
	Rating        float64  `json:"rating,omitempty"`
	// MetaUpdated is the Unix timestamp (seconds) when the book record was last
	// updated in PocketBase. Populated on read; not sent on write (PocketBase
	// manages its own updated field).
	MetaUpdated int64 `json:"meta_updated,omitempty"`
}

// CatalogueEntry is a lightweight book reference returned by catalogue pages.
type CatalogueEntry struct {
	Slug  string `json:"slug"`
	Title string `json:"title"`
	URL   string `json:"url"`
}

// ChapterRef is a reference to a single chapter returned by chapter-list pages.
type ChapterRef struct {
	Number int    `json:"number"`
	Title  string `json:"title"`
	URL    string `json:"url"`
	Volume int    `json:"volume,omitempty"`
}

// Chapter contains the fully-extracted text of a single chapter.
type Chapter struct {
	Ref  ChapterRef `json:"ref"`
	Text string     `json:"text"`
}

// RankingItem represents a single entry in the novel ranking list.
type RankingItem struct {
	Rank      int       `json:"rank"`
	Slug      string    `json:"slug"`
	Title     string    `json:"title"`
	Author    string    `json:"author,omitempty"`
	Cover     string    `json:"cover,omitempty"`
	Status    string    `json:"status,omitempty"`
	Genres    []string  `json:"genres,omitempty"`
	SourceURL string    `json:"source_url,omitempty"`
	Updated   time.Time `json:"updated,omitempty"`
}

// ── Storage record types ──────────────────────────────────────────────────────

// ChapterInfo is a lightweight chapter descriptor stored in the index.
type ChapterInfo struct {
	Number int    `json:"number"`
	Title  string `json:"title"`
	Date   string `json:"date,omitempty"`
}

// ReadingProgress holds a single user's reading position for one book.
type ReadingProgress struct {
	Slug      string    `json:"slug"`
	Chapter   int       `json:"chapter"`
	UpdatedAt time.Time `json:"updated_at"`
}

// ── Task record types ─────────────────────────────────────────────────────────

// TaskStatus enumerates the lifecycle states of any task.
type TaskStatus string

const (
	TaskStatusPending   TaskStatus = "pending"
	TaskStatusRunning   TaskStatus = "running"
	TaskStatusDone      TaskStatus = "done"
	TaskStatusFailed    TaskStatus = "failed"
	TaskStatusCancelled TaskStatus = "cancelled"
)

// ScrapeTask represents a book-scraping job stored in PocketBase.
type ScrapeTask struct {
	ID              string     `json:"id"`
	Kind            string     `json:"kind"`       // "catalogue" | "book" | "book_range"
	TargetURL       string     `json:"target_url"` // non-empty for single-book tasks
	FromChapter     int        `json:"from_chapter,omitempty"`
	ToChapter       int        `json:"to_chapter,omitempty"`
	WorkerID        string     `json:"worker_id,omitempty"`
	Status          TaskStatus `json:"status"`
	BooksFound      int        `json:"books_found"`
	ChaptersScraped int        `json:"chapters_scraped"`
	ChaptersSkipped int        `json:"chapters_skipped"`
	Errors          int        `json:"errors"`
	Started         time.Time  `json:"started"`
	Finished        time.Time  `json:"finished,omitempty"`
	ErrorMessage    string     `json:"error_message,omitempty"`
}

// ScrapeResult is the outcome reported by the runner after finishing a ScrapeTask.
type ScrapeResult struct {
	BooksFound      int    `json:"books_found"`
	ChaptersScraped int    `json:"chapters_scraped"`
	ChaptersSkipped int    `json:"chapters_skipped"`
	Errors          int    `json:"errors"`
	ErrorMessage    string `json:"error_message,omitempty"`
}

// AudioTask represents an audio-generation job stored in PocketBase.
type AudioTask struct {
	ID           string     `json:"id"`
	CacheKey     string     `json:"cache_key"` // "slug/chapter/voice"
	Slug         string     `json:"slug"`
	Chapter      int        `json:"chapter"`
	Voice        string     `json:"voice"`
	WorkerID     string     `json:"worker_id,omitempty"`
	Status       TaskStatus `json:"status"`
	ErrorMessage string     `json:"error_message,omitempty"`
	Started      time.Time  `json:"started"`
	Finished     time.Time  `json:"finished,omitempty"`
}

// AudioResult is the outcome reported by the runner after finishing an AudioTask.
type AudioResult struct {
	ObjectKey    string `json:"object_key,omitempty"`
	ErrorMessage string `json:"error_message,omitempty"`
}
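The CacheKey convention documented on AudioTask ("slug/chapter/voice") can be built with a one-line helper. This is a hypothetical sketch for illustration — the helper is not part of the domain package:

package main

import (
	"fmt"

	"github.com/libnovel/backend/internal/domain"
)

// cacheKey builds the "slug/chapter/voice" key documented on AudioTask.
// Hypothetical helper — not defined in this module.
func cacheKey(t domain.AudioTask) string {
	return fmt.Sprintf("%s/%d/%s", t.Slug, t.Chapter, t.Voice)
}

func main() {
	t := domain.AudioTask{Slug: "a-great-novel", Chapter: 12, Voice: "af_bella"}
	fmt.Println(cacheKey(t)) // a-great-novel/12/af_bella
}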
backend/internal/domain/domain_test.go (new file)
@@ -0,0 +1,104 @@
package domain_test

import (
	"encoding/json"
	"testing"
	"time"

	"github.com/libnovel/backend/internal/domain"
)

func TestBookMeta_JSONRoundtrip(t *testing.T) {
	orig := domain.BookMeta{
		Slug:          "a-great-novel",
		Title:         "A Great Novel",
		Author:        "Jane Doe",
		Cover:         "https://example.com/cover.jpg",
		Status:        "Ongoing",
		Genres:        []string{"Fantasy", "Action"},
		Summary:       "A thrilling tale.",
		TotalChapters: 120,
		SourceURL:     "https://novelfire.net/book/a-great-novel",
		Ranking:       3,
	}

	b, err := json.Marshal(orig)
	if err != nil {
		t.Fatalf("marshal: %v", err)
	}
	var got domain.BookMeta
	if err := json.Unmarshal(b, &got); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}
	if got.Slug != orig.Slug {
		t.Errorf("Slug: want %q, got %q", orig.Slug, got.Slug)
	}
	if got.TotalChapters != orig.TotalChapters {
		t.Errorf("TotalChapters: want %d, got %d", orig.TotalChapters, got.TotalChapters)
	}
	if len(got.Genres) != len(orig.Genres) {
		t.Errorf("Genres len: want %d, got %d", len(orig.Genres), len(got.Genres))
	}
}

func TestChapterRef_JSONRoundtrip(t *testing.T) {
	orig := domain.ChapterRef{Number: 42, Title: "The Battle", URL: "https://example.com/ch-42", Volume: 2}
	b, _ := json.Marshal(orig)
	var got domain.ChapterRef
	json.Unmarshal(b, &got)
	if got != orig {
		t.Errorf("want %+v, got %+v", orig, got)
	}
}

func TestRankingItem_JSONRoundtrip(t *testing.T) {
	now := time.Now().Truncate(time.Second)
	orig := domain.RankingItem{
		Rank:      1,
		Slug:      "top-novel",
		Title:     "Top Novel",
		SourceURL: "https://novelfire.net/book/top-novel",
		Updated:   now,
	}
	b, _ := json.Marshal(orig)
	var got domain.RankingItem
	json.Unmarshal(b, &got)
	if got.Rank != orig.Rank || got.Slug != orig.Slug {
		t.Errorf("want %+v, got %+v", orig, got)
	}
}

func TestScrapeResult_JSONRoundtrip(t *testing.T) {
	orig := domain.ScrapeResult{BooksFound: 10, ChaptersScraped: 200, ChaptersSkipped: 5, Errors: 1, ErrorMessage: "one error"}
	b, _ := json.Marshal(orig)
	var got domain.ScrapeResult
	json.Unmarshal(b, &got)
	if got != orig {
		t.Errorf("want %+v, got %+v", orig, got)
	}
}

func TestAudioResult_JSONRoundtrip(t *testing.T) {
	orig := domain.AudioResult{ObjectKey: "audio/slug/1/af_bella.mp3"}
	b, _ := json.Marshal(orig)
	var got domain.AudioResult
	json.Unmarshal(b, &got)
	if got != orig {
		t.Errorf("want %+v, got %+v", orig, got)
	}
}

func TestTaskStatus_Values(t *testing.T) {
	cases := []domain.TaskStatus{
		domain.TaskStatusPending,
		domain.TaskStatusRunning,
		domain.TaskStatusDone,
		domain.TaskStatusFailed,
		domain.TaskStatusCancelled,
	}
	for _, s := range cases {
		if s == "" {
			t.Errorf("TaskStatus constant must not be empty")
		}
	}
}
backend/internal/httputil/httputil.go (new file)
@@ -0,0 +1,124 @@
// Package httputil provides shared HTTP helpers used by both the runner and
// backend binaries. It has no imports from this module — only the standard
// library — so it is safe to import from anywhere in the dependency graph.
package httputil

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"time"
)

// Client is the minimal interface for making HTTP GET requests.
// *http.Client satisfies this interface.
type Client interface {
	Do(req *http.Request) (*http.Response, error)
}

// ErrMaxRetries is returned when RetryGet exhausts all attempts.
var ErrMaxRetries = errors.New("httputil: max retries exceeded")

// errClientError is returned by doGet for 4xx responses; it signals that the
// request should NOT be retried (the client is at fault).
var errClientError = errors.New("httputil: client error")

// RetryGet fetches url using client, retrying on network errors or 5xx
// responses with exponential backoff. It returns the full response body as a
// string on success.
//
//   - maxAttempts: total number of attempts (must be >= 1)
//   - baseDelay: initial wait before the second attempt; doubles each retry
func RetryGet(ctx context.Context, client Client, url string, maxAttempts int, baseDelay time.Duration) (string, error) {
	if maxAttempts < 1 {
		maxAttempts = 1
	}
	delay := baseDelay

	var lastErr error
	for attempt := 0; attempt < maxAttempts; attempt++ {
		if attempt > 0 {
			select {
			case <-ctx.Done():
				return "", ctx.Err()
			case <-time.After(delay):
			}
			delay *= 2
		}

		body, err := doGet(ctx, client, url)
		if err == nil {
			return body, nil
		}
		lastErr = err

		// Do not retry on context cancellation.
		if ctx.Err() != nil {
			return "", ctx.Err()
		}
		// Do not retry on 4xx — the client is at fault.
		if errors.Is(err, errClientError) {
			return "", err
		}
	}

	return "", fmt.Errorf("%w after %d attempts: %w", ErrMaxRetries, maxAttempts, lastErr)
}

func doGet(ctx context.Context, client Client, url string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", fmt.Errorf("build request: %w", err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; libnovel-runner/2)")

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("GET %s: %w", url, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 500 {
		return "", fmt.Errorf("GET %s: server error %d", url, resp.StatusCode)
	}
	if resp.StatusCode >= 400 {
		return "", fmt.Errorf("%w: GET %s: client error %d", errClientError, url, resp.StatusCode)
	}

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("read body %s: %w", url, err)
	}
	return string(raw), nil
}

// WriteJSON writes v as JSON to w with the given HTTP status code and sets the
// Content-Type header to application/json.
func WriteJSON(w http.ResponseWriter, status int, v any) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	_ = json.NewEncoder(w).Encode(v)
}

// WriteError writes a JSON error object {"error": msg} with the given status.
func WriteError(w http.ResponseWriter, status int, msg string) {
	WriteJSON(w, status, map[string]string{"error": msg})
}

// maxBodyBytes is the limit applied by DecodeJSON to prevent unbounded reads.
const maxBodyBytes = 1 << 20 // 1 MiB

// DecodeJSON decodes a JSON request body into v. It enforces a 1 MiB size
// limit and returns a descriptive error on any failure.
func DecodeJSON(r *http.Request, v any) error {
	r.Body = http.MaxBytesReader(nil, r.Body, maxBodyBytes)
	dec := json.NewDecoder(r.Body)
	dec.DisallowUnknownFields()
	if err := dec.Decode(v); err != nil {
		return fmt.Errorf("decode JSON body: %w", err)
	}
	return nil
}
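A usage sketch for RetryGet (not part of the diff; the URL is a placeholder). With maxAttempts=4 and baseDelay=500ms the waits before the retries are 500ms, 1s, then 2s — the delay doubles after each failed attempt, as implemented above:

package main

import (
	"context"
	"errors"
	"log"
	"net/http"
	"time"

	"github.com/libnovel/backend/internal/httputil"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Placeholder URL for illustration.
	body, err := httputil.RetryGet(ctx, http.DefaultClient, "https://example.com/page", 4, 500*time.Millisecond)
	if errors.Is(err, httputil.ErrMaxRetries) {
		log.Fatalf("gave up after retries: %v", err) // all attempts saw 5xx or network errors
	}
	if err != nil {
		log.Fatalf("non-retryable: %v", err) // 4xx or context cancellation
	}
	log.Printf("fetched %d bytes", len(body))
}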
backend/internal/httputil/httputil_test.go (new file)
@@ -0,0 +1,181 @@
package httputil_test

import (
	"bytes"
	"context"
	"encoding/json"
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"
	"time"

	"github.com/libnovel/backend/internal/httputil"
)

// ── RetryGet ──────────────────────────────────────────────────────────────────

func TestRetryGet_ImmediateSuccess(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("hello"))
	}))
	defer srv.Close()

	body, err := httputil.RetryGet(context.Background(), srv.Client(), srv.URL, 3, time.Millisecond)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if body != "hello" {
		t.Errorf("want hello, got %q", body)
	}
}

func TestRetryGet_RetriesOn5xx(t *testing.T) {
	calls := 0
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		calls++
		if calls < 3 {
			w.WriteHeader(http.StatusServiceUnavailable)
			return
		}
		w.Write([]byte("ok"))
	}))
	defer srv.Close()

	body, err := httputil.RetryGet(context.Background(), srv.Client(), srv.URL, 5, time.Millisecond)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if body != "ok" {
		t.Errorf("want ok, got %q", body)
	}
	if calls != 3 {
		t.Errorf("want 3 calls, got %d", calls)
	}
}

func TestRetryGet_MaxAttemptsExceeded(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
	}))
	defer srv.Close()

	_, err := httputil.RetryGet(context.Background(), srv.Client(), srv.URL, 3, time.Millisecond)
	if err == nil {
		t.Fatal("expected error, got nil")
	}
}

func TestRetryGet_ContextCancelDuringBackoff(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusServiceUnavailable)
	}))
	defer srv.Close()

	ctx, cancel := context.WithCancel(context.Background())

	// Cancel after first failed attempt hits the backoff wait.
	go func() { time.Sleep(5 * time.Millisecond); cancel() }()

	_, err := httputil.RetryGet(ctx, srv.Client(), srv.URL, 10, 500*time.Millisecond)
	if err == nil {
		t.Fatal("expected context cancellation error")
	}
}

func TestRetryGet_NoRetryOn4xx(t *testing.T) {
	calls := 0
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		calls++
		w.WriteHeader(http.StatusNotFound)
	}))
	defer srv.Close()

	_, err := httputil.RetryGet(context.Background(), srv.Client(), srv.URL, 5, time.Millisecond)
	if err == nil {
		t.Fatal("expected error for 404")
	}
	// 4xx is NOT retried — should be exactly 1 call.
	if calls != 1 {
		t.Errorf("want 1 call for 4xx, got %d", calls)
	}
}

// ── WriteJSON ─────────────────────────────────────────────────────────────────

func TestWriteJSON_SetsHeadersAndStatus(t *testing.T) {
	rr := httptest.NewRecorder()
	httputil.WriteJSON(rr, http.StatusCreated, map[string]string{"key": "val"})

	if rr.Code != http.StatusCreated {
		t.Errorf("status: want 201, got %d", rr.Code)
	}
	if ct := rr.Header().Get("Content-Type"); ct != "application/json" {
		t.Errorf("Content-Type: want application/json, got %q", ct)
	}
	var got map[string]string
	if err := json.NewDecoder(rr.Body).Decode(&got); err != nil {
		t.Fatalf("decode body: %v", err)
	}
	if got["key"] != "val" {
		t.Errorf("body key: want val, got %q", got["key"])
	}
}

// ── WriteError ────────────────────────────────────────────────────────────────

func TestWriteError_Format(t *testing.T) {
	rr := httptest.NewRecorder()
	httputil.WriteError(rr, http.StatusBadRequest, "bad input")

	if rr.Code != http.StatusBadRequest {
		t.Errorf("status: want 400, got %d", rr.Code)
	}
	var got map[string]string
	json.NewDecoder(rr.Body).Decode(&got)
	if got["error"] != "bad input" {
		t.Errorf("error field: want bad input, got %q", got["error"])
	}
}

// ── DecodeJSON ────────────────────────────────────────────────────────────────

func TestDecodeJSON_HappyPath(t *testing.T) {
	body := `{"name":"test","value":42}`
	req := httptest.NewRequest(http.MethodPost, "/", strings.NewReader(body))
	req.Header.Set("Content-Type", "application/json")

	var payload struct {
		Name  string `json:"name"`
		Value int    `json:"value"`
	}
	if err := httputil.DecodeJSON(req, &payload); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if payload.Name != "test" || payload.Value != 42 {
		t.Errorf("unexpected payload: %+v", payload)
	}
}

func TestDecodeJSON_UnknownFieldReturnsError(t *testing.T) {
	body := `{"name":"test","unknown_field":"boom"}`
	req := httptest.NewRequest(http.MethodPost, "/", strings.NewReader(body))

	var payload struct {
		Name string `json:"name"`
	}
	if err := httputil.DecodeJSON(req, &payload); err == nil {
		t.Fatal("expected error for unknown field, got nil")
	}
}

func TestDecodeJSON_BodyTooLarge(t *testing.T) {
	// Build a body > 1 MiB.
	big := bytes.Repeat([]byte("a"), 2<<20)
	req := httptest.NewRequest(http.MethodPost, "/", bytes.NewReader(big))

	var payload map[string]any
	if err := httputil.DecodeJSON(req, &payload); err == nil {
		t.Fatal("expected error for oversized body, got nil")
	}
}
backend/internal/kokoro/client.go (new file)
@@ -0,0 +1,160 @@
// Package kokoro provides a client for the Kokoro-FastAPI TTS service.
//
// The Kokoro API is an OpenAI-compatible audio speech API that returns a
// download link (X-Download-Path header) instead of streaming audio directly.
// GenerateAudio handles the two-step flow: POST /v1/audio/speech → GET /v1/download/{file}.
package kokoro

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// Client is the interface for interacting with the Kokoro TTS service.
type Client interface {
	// GenerateAudio synthesises text using voice and returns raw MP3 bytes.
	GenerateAudio(ctx context.Context, text, voice string) ([]byte, error)

	// ListVoices returns the available voice IDs. It returns an error when the
	// service is unreachable — callers should treat an empty list as
	// "service unavailable".
	ListVoices(ctx context.Context) ([]string, error)
}

// httpClient is the concrete Kokoro HTTP client.
type httpClient struct {
	baseURL string
	http    *http.Client
}

// New returns a Kokoro Client targeting baseURL (e.g. "https://kokoro.example.com").
func New(baseURL string) Client {
	return &httpClient{
		baseURL: strings.TrimRight(baseURL, "/"),
		http:    &http.Client{Timeout: 10 * time.Minute},
	}
}

// GenerateAudio calls POST /v1/audio/speech (return_download_link=true) and then
// downloads the resulting MP3 from GET /v1/download/{filename}.
func (c *httpClient) GenerateAudio(ctx context.Context, text, voice string) ([]byte, error) {
	if text == "" {
		return nil, fmt.Errorf("kokoro: empty text")
	}
	if voice == "" {
		voice = "af_bella"
	}

	// ── Step 1: request generation ────────────────────────────────────────────
	reqBody, err := json.Marshal(map[string]any{
		"model":                "kokoro",
		"input":                text,
		"voice":                voice,
		"response_format":      "mp3",
		"speed":                1.0,
		"stream":               false,
		"return_download_link": true,
	})
	if err != nil {
		return nil, fmt.Errorf("kokoro: marshal request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		c.baseURL+"/v1/audio/speech", bytes.NewReader(reqBody))
	if err != nil {
		return nil, fmt.Errorf("kokoro: build speech request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := c.http.Do(req)
	if err != nil {
		return nil, fmt.Errorf("kokoro: speech request: %w", err)
	}
	defer resp.Body.Close()
	_, _ = io.Copy(io.Discard, resp.Body)

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("kokoro: speech returned %d", resp.StatusCode)
	}

	dlPath := resp.Header.Get("X-Download-Path")
	if dlPath == "" {
		return nil, fmt.Errorf("kokoro: no X-Download-Path header in response")
	}
	filename := dlPath
	if idx := strings.LastIndex(dlPath, "/"); idx >= 0 {
		filename = dlPath[idx+1:]
	}
	if filename == "" {
		return nil, fmt.Errorf("kokoro: empty filename in X-Download-Path: %q", dlPath)
	}

	// ── Step 2: download the generated file ───────────────────────────────────
	dlURL := c.baseURL + "/v1/download/" + filename
	dlReq, err := http.NewRequestWithContext(ctx, http.MethodGet, dlURL, nil)
	if err != nil {
		return nil, fmt.Errorf("kokoro: build download request: %w", err)
	}

	dlResp, err := c.http.Do(dlReq)
	if err != nil {
		return nil, fmt.Errorf("kokoro: download request: %w", err)
	}
	defer dlResp.Body.Close()

	if dlResp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("kokoro: download returned %d", dlResp.StatusCode)
	}

	data, err := io.ReadAll(dlResp.Body)
	if err != nil {
		return nil, fmt.Errorf("kokoro: read download body: %w", err)
	}
	return data, nil
}

// ListVoices calls GET /v1/audio/voices and returns the list of voice IDs.
func (c *httpClient) ListVoices(ctx context.Context) ([]string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet,
		c.baseURL+"/v1/audio/voices", nil)
	if err != nil {
		return nil, fmt.Errorf("kokoro: build voices request: %w", err)
	}

	resp, err := c.http.Do(req)
	if err != nil {
		return nil, fmt.Errorf("kokoro: voices request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		_, _ = io.Copy(io.Discard, resp.Body)
		return nil, fmt.Errorf("kokoro: voices returned %d", resp.StatusCode)
	}

	var result struct {
		Voices []string `json:"voices"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, fmt.Errorf("kokoro: decode voices response: %w", err)
	}
	return result.Voices, nil
}

// VoiceSampleKey returns the MinIO object key for a voice sample MP3.
// Key: _voice-samples/{voice}.mp3 (sanitised).
func VoiceSampleKey(voice string) string {
	safe := strings.Map(func(r rune) rune {
		if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') ||
			(r >= '0' && r <= '9') || r == '_' || r == '-' {
			return r
		}
		return '_'
	}, voice)
	return fmt.Sprintf("_voice-samples/%s.mp3", safe)
}
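A usage sketch for the Kokoro client (not part of the diff; the base URL and output filename are placeholders). GenerateAudio hides the two-step POST /v1/audio/speech → GET /v1/download/{file} flow behind a single call:

package main

import (
	"context"
	"log"
	"os"

	"github.com/libnovel/backend/internal/kokoro"
)

func main() {
	// Placeholder base URL for illustration.
	c := kokoro.New("https://kokoro.example.com")

	mp3, err := c.GenerateAudio(context.Background(), "Chapter one. It was a dark night.", "af_bella")
	if err != nil {
		log.Fatal(err)
	}
	// GenerateAudio returns raw MP3 bytes; persist them wherever needed.
	if err := os.WriteFile("chapter-1.mp3", mp3, 0o644); err != nil {
		log.Fatal(err)
	}
}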
backend/internal/kokoro/client_test.go (new file)
@@ -0,0 +1,291 @@
package kokoro_test

import (
	"context"
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"

	"github.com/libnovel/backend/internal/kokoro"
)

// ── VoiceSampleKey ────────────────────────────────────────────────────────────

func TestVoiceSampleKey(t *testing.T) {
	tests := []struct {
		voice string
		want  string
	}{
		{"af_bella", "_voice-samples/af_bella.mp3"},
		{"am_echo", "_voice-samples/am_echo.mp3"},
		{"voice with spaces", "_voice-samples/voice_with_spaces.mp3"},
		{"special!@#chars", "_voice-samples/special___chars.mp3"},
		{"", "_voice-samples/.mp3"},
	}
	for _, tt := range tests {
		t.Run(tt.voice, func(t *testing.T) {
			got := kokoro.VoiceSampleKey(tt.voice)
			if got != tt.want {
				t.Errorf("VoiceSampleKey(%q) = %q, want %q", tt.voice, got, tt.want)
			}
		})
	}
}

// ── GenerateAudio ─────────────────────────────────────────────────────────────

func TestGenerateAudio_EmptyText(t *testing.T) {
	srv := httptest.NewServer(http.NotFoundHandler())
	defer srv.Close()

	c := kokoro.New(srv.URL)
	_, err := c.GenerateAudio(context.Background(), "", "af_bella")
	if err == nil {
		t.Fatal("expected error for empty text, got nil")
	}
	if !strings.Contains(err.Error(), "empty text") {
		t.Errorf("expected 'empty text' in error, got: %v", err)
	}
}

func TestGenerateAudio_DefaultVoice(t *testing.T) {
	// Tracks that the voice defaults to af_bella when empty.
	var capturedBody string
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/v1/audio/speech" {
			buf := make([]byte, 512)
			n, _ := r.Body.Read(buf)
			capturedBody = string(buf[:n])
			w.Header().Set("X-Download-Path", "/download/test_file.mp3")
			w.WriteHeader(http.StatusOK)
			return
		}
		if strings.HasPrefix(r.URL.Path, "/v1/download/") {
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte("fake-mp3-data"))
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()

	c := kokoro.New(srv.URL)
	data, err := c.GenerateAudio(context.Background(), "hello world", "")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if string(data) != "fake-mp3-data" {
		t.Errorf("unexpected data: %q", string(data))
	}
	if !strings.Contains(capturedBody, `"af_bella"`) {
		t.Errorf("expected default voice af_bella in request body, got: %s", capturedBody)
	}
}

func TestGenerateAudio_SpeechNon200(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/v1/audio/speech" {
			w.WriteHeader(http.StatusInternalServerError)
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()

	c := kokoro.New(srv.URL)
	_, err := c.GenerateAudio(context.Background(), "text", "af_bella")
	if err == nil {
		t.Fatal("expected error for non-200 speech response")
	}
	if !strings.Contains(err.Error(), "500") {
		t.Errorf("expected 500 in error, got: %v", err)
	}
}

func TestGenerateAudio_NoDownloadPathHeader(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/v1/audio/speech" {
			// No X-Download-Path header
			w.WriteHeader(http.StatusOK)
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()

	c := kokoro.New(srv.URL)
	_, err := c.GenerateAudio(context.Background(), "text", "af_bella")
	if err == nil {
		t.Fatal("expected error for missing X-Download-Path")
	}
	if !strings.Contains(err.Error(), "X-Download-Path") {
		t.Errorf("expected X-Download-Path in error, got: %v", err)
	}
}

func TestGenerateAudio_DownloadFails(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/v1/audio/speech" {
			w.Header().Set("X-Download-Path", "/v1/download/speech.mp3")
			w.WriteHeader(http.StatusOK)
			return
		}
		if strings.HasPrefix(r.URL.Path, "/v1/download/") {
			w.WriteHeader(http.StatusNotFound)
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()

	c := kokoro.New(srv.URL)
	_, err := c.GenerateAudio(context.Background(), "text", "af_bella")
	if err == nil {
		t.Fatal("expected error for failed download")
	}
	if !strings.Contains(err.Error(), "404") {
		t.Errorf("expected 404 in error, got: %v", err)
	}
}

func TestGenerateAudio_FullPath(t *testing.T) {
	// X-Download-Path with a full path: extract just filename.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/v1/audio/speech" {
			w.Header().Set("X-Download-Path", "/some/nested/path/audio_abc123.mp3")
			w.WriteHeader(http.StatusOK)
			return
		}
		if r.URL.Path == "/v1/download/audio_abc123.mp3" {
			_, _ = w.Write([]byte("audio-bytes"))
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()

	c := kokoro.New(srv.URL)
	data, err := c.GenerateAudio(context.Background(), "text", "af_bella")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if string(data) != "audio-bytes" {
		t.Errorf("unexpected data: %q", string(data))
	}
}

func TestGenerateAudio_ContextCancelled(t *testing.T) {
	// Server that hangs — context should cancel before we get a response.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Never respond.
		select {}
	}))
	defer srv.Close()

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel immediately

	c := kokoro.New(srv.URL)
	_, err := c.GenerateAudio(ctx, "text", "af_bella")
	if err == nil {
		t.Fatal("expected error for cancelled context")
	}
}

// ── ListVoices ────────────────────────────────────────────────────────────────

func TestListVoices_Success(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/v1/audio/voices" {
			w.Header().Set("Content-Type", "application/json")
			_, _ = w.Write([]byte(`{"voices":["af_bella","am_adam","bf_emma"]}`))
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()

	c := kokoro.New(srv.URL)
	voices, err := c.ListVoices(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(voices) != 3 {
		t.Errorf("expected 3 voices, got %d: %v", len(voices), voices)
	}
	if voices[0] != "af_bella" {
		t.Errorf("expected first voice to be af_bella, got %q", voices[0])
	}
}

func TestListVoices_Non200(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusServiceUnavailable)
	}))
	defer srv.Close()

	c := kokoro.New(srv.URL)
	_, err := c.ListVoices(context.Background())
	if err == nil {
		t.Fatal("expected error for non-200 response")
	}
	if !strings.Contains(err.Error(), "503") {
		t.Errorf("expected 503 in error, got: %v", err)
	}
}

func TestListVoices_MalformedJSON(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(`not-json`))
	}))
	defer srv.Close()

	c := kokoro.New(srv.URL)
	_, err := c.ListVoices(context.Background())
	if err == nil {
		t.Fatal("expected error for malformed JSON")
	}
}

func TestListVoices_EmptyVoices(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{"voices":[]}`))
	}))
	defer srv.Close()

	c := kokoro.New(srv.URL)
	voices, err := c.ListVoices(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(voices) != 0 {
		t.Errorf("expected 0 voices, got %d", len(voices))
	}
}

// ── New ───────────────────────────────────────────────────────────────────────

func TestNew_TrailingSlashStripped(t *testing.T) {
	// Verify that a trailing slash on baseURL doesn't produce double-slash paths.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/v1/audio/voices" {
			w.Header().Set("Content-Type", "application/json")
			_, _ = w.Write([]byte(`{"voices":["af_bella"]}`))
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()

	c := kokoro.New(srv.URL + "/") // trailing slash
	voices, err := c.ListVoices(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(voices) == 0 {
		t.Error("expected at least one voice")
	}
}
327
backend/internal/meili/client.go
Normal file
327
backend/internal/meili/client.go
Normal file
@@ -0,0 +1,327 @@
|
||||
// Package meili provides a thin Meilisearch client for indexing and searching
|
||||
// locally scraped books.
|
||||
//
|
||||
// Index:
|
||||
// - Name: "books"
|
||||
// - Primary key: "slug"
|
||||
// - Searchable attributes: title, author, genres, summary
|
||||
// - Filterable attributes: status, genres
|
||||
// - Sortable attributes: rank, rating, total_chapters, meta_updated
|
||||
//
|
||||
// The client is intentionally simple: UpsertBook and Search only. All
|
||||
// Meilisearch-specific details (index management, attribute configuration)
|
||||
// are handled once in Configure(), called at startup.
|
||||
package meili
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/libnovel/backend/internal/domain"
|
||||
"github.com/meilisearch/meilisearch-go"
|
||||
)
|
||||
|
||||
const indexName = "books"
|
||||
|
||||
// Client is the interface for Meilisearch operations used by runner and backend.
|
||||
type Client interface {
|
||||
// UpsertBook adds or updates a book document in the search index.
|
||||
UpsertBook(ctx context.Context, book domain.BookMeta) error
|
||||
// BookExists reports whether a book with the given slug is already in the
|
||||
// index. Used by the catalogue refresh to skip re-indexing known books.
|
||||
BookExists(ctx context.Context, slug string) bool
|
||||
// Search returns up to limit books matching query.
|
||||
Search(ctx context.Context, query string, limit int) ([]domain.BookMeta, error)
|
||||
// Catalogue queries books with optional filters, sort, and pagination.
|
||||
// Returns books, the total hit count for pagination, and a FacetResult
|
||||
// with available genre and status values from the index.
|
||||
Catalogue(ctx context.Context, q CatalogueQuery) ([]domain.BookMeta, int64, FacetResult, error)
|
||||
}
|
||||
|
||||
// CatalogueQuery holds parameters for the /api/catalogue endpoint.
|
||||
type CatalogueQuery struct {
|
||||
Q string // full-text query (may be empty for browse)
|
||||
Genre string // genre filter, e.g. "fantasy" or "all"
|
||||
Status string // status filter, e.g. "ongoing", "completed", or "all"
|
||||
Sort string // sort field: "popular", "new", "update", "top-rated", "rank", ""
|
||||
Page int // 1-indexed
|
||||
Limit int // items per page, default 20
|
||||
}
|
||||
|
||||
// FacetResult holds the available filter values discovered from the index.
|
||||
// Values are sorted alphabetically and include only those present in the index.
|
||||
type FacetResult struct {
|
||||
Genres []string // distinct genre values
|
||||
Statuses []string // distinct status values
|
||||
}
|
||||
|
||||
// MeiliClient wraps the meilisearch-go SDK.
|
||||
type MeiliClient struct {
|
||||
idx meilisearch.IndexManager
|
||||
}
|
||||
|
||||
// New creates a MeiliClient. Call Configure() once at startup to ensure the
|
||||
// index exists and has the correct attribute settings.
|
||||
func New(host, apiKey string) *MeiliClient {
|
||||
cli := meilisearch.New(host, meilisearch.WithAPIKey(apiKey))
|
||||
return &MeiliClient{idx: cli.Index(indexName)}
|
||||
}
|
||||
|
||||
// Configure creates the index if absent and sets searchable/filterable
|
||||
// attributes. It is idempotent — safe to call on every startup.
|
||||
func Configure(host, apiKey string) error {
|
||||
cli := meilisearch.New(host, meilisearch.WithAPIKey(apiKey))
|
||||
|
||||
// Create index with primary key. Returns 202 if exists — ignore.
|
||||
task, err := cli.CreateIndex(&meilisearch.IndexConfig{
|
||||
Uid: indexName,
|
||||
PrimaryKey: "slug",
|
||||
})
|
||||
if err != nil {
|
||||
// 400 "index_already_exists" is not an error here; the SDK returns
|
||||
// an error with Code "index_already_exists" which we can ignore.
|
||||
// Any other error is fatal.
|
||||
if apiErr, ok := err.(*meilisearch.Error); ok && apiErr.MeilisearchApiError.Code == "index_already_exists" {
|
||||
// already exists — continue
|
||||
} else {
|
||||
return fmt.Errorf("meili: create index: %w", err)
|
||||
}
|
||||
} else {
|
||||
_ = task // task is async; we don't wait for it
|
||||
}
|
||||
|
||||
idx := cli.Index(indexName)
|
||||
|
||||
searchable := []string{"title", "author", "genres", "summary"}
|
||||
if _, err := idx.UpdateSearchableAttributes(&searchable); err != nil {
|
||||
return fmt.Errorf("meili: update searchable attributes: %w", err)
|
||||
}
|
||||
|
||||
filterable := []interface{}{"status", "genres"}
|
||||
if _, err := idx.UpdateFilterableAttributes(&filterable); err != nil {
|
||||
return fmt.Errorf("meili: update filterable attributes: %w", err)
|
||||
}
|
||||
|
||||
sortable := []string{"rank", "rating", "total_chapters", "meta_updated"}
|
||||
if _, err := idx.UpdateSortableAttributes(&sortable); err != nil {
|
||||
return fmt.Errorf("meili: update sortable attributes: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// bookDoc is the Meilisearch document shape for a book.
|
||||
type bookDoc struct {
|
||||
Slug string `json:"slug"`
|
||||
Title string `json:"title"`
|
||||
Author string `json:"author"`
|
||||
Cover string `json:"cover"`
|
||||
Status string `json:"status"`
|
||||
Genres []string `json:"genres"`
|
||||
Summary string `json:"summary"`
|
||||
TotalChapters int `json:"total_chapters"`
|
||||
SourceURL string `json:"source_url"`
|
||||
Rank int `json:"rank"`
|
||||
Rating float64 `json:"rating"`
|
||||
// MetaUpdated is the Unix timestamp (seconds) of the last PocketBase update.
|
||||
// Used for sort=update ("recently updated" ordering).
|
||||
MetaUpdated int64 `json:"meta_updated"`
|
||||
}
|
||||
|
||||
func toDoc(b domain.BookMeta) bookDoc {
|
||||
return bookDoc{
|
||||
Slug: b.Slug,
|
||||
Title: b.Title,
|
||||
Author: b.Author,
|
||||
Cover: b.Cover,
|
||||
Status: b.Status,
|
||||
Genres: b.Genres,
|
||||
Summary: b.Summary,
|
||||
TotalChapters: b.TotalChapters,
|
||||
SourceURL: b.SourceURL,
|
||||
Rank: b.Ranking,
|
||||
Rating: b.Rating,
|
||||
MetaUpdated: b.MetaUpdated,
|
||||
}
|
||||
}
|
||||
|
||||
func fromDoc(d bookDoc) domain.BookMeta {
|
||||
return domain.BookMeta{
|
||||
Slug: d.Slug,
|
||||
Title: d.Title,
|
||||
Author: d.Author,
|
||||
Cover: d.Cover,
|
||||
Status: d.Status,
|
||||
Genres: d.Genres,
|
||||
Summary: d.Summary,
|
||||
TotalChapters: d.TotalChapters,
|
||||
SourceURL: d.SourceURL,
|
||||
Ranking: d.Rank,
|
||||
Rating: d.Rating,
|
||||
MetaUpdated: d.MetaUpdated,
|
||||
}
|
||||
}
|
||||
|
||||
// UpsertBook adds or replaces the book document in Meilisearch. The operation
|
||||
// is fire-and-forget (Meilisearch processes tasks asynchronously).
|
||||
func (c *MeiliClient) UpsertBook(_ context.Context, book domain.BookMeta) error {
|
||||
docs := []bookDoc{toDoc(book)}
|
||||
pk := "slug"
|
||||
if _, err := c.idx.AddDocuments(docs, &meilisearch.DocumentOptions{PrimaryKey: &pk}); err != nil {
|
||||
return fmt.Errorf("meili: upsert book %q: %w", book.Slug, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// BookExists reports whether the slug is already present in the index.
|
||||
// It fetches the document by primary key; a 404 or any error is treated as
|
||||
// "not present" (safe default: re-index rather than silently skip).
|
||||
func (c *MeiliClient) BookExists(_ context.Context, slug string) bool {
|
||||
var doc bookDoc
|
||||
err := c.idx.GetDocument(slug, nil, &doc)
|
||||
return err == nil && doc.Slug != ""
|
||||
}
|
||||
|
||||
// Search returns books matching query, up to limit results.
|
||||
func (c *MeiliClient) Search(_ context.Context, query string, limit int) ([]domain.BookMeta, error) {
|
||||
if limit <= 0 {
|
||||
limit = 20
|
||||
}
|
||||
res, err := c.idx.Search(query, &meilisearch.SearchRequest{
|
||||
Limit: int64(limit),
|
||||
})
|
||||
if err != nil {
|
||||
		return nil, fmt.Errorf("meili: search %q: %w", query, err)
	}

	books := make([]domain.BookMeta, 0, len(res.Hits))
	for _, hit := range res.Hits {
		// Hit is map[string]json.RawMessage — unmarshal directly into bookDoc.
		var doc bookDoc
		raw, err := json.Marshal(hit)
		if err != nil {
			continue
		}
		if err := json.Unmarshal(raw, &doc); err != nil {
			continue
		}
		books = append(books, fromDoc(doc))
	}
	return books, nil
}

// Catalogue queries books with optional full-text search, genre/status filters,
// sort order, and pagination. Returns matching books, the total estimate, and
// a FacetResult containing available genre and status values from the index.
func (c *MeiliClient) Catalogue(_ context.Context, q CatalogueQuery) ([]domain.BookMeta, int64, FacetResult, error) {
	if q.Limit <= 0 {
		q.Limit = 20
	}
	if q.Page <= 0 {
		q.Page = 1
	}

	req := &meilisearch.SearchRequest{
		Limit:  int64(q.Limit),
		Offset: int64((q.Page - 1) * q.Limit),
		// Request facet distribution so the UI can build filter options
		// dynamically without hardcoding genre/status lists.
		Facets: []string{"genres", "status"},
	}

	// Build filter
	var filters []string
	if q.Genre != "" && q.Genre != "all" {
		filters = append(filters, fmt.Sprintf("genres = %q", q.Genre))
	}
	if q.Status != "" && q.Status != "all" {
		filters = append(filters, fmt.Sprintf("status = %q", q.Status))
	}
	if len(filters) > 0 {
		req.Filter = strings.Join(filters, " AND ")
	}

	// Map UI sort tokens to Meilisearch sort expressions.
	switch q.Sort {
	case "rank":
		req.Sort = []string{"rank:asc"}
	case "top-rated":
		req.Sort = []string{"rating:desc"}
	case "new":
		req.Sort = []string{"total_chapters:desc"}
	case "update":
		req.Sort = []string{"meta_updated:desc"}
		// "popular" and "" → relevance (no explicit sort)
	}

	res, err := c.idx.Search(q.Q, req)
	if err != nil {
		return nil, 0, FacetResult{}, fmt.Errorf("meili: catalogue query: %w", err)
	}

	books := make([]domain.BookMeta, 0, len(res.Hits))
	for _, hit := range res.Hits {
		var doc bookDoc
		raw, err := json.Marshal(hit)
		if err != nil {
			continue
		}
		if err := json.Unmarshal(raw, &doc); err != nil {
			continue
		}
		books = append(books, fromDoc(doc))
	}

	facets := parseFacets(res.FacetDistribution)
	return books, res.EstimatedTotalHits, facets, nil
}

// parseFacets extracts sorted genre and status slices from a Meilisearch
// facetDistribution raw JSON value.
// The JSON shape is: {"genres":{"fantasy":12,"action":5},"status":{"ongoing":7}}
func parseFacets(raw json.RawMessage) FacetResult {
	var result FacetResult
	if len(raw) == 0 {
		return result
	}
	var dist map[string]map[string]int64
	if err := json.Unmarshal(raw, &dist); err != nil {
		return result
	}
	if m, ok := dist["genres"]; ok {
		for k := range m {
			result.Genres = append(result.Genres, k)
		}
		sortStrings(result.Genres)
	}
	if m, ok := dist["status"]; ok {
		for k := range m {
			result.Statuses = append(result.Statuses, k)
		}
		sortStrings(result.Statuses)
	}
	return result
}

// sortStrings sorts a slice of strings in place.
func sortStrings(s []string) {
	for i := 1; i < len(s); i++ {
		for j := i; j > 0 && s[j] < s[j-1]; j-- {
			s[j], s[j-1] = s[j-1], s[j]
		}
	}
}

// NoopClient is a no-op Client used when Meilisearch is not configured.
type NoopClient struct{}

func (NoopClient) UpsertBook(_ context.Context, _ domain.BookMeta) error { return nil }
func (NoopClient) BookExists(_ context.Context, _ string) bool           { return false }
func (NoopClient) Search(_ context.Context, _ string, _ int) ([]domain.BookMeta, error) {
	return nil, nil
}
func (NoopClient) Catalogue(_ context.Context, _ CatalogueQuery) ([]domain.BookMeta, int64, FacetResult, error) {
	return nil, 0, FacetResult{}, nil
}
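For orientation, here is a sketch of how an HTTP handler might drive Catalogue. Only the CatalogueQuery field names and the return values come from the code above; the handler name, the query-parameter names, and the assumed imports (net/http, encoding/json, strconv) are illustrative and not part of this commit.

	// handleCatalogue is hypothetical; it only illustrates the Catalogue contract.
	func handleCatalogue(idx *MeiliClient) http.HandlerFunc {
		return func(w http.ResponseWriter, r *http.Request) {
			page, _ := strconv.Atoi(r.URL.Query().Get("page")) // <= 0 falls back to page 1
			q := CatalogueQuery{
				Q:      r.URL.Query().Get("q"),
				Genre:  r.URL.Query().Get("genre"),  // "" or "all" means no genre filter
				Status: r.URL.Query().Get("status"), // "" or "all" means no status filter
				Sort:   r.URL.Query().Get("sort"),   // rank | top-rated | new | update
				Page:   page,                        // Limit left 0, so it defaults to 20
			}
			books, total, facets, err := idx.Catalogue(r.Context(), q)
			if err != nil {
				http.Error(w, "search unavailable", http.StatusBadGateway)
				return
			}
			_ = json.NewEncoder(w).Encode(map[string]any{
				"items": books, "total": total, "facets": facets,
			})
		}
	}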
@@ -3,13 +3,31 @@
package htmlutil

import (
	"net/url"
	"regexp"
	"strings"

-	"github.com/libnovel/scraper/internal/scraper"
+	"github.com/libnovel/backend/internal/scraper"
	"golang.org/x/net/html"
)

// ResolveURL returns an absolute URL. If href is already absolute it is
// returned unchanged. Otherwise it is resolved against base.
func ResolveURL(base, href string) string {
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}
	b, err := url.Parse(base)
	if err != nil {
		return base + href
	}
	ref, err := url.Parse(href)
	if err != nil {
		return base + href
	}
	return b.ResolveReference(ref).String()
}

// ParseHTML parses raw HTML and returns the root node.
func ParseHTML(raw string) (*html.Node, error) {
	return html.Parse(strings.NewReader(raw))
@@ -48,8 +66,8 @@ matched:
	return true
}

-// attrVal returns the value of attribute key from node n.
-func attrVal(n *html.Node, key string) string {
+// AttrVal returns the value of attribute key from node n.
+func AttrVal(n *html.Node, key string) string {
	for _, a := range n.Attr {
		if a.Key == key {
			return a.Val
@@ -58,8 +76,8 @@ func attrVal(n *html.Node, key string) string {
	return ""
}

-// textContent returns the concatenated text content of all descendant text nodes.
-func textContent(n *html.Node) string {
+// TextContent returns the concatenated text content of all descendant text nodes.
+func TextContent(n *html.Node) string {
	var sb strings.Builder
	var walk func(*html.Node)
	walk = func(cur *html.Node) {
@@ -114,9 +132,9 @@ func FindAll(root *html.Node, sel scraper.Selector) []*html.Node {
// If sel.Attr is set the attribute value is returned; otherwise the inner text.
func ExtractText(n *html.Node, sel scraper.Selector) string {
	if sel.Attr != "" {
-		return attrVal(n, sel.Attr)
+		return AttrVal(n, sel.Attr)
	}
-	return textContent(n)
+	return TextContent(n)
}

// ExtractFirst locates the first match in root and returns its text/attr value.
@@ -140,29 +158,15 @@ func ExtractAll(root *html.Node, sel scraper.Selector) []string {
	return out
}

// InnerHTML returns the serialized inner HTML of node n.
func InnerHTML(n *html.Node) string {
	var sb strings.Builder
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		_ = html.Render(&sb, c)
	}
	return sb.String()
}

// NodeToMarkdown converts the children of an HTML node to a plain-text/Markdown
-// representation suitable for chapter storage. Block elements become newlines;
-// inline elements are inlined. Runs of more than one blank line are collapsed
-// to a single blank line.
+// representation suitable for chapter storage.
func NodeToMarkdown(n *html.Node) string {
	var sb strings.Builder
	nodeToMD(n, &sb)
	// Collapse 3+ consecutive newlines (i.e. more than one blank line) to 2.
	out := multiBlankLine.ReplaceAllString(sb.String(), "\n\n")
	return strings.TrimSpace(out)
}

// multiBlankLine matches three or more consecutive newline characters
// (any mix of \n and surrounding whitespace-only lines).
var multiBlankLine = regexp.MustCompile(`\n(\s*\n){2,}`)
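Concretely: a rendered body of "Line one.\n\n\n\nLine two." comes out of NodeToMarkdown as "Line one.\n\nLine two.", so at most one blank line ever separates paragraphs in stored chapter text.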
var blockElements = map[string]bool{
509
backend/internal/novelfire/scraper.go
Normal file
@@ -0,0 +1,509 @@
// Package novelfire provides a NovelScraper implementation for novelfire.net.
//
// Site structure (as of 2025):
//
//	Catalogue : https://novelfire.net/genre-all/sort-new/status-all/all-novel?page=N
//	Book page : https://novelfire.net/book/{slug}
//	Chapters  : https://novelfire.net/book/{slug}/chapters?page=N
//	Chapter   : https://novelfire.net/book/{slug}/{chapter-slug}
package novelfire

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"net/url"
	"path"
	"strconv"
	"strings"
	"time"

	"github.com/libnovel/backend/internal/browser"
	"github.com/libnovel/backend/internal/domain"
	"github.com/libnovel/backend/internal/novelfire/htmlutil"
	"github.com/libnovel/backend/internal/scraper"
	"golang.org/x/net/html"
)

const (
	baseURL       = "https://novelfire.net"
	cataloguePath = "/genre-all/sort-new/status-all/all-novel"
	rankingPath   = "/genre-all/sort-popular/status-all/all-novel"
)

// Scraper is the novelfire.net implementation of scraper.NovelScraper.
type Scraper struct {
	client browser.Client
	log    *slog.Logger
}

// Compile-time interface check.
var _ scraper.NovelScraper = (*Scraper)(nil)

// New returns a new novelfire Scraper backed by client.
func New(client browser.Client, log *slog.Logger) *Scraper {
	if log == nil {
		log = slog.Default()
	}
	return &Scraper{client: client, log: log}
}

// SourceName implements NovelScraper.
func (s *Scraper) SourceName() string { return "novelfire.net" }

// ── CatalogueProvider ─────────────────────────────────────────────────────────

// ScrapeCatalogue streams all CatalogueEntry values across all catalogue pages.
func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan domain.CatalogueEntry, <-chan error) {
	entries := make(chan domain.CatalogueEntry, 64)
	errs := make(chan error, 16)

	go func() {
		defer close(entries)
		defer close(errs)

		pageURL := baseURL + cataloguePath
		page := 1

		for pageURL != "" {
			select {
			case <-ctx.Done():
				return
			default:
			}

			s.log.Info("scraping catalogue page", "page", page, "url", pageURL)
			raw, err := s.client.GetContent(ctx, pageURL)
			if err != nil {
				errs <- fmt.Errorf("catalogue page %d: %w", page, err)
				return
			}

			root, err := htmlutil.ParseHTML(raw)
			if err != nil {
				errs <- fmt.Errorf("catalogue page %d parse: %w", page, err)
				return
			}

			cards := htmlutil.FindAll(root, scraper.Selector{Tag: "li", Class: "novel-item", Multiple: true})
			if len(cards) == 0 {
				s.log.Warn("no novel cards found, stopping pagination", "page", page)
				return
			}

			for _, card := range cards {
				linkNode := htmlutil.FindFirst(card, scraper.Selector{Tag: "a", Attr: "href"})
				titleNode := htmlutil.FindFirst(card, scraper.Selector{Tag: "h4", Class: "novel-title"})

				var title, href string
				if linkNode != nil {
					href = htmlutil.ExtractText(linkNode, scraper.Selector{Tag: "a", Attr: "href"})
				}
				if titleNode != nil {
					title = strings.TrimSpace(htmlutil.ExtractText(titleNode, scraper.Selector{}))
				}
				if href == "" || title == "" {
					continue
				}

				bookURL := resolveURL(baseURL, href)
				select {
				case <-ctx.Done():
					return
				case entries <- domain.CatalogueEntry{Slug: slugFromURL(bookURL), Title: title, URL: bookURL}:
				}
			}

			if !hasNextPageLink(root) {
				break
			}
			nextHref := ""
			for _, a := range htmlutil.FindAll(root, scraper.Selector{Tag: "a", Multiple: true}) {
				if htmlutil.AttrVal(a, "rel") == "next" {
					nextHref = htmlutil.AttrVal(a, "href")
					break
				}
			}
			if nextHref == "" {
				break
			}
			pageURL = resolveURL(baseURL, nextHref)
			page++
		}
	}()

	return entries, errs
}
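A minimal consumer of this streaming API, for illustration (the client, logger and ctx variables are assumed to exist at the call site; the closing of both channels is guaranteed by the producer goroutine above):

	sc := New(client, logger)
	entries, errs := sc.ScrapeCatalogue(ctx)
	for e := range entries { // closed by the producer when pagination ends
		fmt.Println(e.Slug, e.Title, e.URL)
	}
	if err := <-errs; err != nil { // nil once errs is closed and drained
		logger.Error("catalogue walk aborted", "err", err)
	}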

// ── MetadataProvider ──────────────────────────────────────────────────────────

// ScrapeMetadata fetches and parses book metadata from the book's landing page.
func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (domain.BookMeta, error) {
	s.log.Debug("metadata fetch starting", "url", bookURL)

	raw, err := s.client.GetContent(ctx, bookURL)
	if err != nil {
		return domain.BookMeta{}, fmt.Errorf("metadata fetch %s: %w", bookURL, err)
	}

	root, err := htmlutil.ParseHTML(raw)
	if err != nil {
		return domain.BookMeta{}, fmt.Errorf("metadata parse %s: %w", bookURL, err)
	}

	title := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "h1", Class: "novel-title"})
	author := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "author"})

	var cover string
	if fig := htmlutil.FindFirst(root, scraper.Selector{Tag: "figure", Class: "cover"}); fig != nil {
		cover = htmlutil.ExtractFirst(fig, scraper.Selector{Tag: "img", Attr: "src"})
		if cover != "" && !strings.HasPrefix(cover, "http") {
			cover = baseURL + cover
		}
	}

	status := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "status"})

	genresNode := htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "genres"})
	var genres []string
	if genresNode != nil {
		genres = htmlutil.ExtractAll(genresNode, scraper.Selector{Tag: "a", Multiple: true})
	}

	summary := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "div", Class: "summary"})
	totalStr := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "chapter-count"})
	totalChapters := parseChapterCount(totalStr)

	slug := slugFromURL(bookURL)

	meta := domain.BookMeta{
		Slug:          slug,
		Title:         title,
		Author:        author,
		Cover:         cover,
		Status:        status,
		Genres:        genres,
		Summary:       summary,
		TotalChapters: totalChapters,
		SourceURL:     bookURL,
	}
	s.log.Debug("metadata parsed", "slug", meta.Slug, "title", meta.Title)
	return meta, nil
}

// ── ChapterListProvider ───────────────────────────────────────────────────────

// ScrapeChapterList returns chapter references for a book, ordered ascending.
// upTo > 0 stops pagination as soon as at least upTo chapter numbers have been
// collected — use this for range scrapes so we don't paginate 100 pages just
// to discover refs we'll never scrape. upTo == 0 fetches all pages.
// Each page fetch uses retryGet with 429-aware exponential backoff.
func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string, upTo int) ([]domain.ChapterRef, error) {
	var refs []domain.ChapterRef
	baseChapterURL := strings.TrimRight(bookURL, "/") + "/chapters"
	page := 1

	for {
		select {
		case <-ctx.Done():
			return refs, ctx.Err()
		default:
		}

		pageURL := fmt.Sprintf("%s?page=%d", baseChapterURL, page)
		s.log.Info("scraping chapter list", "page", page, "url", pageURL)

		raw, err := retryGet(ctx, s.log, s.client, pageURL, 9, 6*time.Second)
		if err != nil {
			return refs, fmt.Errorf("chapter list page %d: %w", page, err)
		}

		root, err := htmlutil.ParseHTML(raw)
		if err != nil {
			return refs, fmt.Errorf("chapter list page %d parse: %w", page, err)
		}

		chapterList := htmlutil.FindFirst(root, scraper.Selector{Class: "chapter-list"})
		if chapterList == nil {
			s.log.Debug("chapter list container not found, stopping pagination", "page", page)
			break
		}

		items := htmlutil.FindAll(chapterList, scraper.Selector{Tag: "li"})
		if len(items) == 0 {
			break
		}

		for _, item := range items {
			linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a"})
			if linkNode == nil {
				continue
			}
			href := htmlutil.ExtractText(linkNode, scraper.Selector{Attr: "href"})
			chTitle := htmlutil.ExtractText(linkNode, scraper.Selector{})
			if href == "" {
				continue
			}
			chURL := resolveURL(baseURL, href)
			num := chapterNumberFromURL(chURL)
			if num <= 0 {
				num = len(refs) + 1
				s.log.Warn("chapter number not parseable from URL, falling back to position",
					"url", chURL, "position", num)
			}
			refs = append(refs, domain.ChapterRef{
				Number: num,
				Title:  strings.TrimSpace(chTitle),
				URL:    chURL,
			})
		}

		// Early-stop: if we have seen at least upTo chapter numbers, we have
		// enough refs to cover the requested range — no need to paginate further.
		if upTo > 0 && len(refs) > 0 && refs[len(refs)-1].Number >= upTo {
			s.log.Debug("chapter list early-stop reached", "upTo", upTo, "collected", len(refs))
			break
		}

		page++
	}

	return refs, nil
}

// ── ChapterTextProvider ───────────────────────────────────────────────────────

// ScrapeChapterText fetches and parses a single chapter page.
func (s *Scraper) ScrapeChapterText(ctx context.Context, ref domain.ChapterRef) (domain.Chapter, error) {
	s.log.Debug("chapter text fetch starting", "chapter", ref.Number, "url", ref.URL)

	raw, err := retryGet(ctx, s.log, s.client, ref.URL, 9, 6*time.Second)
	if err != nil {
		return domain.Chapter{}, fmt.Errorf("chapter %d fetch: %w", ref.Number, err)
	}

	root, err := htmlutil.ParseHTML(raw)
	if err != nil {
		return domain.Chapter{}, fmt.Errorf("chapter %d parse: %w", ref.Number, err)
	}

	container := htmlutil.FindFirst(root, scraper.Selector{ID: "content"})
	if container == nil {
		return domain.Chapter{}, fmt.Errorf("chapter %d: #content container not found in %s", ref.Number, ref.URL)
	}

	text := htmlutil.NodeToMarkdown(container)

	s.log.Debug("chapter text parsed", "chapter", ref.Number, "text_bytes", len(text))

	return domain.Chapter{Ref: ref, Text: text}, nil
}

// ── RankingProvider ───────────────────────────────────────────────────────────

// ScrapeRanking pages through up to maxPages pages of the popular-novels listing.
// maxPages <= 0 means all pages. The caller decides whether to persist items.
func (s *Scraper) ScrapeRanking(ctx context.Context, maxPages int) (<-chan domain.BookMeta, <-chan error) {
	entries := make(chan domain.BookMeta, 32)
	errs := make(chan error, 16)

	go func() {
		defer close(entries)
		defer close(errs)

		rank := 1

		for page := 1; maxPages <= 0 || page <= maxPages; page++ {
			select {
			case <-ctx.Done():
				return
			default:
			}

			pageURL := fmt.Sprintf("%s%s?page=%d", baseURL, rankingPath, page)
			s.log.Info("scraping popular ranking page", "page", page, "url", pageURL)

			raw, err := s.client.GetContent(ctx, pageURL)
			if err != nil {
				errs <- fmt.Errorf("ranking page %d: %w", page, err)
				return
			}

			root, err := htmlutil.ParseHTML(raw)
			if err != nil {
				errs <- fmt.Errorf("ranking page %d parse: %w", page, err)
				return
			}

			cards := htmlutil.FindAll(root, scraper.Selector{Tag: "li", Class: "novel-item", Multiple: true})
			if len(cards) == 0 {
				break
			}

			for _, card := range cards {
				linkNode := htmlutil.FindFirst(card, scraper.Selector{Tag: "a"})
				if linkNode == nil {
					continue
				}
				href := htmlutil.ExtractText(linkNode, scraper.Selector{Tag: "a", Attr: "href"})
				bookURL := resolveURL(baseURL, href)
				if bookURL == "" {
					continue
				}

				title := strings.TrimSpace(htmlutil.ExtractFirst(card, scraper.Selector{Tag: "h4", Class: "novel-title"}))
				if title == "" {
					title = strings.TrimSpace(htmlutil.ExtractText(linkNode, scraper.Selector{Tag: "a", Attr: "title"}))
				}
				if title == "" {
					continue
				}

				var cover string
				if fig := htmlutil.FindFirst(card, scraper.Selector{Tag: "figure", Class: "novel-cover"}); fig != nil {
					cover = htmlutil.ExtractFirst(fig, scraper.Selector{Tag: "img", Attr: "data-src"})
					if cover == "" {
						cover = htmlutil.ExtractFirst(fig, scraper.Selector{Tag: "img", Attr: "src"})
					}
					if strings.HasPrefix(cover, "data:") {
						cover = ""
					}
					if cover != "" && !strings.HasPrefix(cover, "http") {
						cover = baseURL + cover
					}
				}

				meta := domain.BookMeta{
					Slug:      slugFromURL(bookURL),
					Title:     title,
					Cover:     cover,
					SourceURL: bookURL,
					Ranking:   rank,
				}
				rank++

				select {
				case <-ctx.Done():
					return
				case entries <- meta:
				}
			}

			if !hasNextPageLink(root) {
				break
			}
		}
	}()

	return entries, errs
}

// ── helpers ───────────────────────────────────────────────────────────────────

func resolveURL(base, href string) string { return htmlutil.ResolveURL(base, href) }

func hasNextPageLink(root *html.Node) bool {
	links := htmlutil.FindAll(root, scraper.Selector{Tag: "a", Multiple: true})
	for _, a := range links {
		for _, attr := range a.Attr {
			if attr.Key == "rel" && attr.Val == "next" {
				return true
			}
		}
	}
	return false
}

func slugFromURL(bookURL string) string {
	u, err := url.Parse(bookURL)
	if err != nil {
		return bookURL
	}
	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
	if len(parts) >= 2 && parts[0] == "book" {
		return parts[1]
	}
	if len(parts) > 0 {
		return parts[len(parts)-1]
	}
	return ""
}

func parseChapterCount(s string) int {
	s = strings.ReplaceAll(s, ",", "")
	fields := strings.Fields(s)
	if len(fields) == 0 {
		return 0
	}
	n, _ := strconv.Atoi(fields[0])
	return n
}

func chapterNumberFromURL(chapterURL string) int {
	u, err := url.Parse(chapterURL)
	if err != nil {
		return 0
	}
	seg := path.Base(u.Path)
	seg = strings.TrimPrefix(seg, "chapter-")
	seg = strings.TrimPrefix(seg, "chap-")
	seg = strings.TrimPrefix(seg, "ch-")
	digits := strings.FieldsFunc(seg, func(r rune) bool {
		return r < '0' || r > '9'
	})
	if len(digits) == 0 {
		return 0
	}
	n, _ := strconv.Atoi(digits[0])
	return n
}
// retryGet calls client.GetContent up to maxAttempts times with exponential backoff.
// If the server returns 429 (a browser.RateLimitError), the suggested Retry-After
// delay is used instead of the geometric backoff delay.
func retryGet(
	ctx context.Context,
	log *slog.Logger,
	client browser.Client,
	pageURL string,
	maxAttempts int,
	baseDelay time.Duration,
) (string, error) {
	var lastErr error
	delay := baseDelay
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		raw, err := client.GetContent(ctx, pageURL)
		if err == nil {
			return raw, nil
		}
		lastErr = err
		if ctx.Err() != nil {
			return "", err
		}
		if attempt < maxAttempts {
			// If the server is rate-limiting us, honour its Retry-After delay.
			waitFor := delay
			var rlErr *browser.RateLimitError
			if errors.As(err, &rlErr) {
				waitFor = rlErr.RetryAfter
				if log != nil {
					log.Warn("rate limited, backing off",
						"url", pageURL, "attempt", attempt, "retry_in", waitFor)
				}
			} else {
				if log != nil {
					log.Warn("fetch failed, retrying",
						"url", pageURL, "attempt", attempt, "retry_in", delay, "err", err)
				}
				delay *= 2
			}
			select {
			case <-ctx.Done():
				return "", ctx.Err()
			case <-time.After(waitFor):
			}
		}
	}
	return "", lastErr
}
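For scale: at the call sites above (maxAttempts 9, baseDelay 6s), a URL that keeps failing without a Retry-After hint sleeps 6s, 12s, 24s, ... 768s between attempts, roughly 25.5 minutes in total (6s * (2^8 - 1) = 1530s), before retryGet gives up.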
129
backend/internal/novelfire/scraper_test.go
Normal file
@@ -0,0 +1,129 @@
package novelfire

import (
	"context"
	"testing"
)

func TestSlugFromURL(t *testing.T) {
	cases := []struct {
		url  string
		want string
	}{
		{"https://novelfire.net/book/shadow-slave", "shadow-slave"},
		{"https://novelfire.net/book/a-dragon-against-the-whole-world", "a-dragon-against-the-whole-world"},
		{"https://novelfire.net/book/foo/chapter-1", "foo"},
		{"https://novelfire.net/", ""},
		{"not-a-url", "not-a-url"},
	}
	for _, c := range cases {
		got := slugFromURL(c.url)
		if got != c.want {
			t.Errorf("slugFromURL(%q) = %q, want %q", c.url, got, c.want)
		}
	}
}

func TestChapterNumberFromURL(t *testing.T) {
	cases := []struct {
		url  string
		want int
	}{
		{"https://novelfire.net/book/shadow-slave/chapter-42", 42},
		{"https://novelfire.net/book/shadow-slave/chapter-1000", 1000},
		{"https://novelfire.net/book/shadow-slave/chap-7", 7},
		{"https://novelfire.net/book/shadow-slave/ch-3", 3},
		{"https://novelfire.net/book/shadow-slave/42", 42},
		{"https://novelfire.net/book/shadow-slave/no-number-here", 0},
		{"not-a-url", 0},
	}
	for _, c := range cases {
		got := chapterNumberFromURL(c.url)
		if got != c.want {
			t.Errorf("chapterNumberFromURL(%q) = %d, want %d", c.url, got, c.want)
		}
	}
}

func TestParseChapterCount(t *testing.T) {
	cases := []struct {
		in   string
		want int
	}{
		{"123 Chapters", 123},
		{"1,234 Chapters", 1234},
		{"0", 0},
		{"", 0},
		{"500", 500},
	}
	for _, c := range cases {
		got := parseChapterCount(c.in)
		if got != c.want {
			t.Errorf("parseChapterCount(%q) = %d, want %d", c.in, got, c.want)
		}
	}
}

func TestRetryGet_ContextCancellation(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel immediately

	stub := newStubClient()
	stub.setError("https://example.com/page", context.Canceled)

	_, err := retryGet(ctx, nil, stub, "https://example.com/page", 3, 0)
	if err == nil {
		t.Fatal("expected error on cancelled context")
	}
}

func TestRetryGet_EventualSuccess(t *testing.T) {
	stub := newStubClient()
	calls := 0
	stub.setFn("https://example.com/page", func() (string, error) {
		calls++
		if calls < 3 {
			return "", context.DeadlineExceeded
		}
		return "<html>ok</html>", nil
	})

	got, err := retryGet(context.Background(), nil, stub, "https://example.com/page", 5, 0)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if got != "<html>ok</html>" {
		t.Errorf("got %q, want html", got)
	}
	if calls != 3 {
		t.Errorf("expected 3 calls, got %d", calls)
	}
}

// ── minimal stub client for tests ─────────────────────────────────────────────

type stubClient struct {
	errors map[string]error
	fns    map[string]func() (string, error)
}

func newStubClient() *stubClient {
	return &stubClient{
		errors: make(map[string]error),
		fns:    make(map[string]func() (string, error)),
	}
}

func (s *stubClient) setError(u string, err error) { s.errors[u] = err }

func (s *stubClient) setFn(u string, fn func() (string, error)) { s.fns[u] = fn }

func (s *stubClient) GetContent(_ context.Context, pageURL string) (string, error) {
	if fn, ok := s.fns[pageURL]; ok {
		return fn()
	}
	if err, ok := s.errors[pageURL]; ok {
		return "", err
	}
	return "", context.DeadlineExceeded
}
222
backend/internal/orchestrator/orchestrator.go
Normal file
@@ -0,0 +1,222 @@
// Package orchestrator coordinates metadata extraction, chapter-list fetching,
// and parallel chapter scraping for a single book.
//
// Design:
//   - RunBook scrapes one book (metadata + chapter list + chapter texts) end-to-end.
//   - N worker goroutines pull chapter refs from a shared queue and call ScrapeChapterText.
//   - The caller (runner poll loop) owns the outer task-claim / finish cycle.
//   - An optional PostMetadata hook (set in Config) is called after WriteMetadata
//     succeeds. The runner uses this to upsert books into Meilisearch.
package orchestrator

import (
	"context"
	"fmt"
	"log/slog"
	"runtime"
	"sync"
	"sync/atomic"

	"github.com/libnovel/backend/internal/bookstore"
	"github.com/libnovel/backend/internal/domain"
	"github.com/libnovel/backend/internal/scraper"
)

// Config holds tunable parameters for the orchestrator.
type Config struct {
	// Workers is the number of goroutines used to scrape chapters in parallel.
	// Defaults to runtime.NumCPU() when 0.
	Workers int
	// PostMetadata is an optional hook called with the scraped BookMeta after
	// WriteMetadata succeeds. Errors from the hook are logged but not fatal.
	// Used by the runner to index books in Meilisearch.
	PostMetadata func(ctx context.Context, meta domain.BookMeta)
}
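The runner wires this hook up roughly as follows (a sketch only: the searchIndex, novelScraper, store and logger variables are assumptions standing in for the runner's real dependencies):

	o := New(Config{
		Workers: 8, // per-book chapter-scrape parallelism
		PostMetadata: func(ctx context.Context, meta domain.BookMeta) {
			// Index the freshly written book; a failure here is deliberately non-fatal.
			if err := searchIndex.UpsertBook(ctx, meta); err != nil {
				logger.Warn("meilisearch upsert failed", "slug", meta.Slug, "err", err)
			}
		},
	}, novelScraper, store, logger)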

// Orchestrator runs a single-book scrape pipeline.
type Orchestrator struct {
	novel        scraper.NovelScraper
	store        bookstore.BookWriter
	log          *slog.Logger
	workers      int
	postMetadata func(ctx context.Context, meta domain.BookMeta)
}

// New returns a new Orchestrator.
func New(cfg Config, novel scraper.NovelScraper, store bookstore.BookWriter, log *slog.Logger) *Orchestrator {
	if log == nil {
		log = slog.Default()
	}
	workers := cfg.Workers
	if workers <= 0 {
		workers = runtime.NumCPU()
	}
	return &Orchestrator{
		novel:        novel,
		store:        store,
		log:          log,
		workers:      workers,
		postMetadata: cfg.PostMetadata,
	}
}

// RunBook scrapes a single book described by task. It handles:
//  1. Metadata scrape + write
//  2. Chapter list scrape + write
//  3. Parallel chapter text scrape + write (worker pool)
//
// Returns a ScrapeResult with counters. The result's ErrorMessage is non-empty
// if the run failed at the metadata or chapter-list level.
func (o *Orchestrator) RunBook(ctx context.Context, task domain.ScrapeTask) domain.ScrapeResult {
	o.log.Info("orchestrator: RunBook starting",
		"task_id", task.ID,
		"kind", task.Kind,
		"url", task.TargetURL,
		"workers", o.workers,
	)

	var result domain.ScrapeResult

	if task.TargetURL == "" {
		result.ErrorMessage = "task has no target URL"
		return result
	}

	// ── Step 1: Metadata ──────────────────────────────────────────────────────
	meta, err := o.novel.ScrapeMetadata(ctx, task.TargetURL)
	if err != nil {
		o.log.Error("metadata scrape failed", "url", task.TargetURL, "err", err)
		result.ErrorMessage = fmt.Sprintf("metadata: %v", err)
		result.Errors++
		return result
	}

	if err := o.store.WriteMetadata(ctx, meta); err != nil {
		o.log.Error("metadata write failed", "slug", meta.Slug, "err", err)
		// non-fatal: continue to chapters
		result.Errors++
	} else {
		result.BooksFound = 1
		// Fire optional post-metadata hook (e.g. Meilisearch indexing).
		if o.postMetadata != nil {
			o.postMetadata(ctx, meta)
		}
	}

	o.log.Info("metadata saved", "slug", meta.Slug, "title", meta.Title)

	// ── Step 2: Chapter list ──────────────────────────────────────────────────
	refs, err := o.novel.ScrapeChapterList(ctx, task.TargetURL, task.ToChapter)
	if err != nil {
		o.log.Error("chapter list scrape failed", "slug", meta.Slug, "err", err)
		result.ErrorMessage = fmt.Sprintf("chapter list: %v", err)
		result.Errors++
		return result
	}

	o.log.Info("chapter list fetched", "slug", meta.Slug, "chapters", len(refs))

	// Persist chapter refs (without text) so the index exists early.
	if wErr := o.store.WriteChapterRefs(ctx, meta.Slug, refs); wErr != nil {
		o.log.Warn("chapter refs write failed", "slug", meta.Slug, "err", wErr)
	}

	// ── Step 3: Chapter texts (worker pool) ───────────────────────────────────
	type chapterJob struct {
		slug  string
		ref   domain.ChapterRef
		total int // total chapters to scrape (for progress logging)
	}
	work := make(chan chapterJob, o.workers*4)

	var scraped, skipped, errors atomic.Int64
	var wg sync.WaitGroup

	for i := 0; i < o.workers; i++ {
		wg.Add(1)
		go func(workerID int) {
			defer wg.Done()
			for job := range work {
				select {
				case <-ctx.Done():
					return
				default:
				}

				if o.store.ChapterExists(ctx, job.slug, job.ref) {
					o.log.Debug("chapter already exists, skipping",
						"slug", job.slug, "chapter", job.ref.Number)
					skipped.Add(1)
					continue
				}

				ch, err := o.novel.ScrapeChapterText(ctx, job.ref)
				if err != nil {
					o.log.Error("chapter scrape failed",
						"slug", job.slug, "chapter", job.ref.Number, "err", err)
					errors.Add(1)
					continue
				}

				if err := o.store.WriteChapter(ctx, job.slug, ch); err != nil {
					o.log.Error("chapter write failed",
						"slug", job.slug, "chapter", job.ref.Number, "err", err)
					errors.Add(1)
					continue
				}

				n := scraped.Add(1)
				// Log a progress summary every 25 chapters scraped.
				if n%25 == 0 {
					o.log.Info("scraping chapters",
						"slug", job.slug, "scraped", n, "total", job.total)
				}
			}
		}(i)
	}

	// Count how many chapters will actually be enqueued (for progress logging).
	toScrape := 0
	for _, ref := range refs {
		if task.FromChapter > 0 && ref.Number < task.FromChapter {
			continue
		}
		if task.ToChapter > 0 && ref.Number > task.ToChapter {
			continue
		}
		toScrape++
	}

	// Enqueue chapter jobs respecting the optional range filter from the task.
	for _, ref := range refs {
		if task.FromChapter > 0 && ref.Number < task.FromChapter {
			skipped.Add(1)
			continue
		}
		if task.ToChapter > 0 && ref.Number > task.ToChapter {
			skipped.Add(1)
			continue
		}
		select {
		case <-ctx.Done():
			goto drain
		case work <- chapterJob{slug: meta.Slug, ref: ref, total: toScrape}:
		}
	}

drain:
	close(work)
	wg.Wait()

	result.ChaptersScraped = int(scraped.Load())
	result.ChaptersSkipped = int(skipped.Load())
	result.Errors += int(errors.Load())

	o.log.Info("book scrape finished",
		"slug", meta.Slug,
		"scraped", result.ChaptersScraped,
		"skipped", result.ChaptersSkipped,
		"errors", result.Errors,
	)
	return result
}
210
backend/internal/orchestrator/orchestrator_test.go
Normal file
@@ -0,0 +1,210 @@
package orchestrator

import (
	"context"
	"errors"
	"strconv"
	"sync"
	"testing"

	"github.com/libnovel/backend/internal/domain"
)

// ── stubs ─────────────────────────────────────────────────────────────────────

type stubScraper struct {
	meta     domain.BookMeta
	metaErr  error
	refs     []domain.ChapterRef
	refsErr  error
	chapters map[int]domain.Chapter
	chapErr  map[int]error
}

func (s *stubScraper) SourceName() string { return "stub" }

func (s *stubScraper) ScrapeCatalogue(ctx context.Context) (<-chan domain.CatalogueEntry, <-chan error) {
	ch := make(chan domain.CatalogueEntry)
	errs := make(chan error)
	close(ch)
	close(errs)
	return ch, errs
}

func (s *stubScraper) ScrapeMetadata(_ context.Context, _ string) (domain.BookMeta, error) {
	return s.meta, s.metaErr
}

func (s *stubScraper) ScrapeChapterList(_ context.Context, _ string, _ int) ([]domain.ChapterRef, error) {
	return s.refs, s.refsErr
}

func (s *stubScraper) ScrapeChapterText(_ context.Context, ref domain.ChapterRef) (domain.Chapter, error) {
	if s.chapErr != nil {
		if err, ok := s.chapErr[ref.Number]; ok {
			return domain.Chapter{}, err
		}
	}
	if s.chapters != nil {
		if ch, ok := s.chapters[ref.Number]; ok {
			return ch, nil
		}
	}
	return domain.Chapter{Ref: ref, Text: "text"}, nil
}

func (s *stubScraper) ScrapeRanking(ctx context.Context, maxPages int) (<-chan domain.BookMeta, <-chan error) {
	ch := make(chan domain.BookMeta)
	errs := make(chan error)
	close(ch)
	close(errs)
	return ch, errs
}

type stubStore struct {
	mu              sync.Mutex
	metaWritten     []domain.BookMeta
	chaptersWritten []domain.Chapter
	existing        map[string]bool // "slug:N" → exists
	writeMetaErr    error
}

func (s *stubStore) WriteMetadata(_ context.Context, meta domain.BookMeta) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.writeMetaErr != nil {
		return s.writeMetaErr
	}
	s.metaWritten = append(s.metaWritten, meta)
	return nil
}

func (s *stubStore) WriteChapter(_ context.Context, slug string, ch domain.Chapter) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.chaptersWritten = append(s.chaptersWritten, ch)
	return nil
}

func (s *stubStore) WriteChapterRefs(_ context.Context, _ string, _ []domain.ChapterRef) error {
	return nil
}

func (s *stubStore) ChapterExists(_ context.Context, slug string, ref domain.ChapterRef) bool {
	s.mu.Lock()
	defer s.mu.Unlock()
	key := slug + ":" + strconv.Itoa(ref.Number) // strconv, not rune arithmetic: numbers can exceed 9
	return s.existing[key]
}

// ── tests ──────────────────────────────────────────────────────────────────────

func TestRunBook_HappyPath(t *testing.T) {
	sc := &stubScraper{
		meta: domain.BookMeta{Slug: "test-book", Title: "Test Book", SourceURL: "https://example.com/book/test-book"},
		refs: []domain.ChapterRef{
			{Number: 1, Title: "Ch 1", URL: "https://example.com/book/test-book/chapter-1"},
			{Number: 2, Title: "Ch 2", URL: "https://example.com/book/test-book/chapter-2"},
			{Number: 3, Title: "Ch 3", URL: "https://example.com/book/test-book/chapter-3"},
		},
	}
	st := &stubStore{}
	o := New(Config{Workers: 2}, sc, st, nil)

	task := domain.ScrapeTask{
		ID:        "t1",
		Kind:      "book",
		TargetURL: "https://example.com/book/test-book",
	}

	result := o.RunBook(context.Background(), task)

	if result.ErrorMessage != "" {
		t.Fatalf("unexpected error: %s", result.ErrorMessage)
	}
	if result.BooksFound != 1 {
		t.Errorf("BooksFound = %d, want 1", result.BooksFound)
	}
	if result.ChaptersScraped != 3 {
		t.Errorf("ChaptersScraped = %d, want 3", result.ChaptersScraped)
	}
}

func TestRunBook_MetadataError(t *testing.T) {
	sc := &stubScraper{metaErr: errors.New("404 not found")}
	st := &stubStore{}
	o := New(Config{Workers: 1}, sc, st, nil)

	result := o.RunBook(context.Background(), domain.ScrapeTask{
		ID:        "t2",
		TargetURL: "https://example.com/book/missing",
	})

	if result.ErrorMessage == "" {
		t.Fatal("expected ErrorMessage to be set")
	}
	if result.Errors != 1 {
		t.Errorf("Errors = %d, want 1", result.Errors)
	}
}

func TestRunBook_ChapterRange(t *testing.T) {
	sc := &stubScraper{
		meta: domain.BookMeta{Slug: "range-book", SourceURL: "https://example.com/book/range-book"},
		refs: func() []domain.ChapterRef {
			var refs []domain.ChapterRef
			for i := 1; i <= 10; i++ {
				refs = append(refs, domain.ChapterRef{Number: i, URL: "https://example.com/book/range-book/chapter-" + strconv.Itoa(i)})
			}
			return refs
		}(),
	}
	st := &stubStore{}
	o := New(Config{Workers: 2}, sc, st, nil)

	result := o.RunBook(context.Background(), domain.ScrapeTask{
		ID:          "t3",
		TargetURL:   "https://example.com/book/range-book",
		FromChapter: 3,
		ToChapter:   7,
	})

	if result.ErrorMessage != "" {
		t.Fatalf("unexpected error: %s", result.ErrorMessage)
	}
	// chapters 3–7 = 5 scraped, chapters 1–2 and 8–10 = 5 skipped
	if result.ChaptersScraped != 5 {
		t.Errorf("ChaptersScraped = %d, want 5", result.ChaptersScraped)
	}
	if result.ChaptersSkipped != 5 {
		t.Errorf("ChaptersSkipped = %d, want 5", result.ChaptersSkipped)
	}
}

func TestRunBook_ContextCancellation(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	sc := &stubScraper{
		meta: domain.BookMeta{Slug: "ctx-book", SourceURL: "https://example.com/book/ctx-book"},
		refs: []domain.ChapterRef{
			{Number: 1, URL: "https://example.com/book/ctx-book/chapter-1"},
		},
	}
	st := &stubStore{}
	o := New(Config{Workers: 1}, sc, st, nil)

	// Should not panic; result may have errors or zero chapters.
	result := o.RunBook(ctx, domain.ScrapeTask{
		ID:        "t4",
		TargetURL: "https://example.com/book/ctx-book",
	})
	_ = result
}

func TestRunBook_EmptyTargetURL(t *testing.T) {
	o := New(Config{Workers: 1}, &stubScraper{}, &stubStore{}, nil)
	result := o.RunBook(context.Background(), domain.ScrapeTask{ID: "t5"})
	if result.ErrorMessage == "" {
		t.Fatal("expected ErrorMessage for empty target URL")
	}
}
96
backend/internal/presigncache/cache.go
Normal file
@@ -0,0 +1,96 @@
// Package presigncache provides a Valkey (Redis-compatible) backed cache for
// MinIO presigned URLs. The backend generates presigned URLs and stores them
// here with a TTL; subsequent requests for the same key return the cached URL
// without re-contacting MinIO.
//
// Design:
//   - The cache is intentionally best-effort: Get reports ok=false on a miss
//     or a Valkey error, so callers always have a fallback path to regenerate.
//   - Set failures are non-fatal — a miss on the next request is acceptable.
//   - TTL should be set shorter than the actual presigned URL lifetime so that
//     cached URLs are always valid when served. Recommended: 55 minutes for a
//     1-hour presigned URL.
package presigncache

import (
	"context"
	"fmt"
	"time"

	"github.com/redis/go-redis/v9"
)

// Cache is the interface for presign URL caching.
// Implementations must be safe for concurrent use.
type Cache interface {
	// Get returns the cached URL for key. ok is false on cache miss or error.
	Get(ctx context.Context, key string) (url string, ok bool, err error)
	// Set stores url under key with the given TTL.
	Set(ctx context.Context, key, url string, ttl time.Duration) error
	// Delete removes key from the cache.
	Delete(ctx context.Context, key string) error
}

// ValkeyCache is a Cache backed by Valkey / Redis via go-redis.
type ValkeyCache struct {
	rdb *redis.Client
}

// New creates a ValkeyCache connecting to addr (e.g. "valkey:6379").
// The connection is not established until the first command; use Ping to
// verify connectivity at startup.
func New(addr string) *ValkeyCache {
	rdb := redis.NewClient(&redis.Options{
		Addr:         addr,
		DialTimeout:  2 * time.Second,
		ReadTimeout:  1 * time.Second,
		WriteTimeout: 1 * time.Second,
	})
	return &ValkeyCache{rdb: rdb}
}

// Ping checks connectivity. Call once at startup.
func (c *ValkeyCache) Ping(ctx context.Context) error {
	if err := c.rdb.Ping(ctx).Err(); err != nil {
		return fmt.Errorf("presigncache: ping valkey: %w", err)
	}
	return nil
}

// Get returns (url, true, nil) on hit, ("", false, nil) on miss, and
// ("", false, err) only on unexpected errors (not redis.Nil).
func (c *ValkeyCache) Get(ctx context.Context, key string) (string, bool, error) {
	val, err := c.rdb.Get(ctx, key).Result()
	if err == redis.Nil {
		return "", false, nil
	}
	if err != nil {
		return "", false, fmt.Errorf("presigncache: get %q: %w", key, err)
	}
	return val, true, nil
}

// Set stores url under key with ttl. Errors are returned but are non-fatal
// for callers — a Set failure means the next request will miss and regenerate.
func (c *ValkeyCache) Set(ctx context.Context, key, url string, ttl time.Duration) error {
	if err := c.rdb.Set(ctx, key, url, ttl).Err(); err != nil {
		return fmt.Errorf("presigncache: set %q: %w", key, err)
	}
	return nil
}

// Delete removes key from the cache. It is not an error if the key does not exist.
func (c *ValkeyCache) Delete(ctx context.Context, key string) error {
	if err := c.rdb.Del(ctx, key).Err(); err != nil {
		return fmt.Errorf("presigncache: delete %q: %w", key, err)
	}
	return nil
}

// NoopCache is a no-op Cache that always returns a miss. Used when Valkey is
// not configured (e.g. local development without Docker).
type NoopCache struct{}

func (NoopCache) Get(_ context.Context, _ string) (string, bool, error)     { return "", false, nil }
func (NoopCache) Set(_ context.Context, _, _ string, _ time.Duration) error { return nil }
func (NoopCache) Delete(_ context.Context, _ string) error                  { return nil }
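The intended call pattern is get-or-regenerate, sketched below. presignURL is a hypothetical stand-in for whatever MinIO presign call the backend actually uses; the 55-minute TTL follows the recommendation in the package comment.

	func coverURL(ctx context.Context, cache Cache, key string) (string, error) {
		if u, ok, err := cache.Get(ctx, key); err == nil && ok {
			return u, nil // cache hit: no MinIO round-trip
		}
		// Miss or cache error: fall through and regenerate.
		u, err := presignURL(ctx, key) // hypothetical 1-hour presign against MinIO
		if err != nil {
			return "", err
		}
		_ = cache.Set(ctx, key, u, 55*time.Minute) // best-effort; errors tolerated
		return u, nil
	}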
185
backend/internal/runner/catalogue_refresh.go
Normal file
@@ -0,0 +1,185 @@
package runner

// catalogue_refresh.go — independent loop that walks the full novelfire.net
// catalogue, scrapes per-book metadata, downloads cover images to MinIO, and
// indexes every book in Meilisearch.
//
// Design:
//   - Runs on its own ticker (CatalogueRefreshInterval, default 24h) inside Run().
//   - Also fires once on startup.
//   - ScrapeCatalogue streams CatalogueEntry values over a channel — we iterate
//     and call ScrapeMetadata for each entry.
//   - Per-request random jitter (1–3s) prevents hammering novelfire.net.
//   - Cover images are fetched from the URL embedded in BookMeta.Cover and
//     stored in MinIO (browse bucket, key: covers/{slug}.jpg).
//   - WriteMetadata + UpsertBook are called for every successfully scraped book.
//   - Errors for individual books are logged and skipped; the loop continues.
//   - The cover URL stored in BookMeta.Cover is rewritten to the internal proxy
//     path (/api/cover/novelfire.net/{slug}) so the UI always fetches via the
//     backend, which will serve from MinIO.

import (
	"context"
	"fmt"
	"io"
	"math/rand"
	"net/http"
	"time"
)

// runCatalogueRefresh performs one full catalogue walk: scrapes metadata for
// every book on novelfire.net, downloads covers to MinIO, and upserts to
// Meilisearch. Errors for individual books are logged and skipped.
func (r *Runner) runCatalogueRefresh(ctx context.Context) {
	if r.deps.Novel == nil {
		r.deps.Log.Warn("runner: catalogue refresh skipped — Novel scraper not configured")
		return
	}
	if r.deps.BookWriter == nil {
		r.deps.Log.Warn("runner: catalogue refresh skipped — BookWriter not configured")
		return
	}

	log := r.deps.Log.With("op", "catalogue_refresh")
	log.Info("runner: catalogue refresh starting")

	entries, errCh := r.deps.Novel.ScrapeCatalogue(ctx)

	ok, skipped, errCount := 0, 0, 0
	for entry := range entries {
		if ctx.Err() != nil {
			break
		}

		// Skip books already present in Meilisearch — they were indexed on a
		// previous run. Re-indexing only happens when a scrape task is
		// explicitly enqueued (e.g. via the admin UI or API).
		if r.deps.SearchIndex.BookExists(ctx, entry.Slug) {
			skipped++
			continue
		}

		// Random jitter between books to avoid rate-limiting.
		jitter := time.Duration(1000+rand.Intn(2000)) * time.Millisecond
		select {
		case <-ctx.Done():
			// A bare "break" here would only exit the select, not the loop.
		case <-time.After(jitter):
		}
		if ctx.Err() != nil {
			break
		}

		meta, err := r.deps.Novel.ScrapeMetadata(ctx, entry.URL)
		if err != nil {
			log.Warn("runner: catalogue refresh: metadata scrape failed",
				"url", entry.URL, "err", err)
			errCount++
			continue
		}

		// Rewrite cover URL to backend proxy path so UI never hits CDN directly.
		originalCover := meta.Cover
		meta.Cover = fmt.Sprintf("/api/cover/novelfire.net/%s", meta.Slug)

		// Persist to PocketBase.
		if err := r.deps.BookWriter.WriteMetadata(ctx, meta); err != nil {
			log.Warn("runner: catalogue refresh: WriteMetadata failed",
				"slug", meta.Slug, "err", err)
			errCount++
			continue
		}

		// Index in Meilisearch.
		if err := r.deps.SearchIndex.UpsertBook(ctx, meta); err != nil {
			log.Warn("runner: catalogue refresh: UpsertBook failed",
				"slug", meta.Slug, "err", err)
			// non-fatal — continue
		}

		// Download and store cover image in MinIO if we have a cover URL
		// and a CoverStore is wired in.
		if r.deps.CoverStore != nil && originalCover != "" {
			if !r.deps.CoverStore.CoverExists(ctx, meta.Slug) {
				if err := r.downloadCover(ctx, meta.Slug, originalCover); err != nil {
					log.Warn("runner: catalogue refresh: cover download failed",
						"slug", meta.Slug, "url", originalCover, "err", err)
					// non-fatal
				}
			}
		}

		ok++
		if ok%100 == 0 {
			log.Info("runner: catalogue refresh progress",
				"scraped", ok, "errors", errCount)
		}
	}

	if err := <-errCh; err != nil {
		log.Warn("runner: catalogue refresh: catalogue stream error", "err", err)
	}

	log.Info("runner: catalogue refresh finished",
		"ok", ok, "skipped", skipped, "errors", errCount)
}

// downloadCover fetches the cover image from coverURL and stores it in MinIO
// under covers/{slug}.jpg. It retries up to 3 times with exponential backoff
// on transient errors (5xx, network failures).
func (r *Runner) downloadCover(ctx context.Context, slug, coverURL string) error {
	const maxRetries = 3
	delay := 2 * time.Second

	var lastErr error
	for attempt := 0; attempt < maxRetries; attempt++ {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if attempt > 0 {
			select {
			case <-ctx.Done():
				return ctx.Err()
			case <-time.After(delay):
			}
			delay *= 2
		}

		data, err := fetchCoverBytes(ctx, coverURL)
		if err != nil {
			lastErr = err
			continue
		}

		if err := r.deps.CoverStore.PutCover(ctx, slug, data, ""); err != nil {
			return fmt.Errorf("put cover: %w", err)
		}
		return nil
	}
	return fmt.Errorf("download cover after %d retries: %w", maxRetries, lastErr)
}

// fetchCoverBytes performs a single HTTP GET for coverURL and returns the body.
func fetchCoverBytes(ctx context.Context, coverURL string) ([]byte, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, coverURL, nil)
	if err != nil {
		return nil, fmt.Errorf("build request: %w", err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; libnovel-runner/2)")
	req.Header.Set("Referer", "https://novelfire.net/")

	client := &http.Client{Timeout: 30 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("http get: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 500 {
		_, _ = io.Copy(io.Discard, resp.Body)
		return nil, fmt.Errorf("upstream %d for %s", resp.StatusCode, coverURL)
	}
	if resp.StatusCode != http.StatusOK {
		_, _ = io.Copy(io.Discard, resp.Body)
		return nil, fmt.Errorf("unexpected status %d for %s", resp.StatusCode, coverURL)
	}

	return io.ReadAll(io.LimitReader(resp.Body, 5<<20)) // 5 MiB cap
}
21
backend/internal/runner/helpers.go
Normal file
@@ -0,0 +1,21 @@
package runner

import (
	"regexp"
	"strings"
)

// stripMarkdown removes common markdown syntax from src, returning plain text
// suitable for TTS. Mirrors the helper in the scraper's server package.
func stripMarkdown(src string) string {
	src = regexp.MustCompile(`(?m)^#{1,6}\s+`).ReplaceAllString(src, "")          // heading markers
	src = regexp.MustCompile(`\*{1,3}|_{1,3}`).ReplaceAllString(src, "")          // emphasis markers
	src = regexp.MustCompile("(?s)```.*?```").ReplaceAllString(src, "")           // fenced code blocks
	src = regexp.MustCompile("`[^`]*`").ReplaceAllString(src, "")                 // inline code
	src = regexp.MustCompile(`!\[[^\]]*\]\([^)]+\)`).ReplaceAllString(src, "")    // images (must run before links, or "![alt](url)" degrades to "!alt")
	src = regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`).ReplaceAllString(src, "$1") // links: keep the text, drop the target
	src = regexp.MustCompile(`(?m)^>\s?`).ReplaceAllString(src, "")               // blockquote markers
	src = regexp.MustCompile(`(?m)^[-*_]{3,}\s*$`).ReplaceAllString(src, "")      // horizontal rules
	src = regexp.MustCompile(`\n{3,}`).ReplaceAllString(src, "\n\n")              // collapse blank-line runs
	return strings.TrimSpace(src)
}
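For example, stripMarkdown("## Chapter 1\n\n**He** ran to the [map](https://example.com).") yields "Chapter 1\n\nHe ran to the map.": the heading marker, emphasis asterisks and link target are dropped while the link text survives.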
92
backend/internal/runner/metrics.go
Normal file
@@ -0,0 +1,92 @@
package runner

// metrics.go — lightweight HTTP metrics endpoint for the runner.
//
// GET /metrics returns a JSON document with live task counters and uptime.
// No external dependency (no Prometheus); plain net/http only.

import (
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"net"
	"net/http"
	"time"
)

// metricsServer serves GET /metrics for the runner process.
type metricsServer struct {
	addr string
	r    *Runner
	log  *slog.Logger
}

func newMetricsServer(addr string, r *Runner, log *slog.Logger) *metricsServer {
	return &metricsServer{addr: addr, r: r, log: log}
}

// ListenAndServe starts the HTTP server and blocks until ctx is cancelled or
// a fatal listen error occurs.
func (ms *metricsServer) ListenAndServe(ctx context.Context) error {
	mux := http.NewServeMux()
	mux.HandleFunc("GET /metrics", ms.handleMetrics)
	mux.HandleFunc("GET /health", ms.handleHealth)

	srv := &http.Server{
		Addr:         ms.addr,
		Handler:      mux,
		ReadTimeout:  5 * time.Second,
		WriteTimeout: 5 * time.Second,
		BaseContext:  func(_ net.Listener) context.Context { return ctx },
	}

	errCh := make(chan error, 1)
	go func() {
		ms.log.Info("runner: metrics server listening", "addr", ms.addr)
		errCh <- srv.ListenAndServe()
	}()

	select {
	case <-ctx.Done():
		shutCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		_ = srv.Shutdown(shutCtx)
		return nil
	case err := <-errCh:
		return fmt.Errorf("runner: metrics server: %w", err)
	}
}

// handleMetrics handles GET /metrics.
// Response shape (JSON):
//
//	{
//	  "tasks_running": N,
//	  "tasks_completed": N,
//	  "tasks_failed": N,
//	  "uptime_seconds": N
//	}
func (ms *metricsServer) handleMetrics(w http.ResponseWriter, _ *http.Request) {
	uptimeSec := int64(time.Since(ms.r.startedAt).Seconds())
	metricsWriteJSON(w, 0, map[string]int64{
		"tasks_running":   ms.r.tasksRunning.Load(),
		"tasks_completed": ms.r.tasksCompleted.Load(),
		"tasks_failed":    ms.r.tasksFailed.Load(),
		"uptime_seconds":  uptimeSec,
	})
}

// handleHealth handles GET /health — simple liveness probe for the metrics server.
func (ms *metricsServer) handleHealth(w http.ResponseWriter, _ *http.Request) {
	metricsWriteJSON(w, 0, map[string]string{"status": "ok"})
}

// metricsWriteJSON writes v as a JSON response with the given status code
// (status 0 leaves the implicit 200 OK).
func metricsWriteJSON(w http.ResponseWriter, status int, v any) {
	w.Header().Set("Content-Type", "application/json")
	if status != 0 {
		w.WriteHeader(status)
	}
	_ = json.NewEncoder(w).Encode(v)
}
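With the default MetricsAddr of ":9091" (see runner.go below), curl -s localhost:9091/metrics returns the four counters as one JSON object, and curl -s localhost:9091/health answers {"status":"ok"}.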
442
backend/internal/runner/runner.go
Normal file
@@ -0,0 +1,442 @@
// Package runner implements the worker loop that polls PocketBase for pending
// scrape and audio tasks, executes them, and reports results back.
//
// Design:
// - Run(ctx) loops on a ticker; each tick claims and dispatches pending tasks.
// - Scrape tasks are dispatched to the Orchestrator (one goroutine per task,
//   up to MaxConcurrentScrape).
// - Audio tasks fetch chapter text, call Kokoro, upload to MinIO, and report
//   the result back (up to MaxConcurrentAudio goroutines).
// - The runner is stateless between ticks; all state lives in PocketBase.
// - Atomic task counters are exposed via /metrics (see metrics.go).
// - Books are indexed in Meilisearch via an orchestrator.Config.PostMetadata
//   hook injected at construction time.
package runner

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"sync"
	"sync/atomic"
	"time"

	"github.com/libnovel/backend/internal/bookstore"
	"github.com/libnovel/backend/internal/domain"
	"github.com/libnovel/backend/internal/kokoro"
	"github.com/libnovel/backend/internal/meili"
	"github.com/libnovel/backend/internal/orchestrator"
	"github.com/libnovel/backend/internal/scraper"
	"github.com/libnovel/backend/internal/taskqueue"
)

// Config tunes the runner behaviour.
type Config struct {
	// WorkerID uniquely identifies this runner instance in PocketBase records.
	WorkerID string
	// PollInterval is how often the runner checks for new tasks.
	PollInterval time.Duration
	// MaxConcurrentScrape limits simultaneous book-scrape goroutines.
	MaxConcurrentScrape int
	// MaxConcurrentAudio limits simultaneous audio-generation goroutines.
	MaxConcurrentAudio int
	// OrchestratorWorkers is the chapter-scraping parallelism inside each book run.
	OrchestratorWorkers int
	// HeartbeatInterval is how often active tasks PATCH their heartbeat_at
	// timestamp to signal they are still alive. Defaults to 30s when 0.
	HeartbeatInterval time.Duration
	// StaleTaskThreshold is how old a heartbeat must be (or absent) before the
	// task is considered orphaned and reset to pending. Defaults to 2m when 0.
	StaleTaskThreshold time.Duration
	// CatalogueRefreshInterval is how often the runner walks the full catalogue,
	// scrapes per-book metadata, downloads covers, and re-indexes everything in
	// Meilisearch. Defaults to 24h (expensive — full catalogue walk).
	CatalogueRefreshInterval time.Duration
	// SkipInitialCatalogueRefresh suppresses the immediate catalogue walk that
	// otherwise fires at startup. The periodic ticker (CatalogueRefreshInterval)
	// still fires normally. Set RUNNER_SKIP_INITIAL_CATALOGUE_REFRESH=true for
	// quick restarts where the catalogue is already up to date.
	SkipInitialCatalogueRefresh bool
	// MetricsAddr is the HTTP listen address for the /metrics endpoint.
	// Defaults to ":9091" when empty; New always applies the default, so the
	// endpoint is effectively always on.
	MetricsAddr string
}

// Dependencies are the external services the runner depends on.
type Dependencies struct {
	// Consumer claims tasks from PocketBase.
	Consumer taskqueue.Consumer
	// BookWriter persists scraped data (used by orchestrator).
	BookWriter bookstore.BookWriter
	// BookReader reads chapter text for audio generation.
	BookReader bookstore.BookReader
	// AudioStore persists generated audio and checks key existence.
	AudioStore bookstore.AudioStore
	// CoverStore stores book cover images in MinIO.
	CoverStore bookstore.CoverStore
	// SearchIndex indexes books in Meilisearch after scraping.
	// If nil a no-op is used.
	SearchIndex meili.Client
	// Novel is the scraper implementation.
	Novel scraper.NovelScraper
	// Kokoro is the TTS client.
	Kokoro kokoro.Client
	// Log is the structured logger.
	Log *slog.Logger
}

// Runner is the main worker process.
type Runner struct {
	cfg  Config
	deps Dependencies

	// Atomic task counters — read by /metrics without locking.
	tasksRunning   atomic.Int64
	tasksCompleted atomic.Int64
	tasksFailed    atomic.Int64

	startedAt time.Time
}

// New creates a Runner from cfg and deps.
func New(cfg Config, deps Dependencies) *Runner {
	if cfg.PollInterval <= 0 {
		cfg.PollInterval = 30 * time.Second
	}
	if cfg.MaxConcurrentScrape <= 0 {
		cfg.MaxConcurrentScrape = 2
	}
	if cfg.MaxConcurrentAudio <= 0 {
		cfg.MaxConcurrentAudio = 1
	}
	if cfg.WorkerID == "" {
		cfg.WorkerID = "runner"
	}
	if cfg.HeartbeatInterval <= 0 {
		cfg.HeartbeatInterval = 30 * time.Second
	}
	if cfg.StaleTaskThreshold <= 0 {
		cfg.StaleTaskThreshold = 2 * time.Minute
	}
	if cfg.CatalogueRefreshInterval <= 0 {
		cfg.CatalogueRefreshInterval = 24 * time.Hour
	}
	if cfg.MetricsAddr == "" {
		cfg.MetricsAddr = ":9091"
	}
	if deps.Log == nil {
		deps.Log = slog.Default()
	}
	if deps.SearchIndex == nil {
		deps.SearchIndex = meili.NoopClient{}
	}
	return &Runner{cfg: cfg, deps: deps, startedAt: time.Now()}
}
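// Editor's sketch (hypothetical main, not part of this diff) of the minimal
// wiring implied by New's defaults; store and the scraper/TTS clients are
// assumed to come from the storage and client packages elsewhere in this diff:
//
//	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
//	defer stop()
//	r := runner.New(runner.Config{WorkerID: "runner-1"}, runner.Dependencies{
//		Consumer:   store, // *storage.Store satisfies taskqueue.Consumer
//		BookWriter: store,
//		BookReader: store,
//		AudioStore: store,
//		CoverStore: store,
//		Novel:      novelScraper, // assumed concrete scraper.NovelScraper
//		Kokoro:     kokoroClient, // assumed kokoro.Client
//	})
//	_ = r.Run(ctx) // SearchIndex and Log fall back to no-op / slog.Default()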

// Run starts the poll loop and the metrics HTTP server, blocking until ctx is
// cancelled.
func (r *Runner) Run(ctx context.Context) error {
	r.deps.Log.Info("runner: starting",
		"worker_id", r.cfg.WorkerID,
		"poll_interval", r.cfg.PollInterval,
		"max_scrape", r.cfg.MaxConcurrentScrape,
		"max_audio", r.cfg.MaxConcurrentAudio,
		"catalogue_refresh_interval", r.cfg.CatalogueRefreshInterval,
		"metrics_addr", r.cfg.MetricsAddr,
	)

	// Start metrics HTTP server in background if configured.
	if r.cfg.MetricsAddr != "" {
		ms := newMetricsServer(r.cfg.MetricsAddr, r, r.deps.Log)
		go func() {
			if err := ms.ListenAndServe(ctx); err != nil {
				r.deps.Log.Error("runner: metrics server error", "err", err)
			}
		}()
	}

	scrapeSem := make(chan struct{}, r.cfg.MaxConcurrentScrape)
	audioSem := make(chan struct{}, r.cfg.MaxConcurrentAudio)
	var wg sync.WaitGroup

	tick := time.NewTicker(r.cfg.PollInterval)
	defer tick.Stop()

	catalogueTick := time.NewTicker(r.cfg.CatalogueRefreshInterval)
	defer catalogueTick.Stop()

	// Run one catalogue refresh immediately on startup (unless skipped by flag).
	if !r.cfg.SkipInitialCatalogueRefresh {
		go r.runCatalogueRefresh(ctx)
	} else {
		r.deps.Log.Info("runner: skipping initial catalogue refresh (RUNNER_SKIP_INITIAL_CATALOGUE_REFRESH=true)")
	}

	// Run one poll immediately on startup, then on each tick.
	for {
		r.poll(ctx, scrapeSem, audioSem, &wg)

		select {
		case <-ctx.Done():
			r.deps.Log.Info("runner: context cancelled, draining active tasks")
			done := make(chan struct{})
			go func() {
				wg.Wait()
				close(done)
			}()
			select {
			case <-done:
				r.deps.Log.Info("runner: all tasks drained, exiting")
			case <-time.After(2 * time.Minute):
				r.deps.Log.Warn("runner: drain timeout exceeded, forcing exit")
			}
			return nil
		case <-catalogueTick.C:
			go r.runCatalogueRefresh(ctx)
		case <-tick.C:
		}
	}
}

// poll claims all available pending tasks and dispatches them to goroutines.
|
||||
func (r *Runner) poll(ctx context.Context, scrapeSem, audioSem chan struct{}, wg *sync.WaitGroup) {
|
||||
// ── Heartbeat file ────────────────────────────────────────────────────
|
||||
// Touch /tmp/runner.alive so the Docker health check can confirm the
|
||||
// runner is actively polling. Failure is non-fatal — just log it.
|
||||
if f, err := os.Create("/tmp/runner.alive"); err != nil {
|
||||
r.deps.Log.Warn("runner: could not write heartbeat file", "err", err)
|
||||
} else {
|
||||
f.Close()
|
||||
}
|
||||
|
||||
// ── Reap orphaned tasks ───────────────────────────────────────────────
|
||||
if n, err := r.deps.Consumer.ReapStaleTasks(ctx, r.cfg.StaleTaskThreshold); err != nil {
|
||||
r.deps.Log.Warn("runner: reap stale tasks failed", "err", err)
|
||||
} else if n > 0 {
|
||||
r.deps.Log.Info("runner: reaped stale tasks", "count", n)
|
||||
}
|
||||
|
||||
// ── Scrape tasks ──────────────────────────────────────────────────────
|
||||
for {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
task, ok, err := r.deps.Consumer.ClaimNextScrapeTask(ctx, r.cfg.WorkerID)
|
||||
if err != nil {
|
||||
r.deps.Log.Error("runner: ClaimNextScrapeTask failed", "err", err)
|
||||
break
|
||||
}
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
select {
|
||||
case scrapeSem <- struct{}{}:
|
||||
default:
|
||||
r.deps.Log.Warn("runner: scrape semaphore full, will retry next tick",
|
||||
"task_id", task.ID)
|
||||
break
|
||||
}
|
||||
r.tasksRunning.Add(1)
|
||||
wg.Add(1)
|
||||
go func(t domain.ScrapeTask) {
|
||||
defer wg.Done()
|
||||
defer func() { <-scrapeSem }()
|
||||
defer r.tasksRunning.Add(-1)
|
||||
r.runScrapeTask(ctx, t)
|
||||
}(task)
|
||||
}
|
||||
|
||||
// ── Audio tasks ───────────────────────────────────────────────────────
|
||||
for {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
task, ok, err := r.deps.Consumer.ClaimNextAudioTask(ctx, r.cfg.WorkerID)
|
||||
if err != nil {
|
||||
r.deps.Log.Error("runner: ClaimNextAudioTask failed", "err", err)
|
||||
break
|
||||
}
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
select {
|
||||
case audioSem <- struct{}{}:
|
||||
default:
|
||||
r.deps.Log.Warn("runner: audio semaphore full, will retry next tick",
|
||||
"task_id", task.ID)
|
||||
break
|
||||
}
|
||||
r.tasksRunning.Add(1)
|
||||
wg.Add(1)
|
||||
go func(t domain.AudioTask) {
|
||||
defer wg.Done()
|
||||
defer func() { <-audioSem }()
|
||||
defer r.tasksRunning.Add(-1)
|
||||
r.runAudioTask(ctx, t)
|
||||
}(task)
|
||||
}
|
||||
}
|
||||

// newOrchestrator builds an orchestrator with the Meilisearch post-hook wired in.
func (r *Runner) newOrchestrator() *orchestrator.Orchestrator {
	oCfg := orchestrator.Config{
		Workers: r.cfg.OrchestratorWorkers,
		PostMetadata: func(ctx context.Context, meta domain.BookMeta) {
			if err := r.deps.SearchIndex.UpsertBook(ctx, meta); err != nil {
				r.deps.Log.Warn("runner: meilisearch upsert failed",
					"slug", meta.Slug, "err", err)
			}
		},
	}
	return orchestrator.New(oCfg, r.deps.Novel, r.deps.BookWriter, r.deps.Log)
}

// runScrapeTask executes one scrape task end-to-end and reports the result.
func (r *Runner) runScrapeTask(ctx context.Context, task domain.ScrapeTask) {
	log := r.deps.Log.With("task_id", task.ID, "kind", task.Kind, "url", task.TargetURL)
	log.Info("runner: scrape task starting")

	hbCtx, hbCancel := context.WithCancel(ctx)
	defer hbCancel()
	go func() {
		tick := time.NewTicker(r.cfg.HeartbeatInterval)
		defer tick.Stop()
		for {
			select {
			case <-hbCtx.Done():
				return
			case <-tick.C:
				if err := r.deps.Consumer.HeartbeatTask(ctx, task.ID); err != nil {
					log.Warn("runner: heartbeat failed", "err", err)
				}
			}
		}
	}()

	o := r.newOrchestrator()
	var result domain.ScrapeResult

	switch task.Kind {
	case "catalogue":
		result = r.runCatalogueTask(ctx, task, o, log)
	case "book", "book_range":
		result = o.RunBook(ctx, task)
	default:
		result.ErrorMessage = fmt.Sprintf("unknown task kind: %q", task.Kind)
		log.Warn("runner: unknown task kind")
	}

	if err := r.deps.Consumer.FinishScrapeTask(ctx, task.ID, result); err != nil {
		log.Error("runner: FinishScrapeTask failed", "err", err)
	}

	if result.ErrorMessage != "" {
		r.tasksFailed.Add(1)
	} else {
		r.tasksCompleted.Add(1)
	}

	log.Info("runner: scrape task finished",
		"scraped", result.ChaptersScraped,
		"skipped", result.ChaptersSkipped,
		"errors", result.Errors,
	)
}
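// The heartbeat goroutine above is duplicated verbatim in runAudioTask below;
// a shared helper is one possible refactor (editor's hedged sketch, not in
// the diff):
//
//	func (r *Runner) startHeartbeat(ctx context.Context, taskID string, log *slog.Logger) (stop func()) {
//		hbCtx, cancel := context.WithCancel(ctx)
//		go func() {
//			tick := time.NewTicker(r.cfg.HeartbeatInterval)
//			defer tick.Stop()
//			for {
//				select {
//				case <-hbCtx.Done():
//					return
//				case <-tick.C:
//					if err := r.deps.Consumer.HeartbeatTask(ctx, taskID); err != nil {
//						log.Warn("runner: heartbeat failed", "err", err)
//					}
//				}
//			}
//		}()
//		return cancel
//	}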

// runCatalogueTask runs a full catalogue scrape.
func (r *Runner) runCatalogueTask(ctx context.Context, task domain.ScrapeTask, o *orchestrator.Orchestrator, log *slog.Logger) domain.ScrapeResult {
	entries, errCh := r.deps.Novel.ScrapeCatalogue(ctx)
	var result domain.ScrapeResult

	for entry := range entries {
		if ctx.Err() != nil {
			break
		}
		bookTask := domain.ScrapeTask{
			ID:        task.ID, // reuse the parent ID so heartbeats map to the claimed record
			Kind:      "book",
			TargetURL: entry.URL,
		}
		bookResult := o.RunBook(ctx, bookTask)
		result.BooksFound += bookResult.BooksFound + 1 // +1 counts this catalogue entry itself
		result.ChaptersScraped += bookResult.ChaptersScraped
		result.ChaptersSkipped += bookResult.ChaptersSkipped
		result.Errors += bookResult.Errors
	}

	if err := <-errCh; err != nil {
		log.Warn("runner: catalogue scrape finished with error", "err", err)
		result.Errors++
		if result.ErrorMessage == "" {
			result.ErrorMessage = err.Error()
		}
	}
	return result
}

// runAudioTask executes one audio-generation task.
func (r *Runner) runAudioTask(ctx context.Context, task domain.AudioTask) {
	log := r.deps.Log.With("task_id", task.ID, "slug", task.Slug, "chapter", task.Chapter, "voice", task.Voice)
	log.Info("runner: audio task starting")

	hbCtx, hbCancel := context.WithCancel(ctx)
	defer hbCancel()
	go func() {
		tick := time.NewTicker(r.cfg.HeartbeatInterval)
		defer tick.Stop()
		for {
			select {
			case <-hbCtx.Done():
				return
			case <-tick.C:
				if err := r.deps.Consumer.HeartbeatTask(ctx, task.ID); err != nil {
					log.Warn("runner: heartbeat failed", "err", err)
				}
			}
		}
	}()

	fail := func(msg string) {
		log.Error("runner: audio task failed", "reason", msg)
		r.tasksFailed.Add(1)
		result := domain.AudioResult{ErrorMessage: msg}
		if err := r.deps.Consumer.FinishAudioTask(ctx, task.ID, result); err != nil {
			log.Error("runner: FinishAudioTask failed", "err", err)
		}
	}

	raw, err := r.deps.BookReader.ReadChapter(ctx, task.Slug, task.Chapter)
	if err != nil {
		fail(fmt.Sprintf("read chapter: %v", err))
		return
	}
	text := stripMarkdown(raw)
	if text == "" {
		fail("chapter text is empty after stripping markdown")
		return
	}

	if r.deps.Kokoro == nil {
		fail("kokoro client not configured")
		return
	}
	audioData, err := r.deps.Kokoro.GenerateAudio(ctx, text, task.Voice)
	if err != nil {
		fail(fmt.Sprintf("kokoro generate: %v", err))
		return
	}

	key := r.deps.AudioStore.AudioObjectKey(task.Slug, task.Chapter, task.Voice)
	if err := r.deps.AudioStore.PutAudio(ctx, key, audioData); err != nil {
		fail(fmt.Sprintf("put audio: %v", err))
		return
	}

	r.tasksCompleted.Add(1)
	result := domain.AudioResult{ObjectKey: key}
	if err := r.deps.Consumer.FinishAudioTask(ctx, task.ID, result); err != nil {
		log.Error("runner: FinishAudioTask failed", "err", err)
	}
	log.Info("runner: audio task finished", "key", key)
}
365  backend/internal/runner/runner_test.go  Normal file
@@ -0,0 +1,365 @@
package runner_test

import (
	"context"
	"errors"
	"sync/atomic"
	"testing"
	"time"

	"github.com/libnovel/backend/internal/domain"
	"github.com/libnovel/backend/internal/runner"
)

// ── Stub types ────────────────────────────────────────────────────────────────

// stubConsumer is a test double for taskqueue.Consumer.
type stubConsumer struct {
	scrapeQueue []domain.ScrapeTask
	audioQueue  []domain.AudioTask
	scrapeIdx   int
	audioIdx    int
	finished    []string
	failCalled  []string
	claimErr    error
}

func (s *stubConsumer) ClaimNextScrapeTask(_ context.Context, _ string) (domain.ScrapeTask, bool, error) {
	if s.claimErr != nil {
		return domain.ScrapeTask{}, false, s.claimErr
	}
	if s.scrapeIdx >= len(s.scrapeQueue) {
		return domain.ScrapeTask{}, false, nil
	}
	t := s.scrapeQueue[s.scrapeIdx]
	s.scrapeIdx++
	return t, true, nil
}

func (s *stubConsumer) ClaimNextAudioTask(_ context.Context, _ string) (domain.AudioTask, bool, error) {
	if s.claimErr != nil {
		return domain.AudioTask{}, false, s.claimErr
	}
	if s.audioIdx >= len(s.audioQueue) {
		return domain.AudioTask{}, false, nil
	}
	t := s.audioQueue[s.audioIdx]
	s.audioIdx++
	return t, true, nil
}

func (s *stubConsumer) FinishScrapeTask(_ context.Context, id string, _ domain.ScrapeResult) error {
	s.finished = append(s.finished, id)
	return nil
}

func (s *stubConsumer) FinishAudioTask(_ context.Context, id string, _ domain.AudioResult) error {
	s.finished = append(s.finished, id)
	return nil
}

func (s *stubConsumer) FailTask(_ context.Context, id, _ string) error {
	s.failCalled = append(s.failCalled, id)
	return nil
}

func (s *stubConsumer) HeartbeatTask(_ context.Context, _ string) error { return nil }

func (s *stubConsumer) ReapStaleTasks(_ context.Context, _ time.Duration) (int, error) {
	return 0, nil
}

// stubBookWriter satisfies bookstore.BookWriter (no-op).
type stubBookWriter struct{}

func (s *stubBookWriter) WriteMetadata(_ context.Context, _ domain.BookMeta) error { return nil }
func (s *stubBookWriter) WriteChapter(_ context.Context, _ string, _ domain.Chapter) error {
	return nil
}
func (s *stubBookWriter) WriteChapterRefs(_ context.Context, _ string, _ []domain.ChapterRef) error {
	return nil
}
func (s *stubBookWriter) ChapterExists(_ context.Context, _ string, _ domain.ChapterRef) bool {
	return false
}

// stubBookReader satisfies bookstore.BookReader — returns a single chapter.
type stubBookReader struct {
	text    string
	readErr error
}

func (s *stubBookReader) ReadChapter(_ context.Context, _ string, _ int) (string, error) {
	return s.text, s.readErr
}
func (s *stubBookReader) ReadMetadata(_ context.Context, _ string) (domain.BookMeta, bool, error) {
	return domain.BookMeta{}, false, nil
}
func (s *stubBookReader) ListBooks(_ context.Context) ([]domain.BookMeta, error) { return nil, nil }
func (s *stubBookReader) LocalSlugs(_ context.Context) (map[string]bool, error)  { return nil, nil }
func (s *stubBookReader) MetadataMtime(_ context.Context, _ string) int64        { return 0 }
func (s *stubBookReader) ListChapters(_ context.Context, _ string) ([]domain.ChapterInfo, error) {
	return nil, nil
}
func (s *stubBookReader) CountChapters(_ context.Context, _ string) int { return 0 }
func (s *stubBookReader) ReindexChapters(_ context.Context, _ string) (int, error) {
	return 0, nil
}

// stubAudioStore satisfies bookstore.AudioStore.
type stubAudioStore struct {
	putCalled atomic.Int32
	putErr    error
}

func (s *stubAudioStore) AudioObjectKey(slug string, n int, voice string) string {
	return slug + "/" + string(rune('0'+n)) + "/" + voice + ".mp3" // single-digit chapters only; fine for tests
}
func (s *stubAudioStore) AudioExists(_ context.Context, _ string) bool { return false }
func (s *stubAudioStore) PutAudio(_ context.Context, _ string, _ []byte) error {
	s.putCalled.Add(1)
	return s.putErr
}

// stubNovelScraper satisfies scraper.NovelScraper minimally.
type stubNovelScraper struct {
	entries  []domain.CatalogueEntry
	metaErr  error
	chapters []domain.ChapterRef
}

func (s *stubNovelScraper) ScrapeCatalogue(_ context.Context) (<-chan domain.CatalogueEntry, <-chan error) {
	ch := make(chan domain.CatalogueEntry, len(s.entries))
	errCh := make(chan error, 1)
	for _, e := range s.entries {
		ch <- e
	}
	close(ch)
	close(errCh)
	return ch, errCh
}

func (s *stubNovelScraper) ScrapeMetadata(_ context.Context, _ string) (domain.BookMeta, error) {
	if s.metaErr != nil {
		return domain.BookMeta{}, s.metaErr
	}
	return domain.BookMeta{Slug: "test-book", Title: "Test Book", SourceURL: "https://example.com/book/test-book"}, nil
}

func (s *stubNovelScraper) ScrapeChapterList(_ context.Context, _ string, _ int) ([]domain.ChapterRef, error) {
	return s.chapters, nil
}

func (s *stubNovelScraper) ScrapeChapterText(_ context.Context, ref domain.ChapterRef) (domain.Chapter, error) {
	return domain.Chapter{Ref: ref, Text: "# Chapter\n\nSome text."}, nil
}

func (s *stubNovelScraper) ScrapeRanking(_ context.Context, _ int) (<-chan domain.BookMeta, <-chan error) {
	ch := make(chan domain.BookMeta)
	errCh := make(chan error, 1)
	close(ch)
	close(errCh)
	return ch, errCh
}

func (s *stubNovelScraper) SourceName() string { return "stub" }

// stubKokoro satisfies kokoro.Client.
type stubKokoro struct {
	data   []byte
	genErr error
	called atomic.Int32
}

func (s *stubKokoro) GenerateAudio(_ context.Context, _, _ string) ([]byte, error) {
	s.called.Add(1)
	return s.data, s.genErr
}

func (s *stubKokoro) ListVoices(_ context.Context) ([]string, error) {
	return []string{"af_bella"}, nil
}

// ── stripMarkdown helper ──────────────────────────────────────────────────────

func TestStripMarkdownViaAudioTask(t *testing.T) {
	// Verify markdown is stripped before sending to Kokoro.
	// We inject chapter text with markdown; the kokoro stub verifies data flows.
	consumer := &stubConsumer{
		audioQueue: []domain.AudioTask{
			{ID: "a1", Slug: "book", Chapter: 1, Voice: "af_bella", Status: domain.TaskStatusRunning},
		},
	}
	bookReader := &stubBookReader{text: "## Chapter 1\n\nPlain **text** here."}
	audioStore := &stubAudioStore{}
	kokoroStub := &stubKokoro{data: []byte("mp3")}

	cfg := runner.Config{
		WorkerID:     "test",
		PollInterval: time.Hour, // long poll — the test context's timeout ends Run
	}
	deps := runner.Dependencies{
		Consumer:   consumer,
		BookWriter: &stubBookWriter{},
		BookReader: bookReader,
		AudioStore: audioStore,
		Novel:      &stubNovelScraper{},
		Kokoro:     kokoroStub,
	}

	r := runner.New(cfg, deps)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = r.Run(ctx)

	if kokoroStub.called.Load() != 1 {
		t.Errorf("expected Kokoro.GenerateAudio called once, got %d", kokoroStub.called.Load())
	}
	if audioStore.putCalled.Load() != 1 {
		t.Errorf("expected PutAudio called once, got %d", audioStore.putCalled.Load())
	}
}

func TestAudioTask_ReadChapterError(t *testing.T) {
	consumer := &stubConsumer{
		audioQueue: []domain.AudioTask{
			{ID: "a2", Slug: "book", Chapter: 2, Voice: "af_bella", Status: domain.TaskStatusRunning},
		},
	}
	bookReader := &stubBookReader{readErr: errors.New("chapter not found")}
	audioStore := &stubAudioStore{}
	kokoroStub := &stubKokoro{data: []byte("mp3")}

	cfg := runner.Config{WorkerID: "test", PollInterval: time.Hour}
	deps := runner.Dependencies{
		Consumer:   consumer,
		BookWriter: &stubBookWriter{},
		BookReader: bookReader,
		AudioStore: audioStore,
		Novel:      &stubNovelScraper{},
		Kokoro:     kokoroStub,
	}

	r := runner.New(cfg, deps)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = r.Run(ctx)

	// Kokoro should not be called; FinishAudioTask should be called with error.
	if kokoroStub.called.Load() != 0 {
		t.Errorf("expected Kokoro not called, got %d", kokoroStub.called.Load())
	}
	if len(consumer.finished) != 1 {
		t.Errorf("expected FinishAudioTask called once, got %d", len(consumer.finished))
	}
}

func TestAudioTask_KokoroError(t *testing.T) {
	consumer := &stubConsumer{
		audioQueue: []domain.AudioTask{
			{ID: "a3", Slug: "book", Chapter: 3, Voice: "af_bella", Status: domain.TaskStatusRunning},
		},
	}
	bookReader := &stubBookReader{text: "Chapter text."}
	audioStore := &stubAudioStore{}
	kokoroStub := &stubKokoro{genErr: errors.New("tts failed")}

	cfg := runner.Config{WorkerID: "test", PollInterval: time.Hour}
	deps := runner.Dependencies{
		Consumer:   consumer,
		BookWriter: &stubBookWriter{},
		BookReader: bookReader,
		AudioStore: audioStore,
		Novel:      &stubNovelScraper{},
		Kokoro:     kokoroStub,
	}

	r := runner.New(cfg, deps)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = r.Run(ctx)

	if audioStore.putCalled.Load() != 0 {
		t.Errorf("expected PutAudio not called, got %d", audioStore.putCalled.Load())
	}
	if len(consumer.finished) != 1 {
		t.Errorf("expected FinishAudioTask called once, got %d", len(consumer.finished))
	}
}

func TestScrapeTask_BookKind(t *testing.T) {
	consumer := &stubConsumer{
		scrapeQueue: []domain.ScrapeTask{
			{ID: "s1", Kind: "book", TargetURL: "https://example.com/book/test-book", Status: domain.TaskStatusRunning},
		},
	}

	cfg := runner.Config{WorkerID: "test", PollInterval: time.Hour}
	deps := runner.Dependencies{
		Consumer:   consumer,
		BookWriter: &stubBookWriter{},
		BookReader: &stubBookReader{},
		AudioStore: &stubAudioStore{},
		Novel:      &stubNovelScraper{},
		Kokoro:     &stubKokoro{},
	}

	r := runner.New(cfg, deps)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = r.Run(ctx)

	if len(consumer.finished) != 1 || consumer.finished[0] != "s1" {
		t.Errorf("expected task s1 finished, got %v", consumer.finished)
	}
}

func TestScrapeTask_UnknownKind(t *testing.T) {
	consumer := &stubConsumer{
		scrapeQueue: []domain.ScrapeTask{
			{ID: "s2", Kind: "unknown_kind", Status: domain.TaskStatusRunning},
		},
	}

	cfg := runner.Config{WorkerID: "test", PollInterval: time.Hour}
	deps := runner.Dependencies{
		Consumer:   consumer,
		BookWriter: &stubBookWriter{},
		BookReader: &stubBookReader{},
		AudioStore: &stubAudioStore{},
		Novel:      &stubNovelScraper{},
		Kokoro:     &stubKokoro{},
	}

	r := runner.New(cfg, deps)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = r.Run(ctx)

	// Unknown kind still finishes the task (with error message in result).
	if len(consumer.finished) != 1 || consumer.finished[0] != "s2" {
		t.Errorf("expected task s2 finished, got %v", consumer.finished)
	}
}

func TestRun_CancelImmediately(t *testing.T) {
	consumer := &stubConsumer{}
	cfg := runner.Config{WorkerID: "test", PollInterval: 10 * time.Millisecond}
	deps := runner.Dependencies{
		Consumer:   consumer,
		BookWriter: &stubBookWriter{},
		BookReader: &stubBookReader{},
		AudioStore: &stubAudioStore{},
		Novel:      &stubNovelScraper{},
		Kokoro:     &stubKokoro{},
	}

	r := runner.New(cfg, deps)
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel before Run

	err := r.Run(ctx)
	if err != nil {
		t.Errorf("expected nil on graceful shutdown, got %v", err)
	}
}
60  backend/internal/scraper/scraper.go  Normal file
@@ -0,0 +1,60 @@
// Package scraper defines the NovelScraper interface and its sub-interfaces.
// Domain types live in internal/domain — this package only defines the scraping
// contract so that novelfire and any future scrapers can be swapped freely.
package scraper

import (
	"context"

	"github.com/libnovel/backend/internal/domain"
)

// CatalogueProvider can enumerate every novel available on a source site.
type CatalogueProvider interface {
	ScrapeCatalogue(ctx context.Context) (<-chan domain.CatalogueEntry, <-chan error)
}

// MetadataProvider can extract structured book metadata from a novel's landing page.
type MetadataProvider interface {
	ScrapeMetadata(ctx context.Context, bookURL string) (domain.BookMeta, error)
}

// ChapterListProvider can enumerate all chapters of a book.
// upTo > 0 stops pagination once at least upTo chapter numbers have been
// collected (early-exit optimisation for range scrapes). upTo == 0 fetches all pages.
type ChapterListProvider interface {
	ScrapeChapterList(ctx context.Context, bookURL string, upTo int) ([]domain.ChapterRef, error)
}

// ChapterTextProvider can extract the readable text from a single chapter page.
type ChapterTextProvider interface {
	ScrapeChapterText(ctx context.Context, ref domain.ChapterRef) (domain.Chapter, error)
}

// RankingProvider can enumerate novels from a ranking page.
type RankingProvider interface {
	// ScrapeRanking pages through up to maxPages ranking pages.
	// maxPages <= 0 means all pages.
	ScrapeRanking(ctx context.Context, maxPages int) (<-chan domain.BookMeta, <-chan error)
}

// NovelScraper is the full interface a concrete novel source must implement.
type NovelScraper interface {
	CatalogueProvider
	MetadataProvider
	ChapterListProvider
	ChapterTextProvider
	RankingProvider

	// SourceName returns the human-readable name of this scraper, e.g. "novelfire.net".
	SourceName() string
}
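// Editor's sketch: a conventional compile-time assertion for a concrete
// implementation, mirroring the pattern used in internal/storage/store.go
// below (the novelfire package and Scraper type are assumptions drawn from
// the package doc above):
//
//	var _ scraper.NovelScraper = (*novelfire.Scraper)(nil)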

// Selector describes how to locate an element in an HTML document.
type Selector struct {
	Tag      string
	Class    string
	ID       string
	Attr     string
	Multiple bool
}
244  backend/internal/storage/minio.go  Normal file
@@ -0,0 +1,244 @@
package storage

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"net/url"
	"path"
	"strings"
	"time"

	minio "github.com/minio/minio-go/v7"
	"github.com/minio/minio-go/v7/pkg/credentials"

	"github.com/libnovel/backend/internal/config"
)

// minioClient wraps the official minio-go client with bucket names.
type minioClient struct {
	client         *minio.Client // internal — all read/write operations
	pubClient      *minio.Client // presign-only — initialised against the public endpoint
	bucketChapters string
	bucketAudio    string
	bucketAvatars  string
	bucketBrowse   string
}

func newMinioClient(cfg config.MinIO) (*minioClient, error) {
	creds := credentials.NewStaticV4(cfg.AccessKey, cfg.SecretKey, "")

	internal, err := minio.New(cfg.Endpoint, &minio.Options{
		Creds:  creds,
		Secure: cfg.UseSSL,
	})
	if err != nil {
		return nil, fmt.Errorf("minio: init internal client: %w", err)
	}

	// Presigned URLs must be signed with the hostname the browser will use
	// (PUBLIC_MINIO_PUBLIC_URL), because AWS Signature V4 includes the Host
	// header in the canonical request — a URL signed against "minio:9000" will
	// return SignatureDoesNotMatch when the browser fetches it from
	// "localhost:9000".
	//
	// However, minio-go normally makes a live BucketLocation HTTP call before
	// signing, which would fail from inside the container when the public
	// endpoint is externally-facing (e.g. "localhost:9000" is unreachable from
	// within Docker). We prevent this by:
	// 1. Setting Region: "us-east-1" — minio-go skips getBucketLocation when
	//    the region is already known (bucket-cache.go:49).
	// 2. Setting BucketLookup: BucketLookupPath — forces path-style URLs
	//    (e.g. host/bucket/key), matching MinIO's default behaviour and
	//    avoiding any virtual-host DNS probing.
	//
	// When no public endpoint is configured (or it equals the internal one),
	// fall back to the internal client so presigning still works.
	publicEndpoint := cfg.PublicEndpoint
	if u, err2 := url.Parse(publicEndpoint); err2 == nil && u.Host != "" {
		publicEndpoint = u.Host // strip scheme so minio.New is happy
	}
	pubUseSSL := cfg.PublicUseSSL
	if publicEndpoint == "" || publicEndpoint == cfg.Endpoint {
		publicEndpoint = cfg.Endpoint
		pubUseSSL = cfg.UseSSL
	}
	pub, err := minio.New(publicEndpoint, &minio.Options{
		Creds:        creds,
		Secure:       pubUseSSL,
		Region:       "us-east-1", // skip live BucketLocation preflight
		BucketLookup: minio.BucketLookupPath,
	})
	if err != nil {
		return nil, fmt.Errorf("minio: init public client: %w", err)
	}

	return &minioClient{
		client:         internal,
		pubClient:      pub,
		bucketChapters: cfg.BucketChapters,
		bucketAudio:    cfg.BucketAudio,
		bucketAvatars:  cfg.BucketAvatars,
		bucketBrowse:   cfg.BucketBrowse,
	}, nil
}
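// Editor's hedged sketch of the flow the comment above motivates: the internal
// client writes the object, the public client signs the browser-facing URL
// (putObject and presignGet are defined later in this file; the key and
// expiry values are illustrative only):
//
//	mc, _ := newMinioClient(cfg.MinIO)
//	_ = mc.putObject(ctx, mc.bucketAudio, "slug/1/af_bella.mp3", "audio/mpeg", data)
//	url, _ := mc.presignGet(ctx, mc.bucketAudio, "slug/1/af_bella.mp3", 15*time.Minute)
//	// url is signed against PUBLIC_MINIO_PUBLIC_URL's host, so the Host header
//	// the browser sends matches the canonical request and SigV4 verifies.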

// ensureBuckets creates all required buckets if they don't already exist.
func (m *minioClient) ensureBuckets(ctx context.Context) error {
	for _, bucket := range []string{m.bucketChapters, m.bucketAudio, m.bucketAvatars, m.bucketBrowse} {
		exists, err := m.client.BucketExists(ctx, bucket)
		if err != nil {
			return fmt.Errorf("minio: check bucket %q: %w", bucket, err)
		}
		if !exists {
			if err := m.client.MakeBucket(ctx, bucket, minio.MakeBucketOptions{}); err != nil {
				return fmt.Errorf("minio: create bucket %q: %w", bucket, err)
			}
		}
	}
	return nil
}

// ── Key helpers ───────────────────────────────────────────────────────────────

// ChapterObjectKey returns the MinIO object key for a chapter markdown file.
// Format: {slug}/chapter-{n:06d}.md
func ChapterObjectKey(slug string, n int) string {
	return fmt.Sprintf("%s/chapter-%06d.md", slug, n)
}

// AudioObjectKey returns the MinIO object key for a cached audio file.
// Format: {slug}/{n}/{voice}.mp3
func AudioObjectKey(slug string, n int, voice string) string {
	return fmt.Sprintf("%s/%d/%s.mp3", slug, n, voice)
}

// AvatarObjectKey returns the MinIO object key for a user avatar image.
// Format: {userID}/{ext}.{ext}
func AvatarObjectKey(userID, ext string) string {
	return fmt.Sprintf("%s/%s.%s", userID, ext, ext)
}

// CoverObjectKey returns the MinIO object key for a book cover image.
// Format: covers/{slug}.jpg
func CoverObjectKey(slug string) string {
	return fmt.Sprintf("covers/%s.jpg", slug)
}

// chapterNumberFromKey extracts the chapter number from a MinIO object key.
// e.g. "my-book/chapter-000042.md" → 42
func chapterNumberFromKey(key string) int {
	base := path.Base(key)
	base = strings.TrimPrefix(base, "chapter-")
	base = strings.TrimSuffix(base, ".md")
	var n int
	fmt.Sscanf(base, "%d", &n)
	return n
}
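// Worked example (editor's note, values follow directly from the format
// strings above): the chapter helpers round-trip.
//
//	ChapterObjectKey("my-book", 42)                   // "my-book/chapter-000042.md"
//	chapterNumberFromKey("my-book/chapter-000042.md") // 42
//	AudioObjectKey("my-book", 42, "af_bella")         // "my-book/42/af_bella.mp3"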

// ── Object operations ─────────────────────────────────────────────────────────

func (m *minioClient) putObject(ctx context.Context, bucket, key, contentType string, data []byte) error {
	_, err := m.client.PutObject(ctx, bucket, key,
		bytes.NewReader(data), // avoids the string round-trip copy
		int64(len(data)),
		minio.PutObjectOptions{ContentType: contentType},
	)
	return err
}

func (m *minioClient) getObject(ctx context.Context, bucket, key string) ([]byte, error) {
	obj, err := m.client.GetObject(ctx, bucket, key, minio.GetObjectOptions{})
	if err != nil {
		return nil, err
	}
	defer obj.Close()
	return io.ReadAll(obj)
}

func (m *minioClient) objectExists(ctx context.Context, bucket, key string) bool {
	_, err := m.client.StatObject(ctx, bucket, key, minio.StatObjectOptions{})
	return err == nil
}

func (m *minioClient) presignGet(ctx context.Context, bucket, key string, expires time.Duration) (string, error) {
	u, err := m.pubClient.PresignedGetObject(ctx, bucket, key, expires, nil)
	if err != nil {
		return "", fmt.Errorf("minio presign %s/%s: %w", bucket, key, err)
	}
	return u.String(), nil
}

func (m *minioClient) presignPut(ctx context.Context, bucket, key string, expires time.Duration) (string, error) {
	u, err := m.pubClient.PresignedPutObject(ctx, bucket, key, expires)
	if err != nil {
		return "", fmt.Errorf("minio presign PUT %s/%s: %w", bucket, key, err)
	}
	return u.String(), nil
}

func (m *minioClient) deleteObjects(ctx context.Context, bucket, prefix string) error {
	objCh := m.client.ListObjects(ctx, bucket, minio.ListObjectsOptions{Prefix: prefix})
	for obj := range objCh {
		if obj.Err != nil {
			return obj.Err
		}
		if err := m.client.RemoveObject(ctx, bucket, obj.Key, minio.RemoveObjectOptions{}); err != nil {
			return err
		}
	}
	return nil
}

func (m *minioClient) listObjectKeys(ctx context.Context, bucket, prefix string) ([]string, error) {
	var keys []string
	for obj := range m.client.ListObjects(ctx, bucket, minio.ListObjectsOptions{Prefix: prefix}) {
		if obj.Err != nil {
			return nil, obj.Err
		}
		keys = append(keys, obj.Key)
	}
	return keys, nil
}

// ── Cover operations ──────────────────────────────────────────────────────────

// putCover stores a raw cover image in the browse bucket under covers/{slug}.jpg.
func (m *minioClient) putCover(ctx context.Context, key, contentType string, data []byte) error {
	return m.putObject(ctx, m.bucketBrowse, key, contentType, data)
}

// getCover retrieves a cover image. Returns (nil, false, nil) when the
// object does not exist.
func (m *minioClient) getCover(ctx context.Context, key string) ([]byte, bool, error) {
	if !m.objectExists(ctx, m.bucketBrowse, key) {
		return nil, false, nil
	}
	data, err := m.getObject(ctx, m.bucketBrowse, key)
	if err != nil {
		return nil, false, err
	}
	return data, true, nil
}

// coverExists returns true when the cover image object exists.
func (m *minioClient) coverExists(ctx context.Context, key string) bool {
	return m.objectExists(ctx, m.bucketBrowse, key)
}

// coverContentType inspects the first bytes of data to determine whether it
// is a PNG or WebP image. Falls back to "image/jpeg".
func coverContentType(data []byte) string {
	if len(data) >= 4 {
		// PNG magic: 0x89 0x50 0x4E 0x47
		if data[0] == 0x89 && data[1] == 0x50 && data[2] == 0x4E && data[3] == 0x47 {
			return "image/png"
		}
		// WebP: starts with "RIFF" at 0..3 and "WEBP" at 8..11
		if len(data) >= 12 && data[0] == 'R' && data[1] == 'I' && data[2] == 'F' && data[3] == 'F' &&
			data[8] == 'W' && data[9] == 'E' && data[10] == 'B' && data[11] == 'P' {
			return "image/webp"
		}
	}
	return "image/jpeg"
}
268  backend/internal/storage/pocketbase.go  Normal file
@@ -0,0 +1,268 @@
// Package storage provides the concrete implementations of all bookstore and
// taskqueue interfaces backed by PocketBase (structured data) and MinIO (blobs).
//
// Entry point: NewStore(ctx, cfg, log) returns a *Store that satisfies every
// interface defined in bookstore and taskqueue.
package storage

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"net/url"
	"strings"
	"sync"
	"time"

	"github.com/libnovel/backend/internal/config"
	"github.com/libnovel/backend/internal/domain"
)

// ErrNotFound is returned by single-record lookups when no record exists.
var ErrNotFound = errors.New("storage: record not found")

// pbClient is the internal PocketBase REST admin client.
type pbClient struct {
	baseURL  string
	email    string
	password string
	log      *slog.Logger

	mu    sync.Mutex
	token string
	exp   time.Time
}

func newPBClient(cfg config.PocketBase, log *slog.Logger) *pbClient {
	return &pbClient{
		baseURL:  strings.TrimRight(cfg.URL, "/"),
		email:    cfg.AdminEmail,
		password: cfg.AdminPassword,
		log:      log,
	}
}

// authToken returns a valid admin auth token, refreshing it when expired.
func (c *pbClient) authToken(ctx context.Context) (string, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.token != "" && time.Now().Before(c.exp) {
		return c.token, nil
	}

	body, _ := json.Marshal(map[string]string{
		"identity": c.email,
		"password": c.password,
	})
	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		c.baseURL+"/api/collections/_superusers/auth-with-password", bytes.NewReader(body))
	if err != nil {
		return "", fmt.Errorf("pb auth: build request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("pb auth: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		raw, _ := io.ReadAll(resp.Body)
		return "", fmt.Errorf("pb auth: status %d: %s", resp.StatusCode, string(raw))
	}

	var payload struct {
		Token string `json:"token"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", fmt.Errorf("pb auth: decode: %w", err)
	}
	c.token = payload.Token
	c.exp = time.Now().Add(30 * time.Minute)
	return c.token, nil
}

// do executes an authenticated PocketBase REST request.
func (c *pbClient) do(ctx context.Context, method, path string, body io.Reader) (*http.Response, error) {
	tok, err := c.authToken(ctx)
	if err != nil {
		return nil, err
	}

	req, err := http.NewRequestWithContext(ctx, method, c.baseURL+path, body)
	if err != nil {
		return nil, fmt.Errorf("pb: build request %s %s: %w", method, path, err)
	}
	req.Header.Set("Authorization", tok)
	if body != nil {
		req.Header.Set("Content-Type", "application/json")
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("pb: %s %s: %w", method, path, err)
	}
	return resp, nil
}

// get is a convenience wrapper that decodes a JSON response into v.
func (c *pbClient) get(ctx context.Context, path string, v any) error {
	resp, err := c.do(ctx, http.MethodGet, path, nil)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusNotFound {
		return ErrNotFound
	}
	if resp.StatusCode >= 400 {
		raw, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("pb GET %s: status %d: %s", path, resp.StatusCode, string(raw))
	}
	return json.NewDecoder(resp.Body).Decode(v)
}

// post creates a record and decodes the created record into v.
func (c *pbClient) post(ctx context.Context, path string, payload, v any) error {
	b, err := json.Marshal(payload)
	if err != nil {
		return fmt.Errorf("pb: marshal: %w", err)
	}
	resp, err := c.do(ctx, http.MethodPost, path, bytes.NewReader(b))
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 400 {
		raw, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("pb POST %s: status %d: %s", path, resp.StatusCode, string(raw))
	}
	if v != nil {
		return json.NewDecoder(resp.Body).Decode(v)
	}
	return nil
}

// patch updates a record.
func (c *pbClient) patch(ctx context.Context, path string, payload any) error {
	b, err := json.Marshal(payload)
	if err != nil {
		return fmt.Errorf("pb: marshal: %w", err)
	}
	resp, err := c.do(ctx, http.MethodPatch, path, bytes.NewReader(b))
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 400 {
		raw, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("pb PATCH %s: status %d: %s", path, resp.StatusCode, string(raw))
	}
	return nil
}

// delete removes a record.
func (c *pbClient) delete(ctx context.Context, path string) error {
	resp, err := c.do(ctx, http.MethodDelete, path, nil)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusNotFound {
		return ErrNotFound
	}
	if resp.StatusCode >= 400 {
		raw, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("pb DELETE %s: status %d: %s", path, resp.StatusCode, string(raw))
	}
	return nil
}

// listAll fetches all pages of a collection. PocketBase returns at most 200
// records per page; we paginate until the last page is reached.
func (c *pbClient) listAll(ctx context.Context, collection string, filter, sort string) ([]json.RawMessage, error) {
	var all []json.RawMessage
	page := 1
	for {
		q := url.Values{
			"page":    {fmt.Sprintf("%d", page)},
			"perPage": {"200"},
		}
		if filter != "" {
			q.Set("filter", filter)
		}
		if sort != "" {
			q.Set("sort", sort)
		}
		path := fmt.Sprintf("/api/collections/%s/records?%s", collection, q.Encode())

		var result struct {
			Items []json.RawMessage `json:"items"`
			Page  int               `json:"page"`
			Pages int               `json:"totalPages"`
		}
		if err := c.get(ctx, path, &result); err != nil {
			return nil, err
		}
		all = append(all, result.Items...)
		if result.Page >= result.Pages {
			break
		}
		page++
	}
	return all, nil
}

// claimRecord claims the first pending record in collection: it fetches the
// oldest pending record (filter + sort), then PATCHes it with the claim
// payload. This is a best-effort claim, not a true compare-and-swap (see the
// note below). Returns (nil, nil) when the queue is empty.
func (c *pbClient) claimRecord(ctx context.Context, collection, workerID string, extraClaim map[string]any) (json.RawMessage, error) {
	q := url.Values{}
	q.Set("filter", `status="pending"`)
	q.Set("sort", "+started")
	q.Set("perPage", "1")
	path := fmt.Sprintf("/api/collections/%s/records?%s", collection, q.Encode())

	var result struct {
		Items []json.RawMessage `json:"items"`
	}
	if err := c.get(ctx, path, &result); err != nil {
		return nil, fmt.Errorf("claimRecord list: %w", err)
	}
	if len(result.Items) == 0 {
		return nil, nil // queue empty
	}

	var rec struct {
		ID string `json:"id"`
	}
	if err := json.Unmarshal(result.Items[0], &rec); err != nil {
		return nil, fmt.Errorf("claimRecord parse id: %w", err)
	}

	claim := map[string]any{
		"status":    string(domain.TaskStatusRunning),
		"worker_id": workerID,
	}
	for k, v := range extraClaim {
		claim[k] = v
	}

	claimPath := fmt.Sprintf("/api/collections/%s/records/%s", collection, rec.ID)
	if err := c.patch(ctx, claimPath, claim); err != nil {
		return nil, fmt.Errorf("claimRecord patch: %w", err)
	}

	// Re-fetch the updated record so caller has current state.
	var updated json.RawMessage
	if err := c.get(ctx, claimPath, &updated); err != nil {
		return nil, fmt.Errorf("claimRecord re-fetch: %w", err)
	}
	return updated, nil
}
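// Editor's hedged note: with several workers the list-then-PATCH above can
// double-claim the same record, since PocketBase offers no compare-and-swap.
// Because claimRecord already re-fetches the record, one cheap mitigation is
// to verify ownership before treating the claim as won (sketch, not in the
// diff):
//
//	var claimed struct {
//		WorkerID string `json:"worker_id"`
//	}
//	_ = json.Unmarshal(updated, &claimed)
//	if claimed.WorkerID != workerID {
//		return nil, nil // lost the race; another worker overwrote the claim
//	}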
820  backend/internal/storage/store.go  Normal file
@@ -0,0 +1,820 @@
package storage

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"time"

	"github.com/libnovel/backend/internal/bookstore"
	"github.com/libnovel/backend/internal/config"
	"github.com/libnovel/backend/internal/domain"
	"github.com/libnovel/backend/internal/taskqueue"
)

// Store is the unified persistence implementation that satisfies all bookstore
// and taskqueue interfaces. It routes structured data to PocketBase and binary
// blobs to MinIO.
type Store struct {
	pb  *pbClient
	mc  *minioClient
	log *slog.Logger
}

// NewStore initialises PocketBase and MinIO connections and ensures all MinIO
// buckets exist. Returns a ready-to-use Store.
func NewStore(ctx context.Context, cfg config.Config, log *slog.Logger) (*Store, error) {
	pb := newPBClient(cfg.PocketBase, log)
	// Validate PocketBase connectivity by fetching an auth token.
	if _, err := pb.authToken(ctx); err != nil {
		return nil, fmt.Errorf("pocketbase: %w", err)
	}

	mc, err := newMinioClient(cfg.MinIO)
	if err != nil {
		return nil, fmt.Errorf("minio: %w", err)
	}
	if err := mc.ensureBuckets(ctx); err != nil {
		return nil, fmt.Errorf("minio: ensure buckets: %w", err)
	}

	return &Store{pb: pb, mc: mc, log: log}, nil
}

// Compile-time interface satisfaction.
var _ bookstore.BookWriter = (*Store)(nil)
var _ bookstore.BookReader = (*Store)(nil)
var _ bookstore.RankingStore = (*Store)(nil)
var _ bookstore.AudioStore = (*Store)(nil)
var _ bookstore.PresignStore = (*Store)(nil)
var _ bookstore.ProgressStore = (*Store)(nil)
var _ bookstore.CoverStore = (*Store)(nil)
var _ taskqueue.Producer = (*Store)(nil)
var _ taskqueue.Consumer = (*Store)(nil)
var _ taskqueue.Reader = (*Store)(nil)

// ── BookWriter ────────────────────────────────────────────────────────────────

func (s *Store) WriteMetadata(ctx context.Context, meta domain.BookMeta) error {
	payload := map[string]any{
		"slug":           meta.Slug,
		"title":          meta.Title,
		"author":         meta.Author,
		"cover":          meta.Cover,
		"status":         meta.Status,
		"genres":         meta.Genres,
		"summary":        meta.Summary,
		"total_chapters": meta.TotalChapters,
		"source_url":     meta.SourceURL,
		"ranking":        meta.Ranking,
		"rating":         meta.Rating,
	}
	// Upsert via filter: if exists PATCH, otherwise POST.
	existing, err := s.getBookBySlug(ctx, meta.Slug)
	if err != nil && !errors.Is(err, ErrNotFound) {
		return fmt.Errorf("WriteMetadata: %w", err)
	}
	if errors.Is(err, ErrNotFound) {
		return s.pb.post(ctx, "/api/collections/books/records", payload, nil)
	}
	return s.pb.patch(ctx, fmt.Sprintf("/api/collections/books/records/%s", existing.ID), payload)
}

func (s *Store) WriteChapter(ctx context.Context, slug string, chapter domain.Chapter) error {
	key := ChapterObjectKey(slug, chapter.Ref.Number)
	if err := s.mc.putObject(ctx, s.mc.bucketChapters, key, "text/markdown", []byte(chapter.Text)); err != nil {
		return fmt.Errorf("WriteChapter: minio: %w", err)
	}
	// Upsert the chapters_idx record in PocketBase.
	return s.upsertChapterIdx(ctx, slug, chapter.Ref)
}

func (s *Store) WriteChapterRefs(ctx context.Context, slug string, refs []domain.ChapterRef) error {
	for _, ref := range refs {
		if err := s.upsertChapterIdx(ctx, slug, ref); err != nil {
			s.log.Warn("WriteChapterRefs: upsert failed", "slug", slug, "chapter", ref.Number, "err", err)
		}
	}
	return nil
}

func (s *Store) ChapterExists(ctx context.Context, slug string, ref domain.ChapterRef) bool {
	return s.mc.objectExists(ctx, s.mc.bucketChapters, ChapterObjectKey(slug, ref.Number))
}

func (s *Store) upsertChapterIdx(ctx context.Context, slug string, ref domain.ChapterRef) error {
	payload := map[string]any{
		"slug":   slug,
		"number": ref.Number,
		"title":  ref.Title,
	}
	filter := fmt.Sprintf(`slug=%q&&number=%d`, slug, ref.Number)
	items, err := s.pb.listAll(ctx, "chapters_idx", filter, "")
	if err != nil && !errors.Is(err, ErrNotFound) {
		return err
	}
	if len(items) == 0 {
		return s.pb.post(ctx, "/api/collections/chapters_idx/records", payload, nil)
	}
	var rec struct {
		ID string `json:"id"`
	}
	_ = json.Unmarshal(items[0], &rec)
	return s.pb.patch(ctx, fmt.Sprintf("/api/collections/chapters_idx/records/%s", rec.ID), payload)
}

// ── BookReader ────────────────────────────────────────────────────────────────

type pbBook struct {
	ID            string   `json:"id"`
	Slug          string   `json:"slug"`
	Title         string   `json:"title"`
	Author        string   `json:"author"`
	Cover         string   `json:"cover"`
	Status        string   `json:"status"`
	Genres        []string `json:"genres"`
	Summary       string   `json:"summary"`
	TotalChapters int      `json:"total_chapters"`
	SourceURL     string   `json:"source_url"`
	Ranking       int      `json:"ranking"`
	Rating        float64  `json:"rating"`
	Updated       string   `json:"updated"`
}

func (b pbBook) toDomain() domain.BookMeta {
	var metaUpdated int64
	if t, err := time.Parse(time.RFC3339, b.Updated); err == nil {
		metaUpdated = t.Unix()
	}
	return domain.BookMeta{
		Slug:          b.Slug,
		Title:         b.Title,
		Author:        b.Author,
		Cover:         b.Cover,
		Status:        b.Status,
		Genres:        b.Genres,
		Summary:       b.Summary,
		TotalChapters: b.TotalChapters,
		SourceURL:     b.SourceURL,
		Ranking:       b.Ranking,
		Rating:        b.Rating,
		MetaUpdated:   metaUpdated,
	}
}

func (s *Store) getBookBySlug(ctx context.Context, slug string) (pbBook, error) {
	filter := fmt.Sprintf(`slug=%q`, slug)
	items, err := s.pb.listAll(ctx, "books", filter, "")
	if err != nil {
		return pbBook{}, err
	}
	if len(items) == 0 {
		return pbBook{}, ErrNotFound
	}
	var b pbBook
	_ = json.Unmarshal(items[0], &b)
	return b, nil
}

func (s *Store) ReadMetadata(ctx context.Context, slug string) (domain.BookMeta, bool, error) {
	b, err := s.getBookBySlug(ctx, slug)
	if errors.Is(err, ErrNotFound) {
		return domain.BookMeta{}, false, nil
	}
	if err != nil {
		return domain.BookMeta{}, false, err
	}
	return b.toDomain(), true, nil
}

func (s *Store) ListBooks(ctx context.Context) ([]domain.BookMeta, error) {
	items, err := s.pb.listAll(ctx, "books", "", "title")
	if err != nil {
		return nil, err
	}
	books := make([]domain.BookMeta, 0, len(items))
	for _, raw := range items {
		var b pbBook
		_ = json.Unmarshal(raw, &b)
		books = append(books, b.toDomain())
	}
	return books, nil
}

func (s *Store) LocalSlugs(ctx context.Context) (map[string]bool, error) {
	items, err := s.pb.listAll(ctx, "books", "", "")
	if err != nil {
		return nil, err
	}
	slugs := make(map[string]bool, len(items))
	for _, raw := range items {
		var b struct {
			Slug string `json:"slug"`
		}
		_ = json.Unmarshal(raw, &b)
		if b.Slug != "" {
			slugs[b.Slug] = true
		}
	}
	return slugs, nil
}

func (s *Store) MetadataMtime(ctx context.Context, slug string) int64 {
	b, err := s.getBookBySlug(ctx, slug)
	if err != nil {
		return 0
	}
	t, err := time.Parse(time.RFC3339, b.Updated)
	if err != nil {
		return 0
	}
	return t.Unix()
}

func (s *Store) ReadChapter(ctx context.Context, slug string, n int) (string, error) {
	data, err := s.mc.getObject(ctx, s.mc.bucketChapters, ChapterObjectKey(slug, n))
	if err != nil {
		return "", fmt.Errorf("ReadChapter: %w", err)
	}
	return string(data), nil
}

func (s *Store) ListChapters(ctx context.Context, slug string) ([]domain.ChapterInfo, error) {
	filter := fmt.Sprintf(`slug=%q`, slug)
	items, err := s.pb.listAll(ctx, "chapters_idx", filter, "number")
	if err != nil {
		return nil, err
	}
	chapters := make([]domain.ChapterInfo, 0, len(items))
	for _, raw := range items {
		var rec struct {
			Number int    `json:"number"`
			Title  string `json:"title"`
		}
		_ = json.Unmarshal(raw, &rec)
		chapters = append(chapters, domain.ChapterInfo{Number: rec.Number, Title: rec.Title})
	}
	return chapters, nil
}

func (s *Store) CountChapters(ctx context.Context, slug string) int {
	chapters, err := s.ListChapters(ctx, slug)
	if err != nil {
		return 0
	}
	return len(chapters)
}

func (s *Store) ReindexChapters(ctx context.Context, slug string) (int, error) {
	keys, err := s.mc.listObjectKeys(ctx, s.mc.bucketChapters, slug+"/")
	if err != nil {
		return 0, fmt.Errorf("ReindexChapters: list objects: %w", err)
	}
	count := 0
	for _, key := range keys {
		if !strings.HasSuffix(key, ".md") {
			continue
		}
		n := chapterNumberFromKey(key)
		if n == 0 {
			continue
		}
		ref := domain.ChapterRef{Number: n}
		if err := s.upsertChapterIdx(ctx, slug, ref); err != nil {
			s.log.Warn("ReindexChapters: upsert failed", "key", key, "err", err)
|
||||
continue
|
||||
}
|
||||
count++
|
||||
}
|
||||
return count, nil
|
||||
}
|
||||
|
||||
// ── RankingStore ──────────────────────────────────────────────────────────────
|
||||
|
||||
func (s *Store) WriteRankingItem(ctx context.Context, item domain.RankingItem) error {
|
||||
payload := map[string]any{
|
||||
"rank": item.Rank,
|
||||
"slug": item.Slug,
|
||||
"title": item.Title,
|
||||
"author": item.Author,
|
||||
"cover": item.Cover,
|
||||
"status": item.Status,
|
||||
"genres": item.Genres,
|
||||
"source_url": item.SourceURL,
|
||||
}
|
||||
filter := fmt.Sprintf(`slug=%q`, item.Slug)
|
||||
items, err := s.pb.listAll(ctx, "ranking", filter, "")
|
||||
if err != nil && err != ErrNotFound {
|
||||
return err
|
||||
}
|
||||
if len(items) == 0 {
|
||||
return s.pb.post(ctx, "/api/collections/ranking/records", payload, nil)
|
||||
}
|
||||
var rec struct {
|
||||
ID string `json:"id"`
|
||||
}
|
||||
json.Unmarshal(items[0], &rec)
|
||||
return s.pb.patch(ctx, fmt.Sprintf("/api/collections/ranking/records/%s", rec.ID), payload)
|
||||
}
|
||||
|
||||
func (s *Store) ReadRankingItems(ctx context.Context) ([]domain.RankingItem, error) {
|
||||
items, err := s.pb.listAll(ctx, "ranking", "", "rank")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
result := make([]domain.RankingItem, 0, len(items))
|
||||
for _, raw := range items {
|
||||
var rec struct {
|
||||
Rank int `json:"rank"`
|
||||
Slug string `json:"slug"`
|
||||
Title string `json:"title"`
|
||||
Author string `json:"author"`
|
||||
Cover string `json:"cover"`
|
||||
Status string `json:"status"`
|
||||
Genres []string `json:"genres"`
|
||||
SourceURL string `json:"source_url"`
|
||||
Updated string `json:"updated"`
|
||||
}
|
||||
json.Unmarshal(raw, &rec)
|
||||
t, _ := time.Parse(time.RFC3339, rec.Updated)
|
||||
result = append(result, domain.RankingItem{
|
||||
Rank: rec.Rank,
|
||||
Slug: rec.Slug,
|
||||
Title: rec.Title,
|
||||
Author: rec.Author,
|
||||
Cover: rec.Cover,
|
||||
Status: rec.Status,
|
||||
Genres: rec.Genres,
|
||||
SourceURL: rec.SourceURL,
|
||||
Updated: t,
|
||||
})
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (s *Store) RankingFreshEnough(ctx context.Context, maxAge time.Duration) (bool, error) {
|
||||
items, err := s.ReadRankingItems(ctx)
|
||||
if err != nil || len(items) == 0 {
|
||||
return false, err
|
||||
}
|
||||
var latest time.Time
|
||||
for _, item := range items {
|
||||
if item.Updated.After(latest) {
|
||||
latest = item.Updated
|
||||
}
|
||||
}
|
||||
return time.Since(latest) < maxAge, nil
|
||||
}
|
||||
|
||||
// ── AudioStore ────────────────────────────────────────────────────────────────
|
||||
|
||||
func (s *Store) AudioObjectKey(slug string, n int, voice string) string {
|
||||
return AudioObjectKey(slug, n, voice)
|
||||
}
|
||||
|
||||
func (s *Store) AudioExists(ctx context.Context, key string) bool {
|
||||
return s.mc.objectExists(ctx, s.mc.bucketAudio, key)
|
||||
}
|
||||
|
||||
func (s *Store) PutAudio(ctx context.Context, key string, data []byte) error {
|
||||
return s.mc.putObject(ctx, s.mc.bucketAudio, key, "audio/mpeg", data)
|
||||
}
|
||||
|
||||
// ── PresignStore ──────────────────────────────────────────────────────────────
|
||||
|
||||
func (s *Store) PresignChapter(ctx context.Context, slug string, n int, expires time.Duration) (string, error) {
|
||||
return s.mc.presignGet(ctx, s.mc.bucketChapters, ChapterObjectKey(slug, n), expires)
|
||||
}
|
||||
|
||||
func (s *Store) PresignAudio(ctx context.Context, key string, expires time.Duration) (string, error) {
|
||||
return s.mc.presignGet(ctx, s.mc.bucketAudio, key, expires)
|
||||
}
|
||||
|
||||
func (s *Store) PresignAvatarUpload(ctx context.Context, userID, ext string) (uploadURL, key string, err error) {
|
||||
key = AvatarObjectKey(userID, ext)
|
||||
uploadURL, err = s.mc.presignPut(ctx, s.mc.bucketAvatars, key, 15*time.Minute)
|
||||
return
|
||||
}
|
||||
|
||||
func (s *Store) PresignAvatarURL(ctx context.Context, userID string) (string, bool, error) {
|
||||
for _, ext := range []string{"jpg", "png", "webp"} {
|
||||
key := AvatarObjectKey(userID, ext)
|
||||
if s.mc.objectExists(ctx, s.mc.bucketAvatars, key) {
|
||||
u, err := s.mc.presignGet(ctx, s.mc.bucketAvatars, key, 1*time.Hour)
|
||||
return u, true, err
|
||||
}
|
||||
}
|
||||
return "", false, nil
|
||||
}
|
||||
|
||||
func (s *Store) PutAvatar(ctx context.Context, userID, ext, contentType string, data []byte) (string, error) {
|
||||
// Delete existing avatar objects for this user before writing the new one
|
||||
// so old extensions don't linger (e.g. old .png after uploading a .jpg).
|
||||
_ = s.mc.deleteObjects(ctx, s.mc.bucketAvatars, userID+"/")
|
||||
key := AvatarObjectKey(userID, ext)
|
||||
if err := s.mc.putObject(ctx, s.mc.bucketAvatars, key, contentType, data); err != nil {
|
||||
return "", fmt.Errorf("put avatar: %w", err)
|
||||
}
|
||||
return key, nil
|
||||
}
|
||||
|
||||
func (s *Store) DeleteAvatar(ctx context.Context, userID string) error {
|
||||
return s.mc.deleteObjects(ctx, s.mc.bucketAvatars, userID+"/")
|
||||
}
|
||||
|
||||
// ── ProgressStore ─────────────────────────────────────────────────────────────
|
||||
|
||||
func (s *Store) GetProgress(ctx context.Context, sessionID, slug string) (domain.ReadingProgress, bool) {
|
||||
filter := fmt.Sprintf(`session_id=%q&&slug=%q`, sessionID, slug)
|
||||
items, err := s.pb.listAll(ctx, "progress", filter, "")
|
||||
if err != nil || len(items) == 0 {
|
||||
return domain.ReadingProgress{}, false
|
||||
}
|
||||
var rec struct {
|
||||
Slug string `json:"slug"`
|
||||
Chapter int `json:"chapter"`
|
||||
UpdatedAt string `json:"updated"`
|
||||
}
|
||||
json.Unmarshal(items[0], &rec)
|
||||
t, _ := time.Parse(time.RFC3339, rec.UpdatedAt)
|
||||
return domain.ReadingProgress{Slug: rec.Slug, Chapter: rec.Chapter, UpdatedAt: t}, true
|
||||
}
|
||||
|
||||
func (s *Store) SetProgress(ctx context.Context, sessionID string, p domain.ReadingProgress) error {
|
||||
payload := map[string]any{
|
||||
"session_id": sessionID,
|
||||
"slug": p.Slug,
|
||||
"chapter": p.Chapter,
|
||||
}
|
||||
filter := fmt.Sprintf(`session_id=%q&&slug=%q`, sessionID, p.Slug)
|
||||
items, err := s.pb.listAll(ctx, "progress", filter, "")
|
||||
if err != nil && err != ErrNotFound {
|
||||
return err
|
||||
}
|
||||
if len(items) == 0 {
|
||||
return s.pb.post(ctx, "/api/collections/progress/records", payload, nil)
|
||||
}
|
||||
var rec struct {
|
||||
ID string `json:"id"`
|
||||
}
|
||||
json.Unmarshal(items[0], &rec)
|
||||
return s.pb.patch(ctx, fmt.Sprintf("/api/collections/progress/records/%s", rec.ID), payload)
|
||||
}
|
||||
|
||||
func (s *Store) AllProgress(ctx context.Context, sessionID string) ([]domain.ReadingProgress, error) {
|
||||
filter := fmt.Sprintf(`session_id=%q`, sessionID)
|
||||
items, err := s.pb.listAll(ctx, "progress", filter, "-updated")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
result := make([]domain.ReadingProgress, 0, len(items))
|
||||
for _, raw := range items {
|
||||
var rec struct {
|
||||
Slug string `json:"slug"`
|
||||
Chapter int `json:"chapter"`
|
||||
UpdatedAt string `json:"updated"`
|
||||
}
|
||||
json.Unmarshal(raw, &rec)
|
||||
t, _ := time.Parse(time.RFC3339, rec.UpdatedAt)
|
||||
result = append(result, domain.ReadingProgress{Slug: rec.Slug, Chapter: rec.Chapter, UpdatedAt: t})
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (s *Store) DeleteProgress(ctx context.Context, sessionID, slug string) error {
|
||||
filter := fmt.Sprintf(`session_id=%q&&slug=%q`, sessionID, slug)
|
||||
items, err := s.pb.listAll(ctx, "progress", filter, "")
|
||||
if err != nil || len(items) == 0 {
|
||||
return nil
|
||||
}
|
||||
var rec struct {
|
||||
ID string `json:"id"`
|
||||
}
|
||||
json.Unmarshal(items[0], &rec)
|
||||
return s.pb.delete(ctx, fmt.Sprintf("/api/collections/progress/records/%s", rec.ID))
|
||||
}
|
||||
|
||||
// ── taskqueue.Producer ────────────────────────────────────────────────────────
|
||||
|
||||
func (s *Store) CreateScrapeTask(ctx context.Context, kind, targetURL string, fromChapter, toChapter int) (string, error) {
|
||||
payload := map[string]any{
|
||||
"kind": kind,
|
||||
"target_url": targetURL,
|
||||
"from_chapter": fromChapter,
|
||||
"to_chapter": toChapter,
|
||||
"status": string(domain.TaskStatusPending),
|
||||
"started": time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
var rec struct {
|
||||
ID string `json:"id"`
|
||||
}
|
||||
if err := s.pb.post(ctx, "/api/collections/scraping_tasks/records", payload, &rec); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return rec.ID, nil
|
||||
}
|
||||
|
||||
func (s *Store) CreateAudioTask(ctx context.Context, slug string, chapter int, voice string) (string, error) {
|
||||
cacheKey := fmt.Sprintf("%s/%d/%s", slug, chapter, voice)
|
||||
payload := map[string]any{
|
||||
"cache_key": cacheKey,
|
||||
"slug": slug,
|
||||
"chapter": chapter,
|
||||
"voice": voice,
|
||||
"status": string(domain.TaskStatusPending),
|
||||
"started": time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
var rec struct {
|
||||
ID string `json:"id"`
|
||||
}
|
||||
if err := s.pb.post(ctx, "/api/collections/audio_jobs/records", payload, &rec); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return rec.ID, nil
|
||||
}
|
||||
|
||||
func (s *Store) CancelTask(ctx context.Context, id string) error {
|
||||
// Try scraping_tasks first, then audio_jobs.
|
||||
if err := s.pb.patch(ctx, fmt.Sprintf("/api/collections/scraping_tasks/records/%s", id),
|
||||
map[string]string{"status": string(domain.TaskStatusCancelled)}); err == nil {
|
||||
return nil
|
||||
}
|
||||
return s.pb.patch(ctx, fmt.Sprintf("/api/collections/audio_jobs/records/%s", id),
|
||||
map[string]string{"status": string(domain.TaskStatusCancelled)})
|
||||
}
|
||||
|
||||
// ── taskqueue.Consumer ────────────────────────────────────────────────────────
|
||||
|
||||
func (s *Store) ClaimNextScrapeTask(ctx context.Context, workerID string) (domain.ScrapeTask, bool, error) {
|
||||
raw, err := s.pb.claimRecord(ctx, "scraping_tasks", workerID, nil)
|
||||
if err != nil {
|
||||
return domain.ScrapeTask{}, false, err
|
||||
}
|
||||
if raw == nil {
|
||||
return domain.ScrapeTask{}, false, nil
|
||||
}
|
||||
task, err := parseScrapeTask(raw)
|
||||
return task, err == nil, err
|
||||
}
|
||||
|
||||
func (s *Store) ClaimNextAudioTask(ctx context.Context, workerID string) (domain.AudioTask, bool, error) {
|
||||
raw, err := s.pb.claimRecord(ctx, "audio_jobs", workerID, nil)
|
||||
if err != nil {
|
||||
return domain.AudioTask{}, false, err
|
||||
}
|
||||
if raw == nil {
|
||||
return domain.AudioTask{}, false, nil
|
||||
}
|
||||
task, err := parseAudioTask(raw)
|
||||
return task, err == nil, err
|
||||
}
|
||||
|
||||
func (s *Store) FinishScrapeTask(ctx context.Context, id string, result domain.ScrapeResult) error {
|
||||
status := string(domain.TaskStatusDone)
|
||||
if result.ErrorMessage != "" {
|
||||
status = string(domain.TaskStatusFailed)
|
||||
}
|
||||
return s.pb.patch(ctx, fmt.Sprintf("/api/collections/scraping_tasks/records/%s", id), map[string]any{
|
||||
"status": status,
|
||||
"books_found": result.BooksFound,
|
||||
"chapters_scraped": result.ChaptersScraped,
|
||||
"chapters_skipped": result.ChaptersSkipped,
|
||||
"errors": result.Errors,
|
||||
"error_message": result.ErrorMessage,
|
||||
"finished": time.Now().UTC().Format(time.RFC3339),
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Store) FinishAudioTask(ctx context.Context, id string, result domain.AudioResult) error {
|
||||
status := string(domain.TaskStatusDone)
|
||||
if result.ErrorMessage != "" {
|
||||
status = string(domain.TaskStatusFailed)
|
||||
}
|
||||
return s.pb.patch(ctx, fmt.Sprintf("/api/collections/audio_jobs/records/%s", id), map[string]any{
|
||||
"status": status,
|
||||
"error_message": result.ErrorMessage,
|
||||
"finished": time.Now().UTC().Format(time.RFC3339),
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Store) FailTask(ctx context.Context, id, errMsg string) error {
|
||||
payload := map[string]any{
|
||||
"status": string(domain.TaskStatusFailed),
|
||||
"error_message": errMsg,
|
||||
"finished": time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
if err := s.pb.patch(ctx, fmt.Sprintf("/api/collections/scraping_tasks/records/%s", id), payload); err == nil {
|
||||
return nil
|
||||
}
|
||||
return s.pb.patch(ctx, fmt.Sprintf("/api/collections/audio_jobs/records/%s", id), payload)
|
||||
}
|
||||
|
||||
// HeartbeatTask updates the heartbeat_at field on a running task.
|
||||
// Tries scraping_tasks first, then audio_jobs (same pattern as FailTask).
|
||||
func (s *Store) HeartbeatTask(ctx context.Context, id string) error {
|
||||
payload := map[string]any{
|
||||
"heartbeat_at": time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
if err := s.pb.patch(ctx, fmt.Sprintf("/api/collections/scraping_tasks/records/%s", id), payload); err == nil {
|
||||
return nil
|
||||
}
|
||||
return s.pb.patch(ctx, fmt.Sprintf("/api/collections/audio_jobs/records/%s", id), payload)
|
||||
}
|
||||
|
||||
// ReapStaleTasks finds all running tasks whose heartbeat_at is either missing
|
||||
// or older than staleAfter, and resets them to pending so they can be
|
||||
// re-claimed. Returns the number of tasks reaped.
|
||||
func (s *Store) ReapStaleTasks(ctx context.Context, staleAfter time.Duration) (int, error) {
|
||||
threshold := time.Now().UTC().Add(-staleAfter).Format(time.RFC3339)
|
||||
// Match tasks that are running AND (heartbeat_at is null OR heartbeat_at < threshold).
|
||||
// PocketBase datetime fields require `=null` not `=""` in filter expressions.
|
||||
filter := fmt.Sprintf(`status="running"&&(heartbeat_at=null||heartbeat_at<"%s")`, threshold)
|
||||
resetPayload := map[string]any{
|
||||
"status": string(domain.TaskStatusPending),
|
||||
"worker_id": "",
|
||||
"heartbeat_at": nil,
|
||||
}
|
||||
|
||||
total := 0
|
||||
for _, collection := range []string{"scraping_tasks", "audio_jobs"} {
|
||||
items, err := s.pb.listAll(ctx, collection, filter, "")
|
||||
if err != nil {
|
||||
return total, fmt.Errorf("ReapStaleTasks list %s: %w", collection, err)
|
||||
}
|
||||
for _, raw := range items {
|
||||
var rec struct {
|
||||
ID string `json:"id"`
|
||||
}
|
||||
if err := json.Unmarshal(raw, &rec); err != nil || rec.ID == "" {
|
||||
continue
|
||||
}
|
||||
path := fmt.Sprintf("/api/collections/%s/records/%s", collection, rec.ID)
|
||||
if err := s.pb.patch(ctx, path, resetPayload); err != nil {
|
||||
s.log.Warn("ReapStaleTasks: patch failed", "collection", collection, "id", rec.ID, "err", err)
|
||||
continue
|
||||
}
|
||||
total++
|
||||
}
|
||||
}
|
||||
return total, nil
|
||||
}
|
||||
|
||||
// ── taskqueue.Reader ──────────────────────────────────────────────────────────
|
||||
|
||||
func (s *Store) ListScrapeTasks(ctx context.Context) ([]domain.ScrapeTask, error) {
|
||||
items, err := s.pb.listAll(ctx, "scraping_tasks", "", "-started")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tasks := make([]domain.ScrapeTask, 0, len(items))
|
||||
for _, raw := range items {
|
||||
t, err := parseScrapeTask(raw)
|
||||
if err == nil {
|
||||
tasks = append(tasks, t)
|
||||
}
|
||||
}
|
||||
return tasks, nil
|
||||
}
|
||||
|
||||
func (s *Store) GetScrapeTask(ctx context.Context, id string) (domain.ScrapeTask, bool, error) {
|
||||
var raw json.RawMessage
|
||||
if err := s.pb.get(ctx, fmt.Sprintf("/api/collections/scraping_tasks/records/%s", id), &raw); err != nil {
|
||||
if err == ErrNotFound {
|
||||
return domain.ScrapeTask{}, false, nil
|
||||
}
|
||||
return domain.ScrapeTask{}, false, err
|
||||
}
|
||||
t, err := parseScrapeTask(raw)
|
||||
return t, err == nil, err
|
||||
}
|
||||
|
||||
func (s *Store) ListAudioTasks(ctx context.Context) ([]domain.AudioTask, error) {
|
||||
items, err := s.pb.listAll(ctx, "audio_jobs", "", "-started")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tasks := make([]domain.AudioTask, 0, len(items))
|
||||
for _, raw := range items {
|
||||
t, err := parseAudioTask(raw)
|
||||
if err == nil {
|
||||
tasks = append(tasks, t)
|
||||
}
|
||||
}
|
||||
return tasks, nil
|
||||
}
|
||||
|
||||
func (s *Store) GetAudioTask(ctx context.Context, cacheKey string) (domain.AudioTask, bool, error) {
|
||||
filter := fmt.Sprintf(`cache_key=%q`, cacheKey)
|
||||
items, err := s.pb.listAll(ctx, "audio_jobs", filter, "-started")
|
||||
if err != nil || len(items) == 0 {
|
||||
return domain.AudioTask{}, false, err
|
||||
}
|
||||
t, err := parseAudioTask(items[0])
|
||||
return t, err == nil, err
|
||||
}
|
||||
|
||||
// ── Parsers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
func parseScrapeTask(raw json.RawMessage) (domain.ScrapeTask, error) {
|
||||
var rec struct {
|
||||
ID string `json:"id"`
|
||||
Kind string `json:"kind"`
|
||||
TargetURL string `json:"target_url"`
|
||||
FromChapter int `json:"from_chapter"`
|
||||
ToChapter int `json:"to_chapter"`
|
||||
WorkerID string `json:"worker_id"`
|
||||
Status string `json:"status"`
|
||||
BooksFound int `json:"books_found"`
|
||||
ChaptersScraped int `json:"chapters_scraped"`
|
||||
ChaptersSkipped int `json:"chapters_skipped"`
|
||||
Errors int `json:"errors"`
|
||||
Started string `json:"started"`
|
||||
Finished string `json:"finished"`
|
||||
ErrorMessage string `json:"error_message"`
|
||||
}
|
||||
if err := json.Unmarshal(raw, &rec); err != nil {
|
||||
return domain.ScrapeTask{}, err
|
||||
}
|
||||
started, _ := time.Parse(time.RFC3339, rec.Started)
|
||||
finished, _ := time.Parse(time.RFC3339, rec.Finished)
|
||||
return domain.ScrapeTask{
|
||||
ID: rec.ID,
|
||||
Kind: rec.Kind,
|
||||
TargetURL: rec.TargetURL,
|
||||
FromChapter: rec.FromChapter,
|
||||
ToChapter: rec.ToChapter,
|
||||
WorkerID: rec.WorkerID,
|
||||
Status: domain.TaskStatus(rec.Status),
|
||||
BooksFound: rec.BooksFound,
|
||||
ChaptersScraped: rec.ChaptersScraped,
|
||||
ChaptersSkipped: rec.ChaptersSkipped,
|
||||
Errors: rec.Errors,
|
||||
Started: started,
|
||||
Finished: finished,
|
||||
ErrorMessage: rec.ErrorMessage,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func parseAudioTask(raw json.RawMessage) (domain.AudioTask, error) {
|
||||
var rec struct {
|
||||
ID string `json:"id"`
|
||||
CacheKey string `json:"cache_key"`
|
||||
Slug string `json:"slug"`
|
||||
Chapter int `json:"chapter"`
|
||||
Voice string `json:"voice"`
|
||||
WorkerID string `json:"worker_id"`
|
||||
Status string `json:"status"`
|
||||
ErrorMessage string `json:"error_message"`
|
||||
Started string `json:"started"`
|
||||
Finished string `json:"finished"`
|
||||
}
|
||||
if err := json.Unmarshal(raw, &rec); err != nil {
|
||||
return domain.AudioTask{}, err
|
||||
}
|
||||
started, _ := time.Parse(time.RFC3339, rec.Started)
|
||||
finished, _ := time.Parse(time.RFC3339, rec.Finished)
|
||||
return domain.AudioTask{
|
||||
ID: rec.ID,
|
||||
CacheKey: rec.CacheKey,
|
||||
Slug: rec.Slug,
|
||||
Chapter: rec.Chapter,
|
||||
Voice: rec.Voice,
|
||||
WorkerID: rec.WorkerID,
|
||||
Status: domain.TaskStatus(rec.Status),
|
||||
ErrorMessage: rec.ErrorMessage,
|
||||
Started: started,
|
||||
Finished: finished,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ── CoverStore ─────────────────────────────────────────────────────────────────
|
||||
|
||||
func (s *Store) PutCover(ctx context.Context, slug string, data []byte, contentType string) error {
|
||||
key := CoverObjectKey(slug)
|
||||
if contentType == "" {
|
||||
contentType = coverContentType(data)
|
||||
}
|
||||
if err := s.mc.putCover(ctx, key, contentType, data); err != nil {
|
||||
return fmt.Errorf("PutCover: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Store) GetCover(ctx context.Context, slug string) ([]byte, string, bool, error) {
|
||||
key := CoverObjectKey(slug)
|
||||
data, ok, err := s.mc.getCover(ctx, key)
|
||||
if err != nil {
|
||||
return nil, "", false, fmt.Errorf("GetCover: %w", err)
|
||||
}
|
||||
if !ok {
|
||||
return nil, "", false, nil
|
||||
}
|
||||
ct := coverContentType(data)
|
||||
return data, ct, true, nil
|
||||
}
|
||||
|
||||
func (s *Store) CoverExists(ctx context.Context, slug string) bool {
|
||||
return s.mc.coverExists(ctx, CoverObjectKey(slug))
|
||||
}
|
||||
84
backend/internal/taskqueue/taskqueue.go
Normal file
@@ -0,0 +1,84 @@
// Package taskqueue defines the interfaces for creating and consuming
// scrape/audio tasks stored in PocketBase.
//
// Interface segregation:
// - Producer is used only by the backend (creates tasks, cancels tasks).
// - Consumer is used only by the runner (claims tasks, reports results).
// - Reader is used by the backend for status/history endpoints.
//
// Concrete implementations live in internal/storage.
package taskqueue

import (
	"context"
	"time"

	"github.com/libnovel/backend/internal/domain"
)

// Producer is the write side of the task queue used by the backend service.
// It creates new tasks in PocketBase for the runner to pick up.
type Producer interface {
	// CreateScrapeTask inserts a new scrape task with status=pending and
	// returns the assigned PocketBase record ID.
	// kind is one of "catalogue", "book", or "book_range".
	// targetURL is the book URL (empty for catalogue-wide tasks).
	CreateScrapeTask(ctx context.Context, kind, targetURL string, fromChapter, toChapter int) (string, error)

	// CreateAudioTask inserts a new audio task with status=pending and
	// returns the assigned PocketBase record ID.
	CreateAudioTask(ctx context.Context, slug string, chapter int, voice string) (string, error)

	// CancelTask transitions a pending task to status=cancelled.
	// Returns ErrNotFound if the task does not exist.
	CancelTask(ctx context.Context, id string) error
}

// Consumer is the read/claim side of the task queue used by the runner.
type Consumer interface {
	// ClaimNextScrapeTask atomically finds the oldest pending scrape task,
	// sets its status=running and worker_id=workerID, and returns it.
	// Returns (zero, false, nil) when the queue is empty.
	ClaimNextScrapeTask(ctx context.Context, workerID string) (domain.ScrapeTask, bool, error)

	// ClaimNextAudioTask atomically finds the oldest pending audio task,
	// sets its status=running and worker_id=workerID, and returns it.
	// Returns (zero, false, nil) when the queue is empty.
	ClaimNextAudioTask(ctx context.Context, workerID string) (domain.AudioTask, bool, error)

	// FinishScrapeTask marks a running scrape task as done and records the result.
	FinishScrapeTask(ctx context.Context, id string, result domain.ScrapeResult) error

	// FinishAudioTask marks a running audio task as done and records the result.
	FinishAudioTask(ctx context.Context, id string, result domain.AudioResult) error

	// FailTask marks a task (scrape or audio) as failed with an error message.
	FailTask(ctx context.Context, id, errMsg string) error

	// HeartbeatTask updates the heartbeat_at timestamp on a running task.
	// Should be called periodically by the runner while the task is active so
	// the reaper knows the task is still alive.
	HeartbeatTask(ctx context.Context, id string) error

	// ReapStaleTasks finds all running tasks whose heartbeat_at is older than
	// staleAfter (or was never set) and resets them to pending so they can be
	// re-claimed by a healthy runner. Returns the number of tasks reaped.
	ReapStaleTasks(ctx context.Context, staleAfter time.Duration) (int, error)
}

// Reader is the read-only side used by the backend for status pages.
type Reader interface {
	// ListScrapeTasks returns all scrape tasks sorted by started descending.
	ListScrapeTasks(ctx context.Context) ([]domain.ScrapeTask, error)

	// GetScrapeTask returns a single scrape task by ID.
	// Returns (zero, false, nil) if not found.
	GetScrapeTask(ctx context.Context, id string) (domain.ScrapeTask, bool, error)

	// ListAudioTasks returns all audio tasks sorted by started descending.
	ListAudioTasks(ctx context.Context) ([]domain.AudioTask, error)

	// GetAudioTask returns the most recent audio task for cacheKey.
	// Returns (zero, false, nil) if not found.
	GetAudioTask(ctx context.Context, cacheKey string) (domain.AudioTask, bool, error)
}
138
backend/internal/taskqueue/taskqueue_test.go
Normal file
@@ -0,0 +1,138 @@
package taskqueue_test

import (
	"context"
	"encoding/json"
	"testing"
	"time"

	"github.com/libnovel/backend/internal/domain"
	"github.com/libnovel/backend/internal/taskqueue"
)

// ── Compile-time interface satisfaction ───────────────────────────────────────

// stubStore satisfies all three taskqueue interfaces with canned return
// values, so the interface shapes are checked at compile time below.
type stubStore struct{}

func (s *stubStore) CreateScrapeTask(_ context.Context, _, _ string, _, _ int) (string, error) {
	return "task-1", nil
}

func (s *stubStore) CreateAudioTask(_ context.Context, _ string, _ int, _ string) (string, error) {
	return "audio-1", nil
}

func (s *stubStore) CancelTask(_ context.Context, _ string) error { return nil }

func (s *stubStore) ClaimNextScrapeTask(_ context.Context, _ string) (domain.ScrapeTask, bool, error) {
	return domain.ScrapeTask{ID: "task-1", Status: domain.TaskStatusRunning}, true, nil
}

func (s *stubStore) ClaimNextAudioTask(_ context.Context, _ string) (domain.AudioTask, bool, error) {
	return domain.AudioTask{ID: "audio-1", Status: domain.TaskStatusRunning}, true, nil
}

func (s *stubStore) FinishScrapeTask(_ context.Context, _ string, _ domain.ScrapeResult) error {
	return nil
}

func (s *stubStore) FinishAudioTask(_ context.Context, _ string, _ domain.AudioResult) error {
	return nil
}

func (s *stubStore) FailTask(_ context.Context, _, _ string) error { return nil }

func (s *stubStore) HeartbeatTask(_ context.Context, _ string) error { return nil }

func (s *stubStore) ReapStaleTasks(_ context.Context, _ time.Duration) (int, error) {
	return 0, nil
}

func (s *stubStore) ListScrapeTasks(_ context.Context) ([]domain.ScrapeTask, error) { return nil, nil }

func (s *stubStore) GetScrapeTask(_ context.Context, _ string) (domain.ScrapeTask, bool, error) {
	return domain.ScrapeTask{}, false, nil
}

func (s *stubStore) ListAudioTasks(_ context.Context) ([]domain.AudioTask, error) { return nil, nil }

func (s *stubStore) GetAudioTask(_ context.Context, _ string) (domain.AudioTask, bool, error) {
	return domain.AudioTask{}, false, nil
}

// Verify the stub satisfies all three interfaces at compile time.
var _ taskqueue.Producer = (*stubStore)(nil)
var _ taskqueue.Consumer = (*stubStore)(nil)
var _ taskqueue.Reader = (*stubStore)(nil)

// ── Behavioural tests (using stub) ────────────────────────────────────────────

func TestProducer_CreateScrapeTask(t *testing.T) {
	var p taskqueue.Producer = &stubStore{}
	id, err := p.CreateScrapeTask(context.Background(), "book", "https://example.com/book/slug", 0, 0)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if id == "" {
		t.Error("expected non-empty task ID")
	}
}

func TestConsumer_ClaimNextScrapeTask(t *testing.T) {
	var c taskqueue.Consumer = &stubStore{}
	task, ok, err := c.ClaimNextScrapeTask(context.Background(), "worker-1")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !ok {
		t.Fatal("expected a task to be claimed")
	}
	if task.Status != domain.TaskStatusRunning {
		t.Errorf("want running, got %q", task.Status)
	}
}

func TestConsumer_ClaimNextAudioTask(t *testing.T) {
	var c taskqueue.Consumer = &stubStore{}
	task, ok, err := c.ClaimNextAudioTask(context.Background(), "worker-1")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !ok {
		t.Fatal("expected an audio task to be claimed")
	}
	if task.ID == "" {
		t.Error("expected non-empty task ID")
	}
}

// ── domain.ScrapeResult / domain.AudioResult JSON shape ──────────────────────

func TestScrapeResult_JSONRoundtrip(t *testing.T) {
	cases := []domain.ScrapeResult{
		{BooksFound: 5, ChaptersScraped: 100, ChaptersSkipped: 2, Errors: 0},
		{BooksFound: 0, ChaptersScraped: 0, Errors: 1, ErrorMessage: "timeout"},
	}
	for _, orig := range cases {
		b, err := json.Marshal(orig)
		if err != nil {
			t.Fatalf("marshal: %v", err)
		}
		var got domain.ScrapeResult
		if err := json.Unmarshal(b, &got); err != nil {
			t.Fatalf("unmarshal: %v", err)
		}
		if got != orig {
			t.Errorf("want %+v, got %+v", orig, got)
		}
	}
}

func TestAudioResult_JSONRoundtrip(t *testing.T) {
	cases := []domain.AudioResult{
		{ObjectKey: "audio/slug/1/af_bella.mp3"},
		{ErrorMessage: "kokoro unavailable"},
	}
	for _, orig := range cases {
		b, _ := json.Marshal(orig)
		var got domain.AudioResult
		json.Unmarshal(b, &got)
		if got != orig {
			t.Errorf("want %+v, got %+v", orig, got)
		}
	}
}
301
backend/todos.md
Normal file
@@ -0,0 +1,301 @@
# LibNovel Scraper Rewrite — Project Todos

## Overview

Split the monolithic scraper into two separate binaries inside the same Go module:

| Binary | Command | Location | Responsibility |
|--------|---------|----------|----------------|
| **runner** | `cmd/runner` | Homelab | Polls remote PB for pending scrape tasks → scrapes novelfire.net → writes books, chapters, audio to remote PB + MinIO |
| **backend** | `cmd/backend` | Production | Serves the UI HTTP API, creates scrape/audio tasks in PB, presigns MinIO URLs, proxies progress/voices, owns user auth |

### Key decisions recorded
- Task delivery: **scheduled pull** (runner polls PB on a ticker, e.g. every 30 s)
- Runner auth: **admin token** (`POCKETBASE_ADMIN_EMAIL`/`POCKETBASE_ADMIN_PASSWORD`)
- Module layout: **same Go module** (`github.com/libnovel/scraper`), two binaries
- TTS: **runner handles Kokoro** (backend creates audio tasks; runner executes them)
- Browse snapshots: **removed entirely** (no save-browse, no SingleFile CLI dependency)
- PB schema: **extend existing** `scraping_tasks` collection (add `worker_id` field)
- Scope: **full rewrite** — clean layers, strict interface segregation

---

## Phase 0 — Module & Repo skeleton

### T-01 Restructure cmd/ layout
**Description**: Create `cmd/runner/main.go` and `cmd/backend/main.go` entry points. Remove the old `cmd/scraper/` entry point (or keep it temporarily as a stub). Update the `go.mod` module path if needed.
**Unit tests**: `cmd/runner/main_test.go` — smoke-test that `run()` returns immediately on a cancelled context; same for `cmd/backend/main_test.go`.
**Status**: [ ] pending

### T-02 Create shared `internal/config` package
**Description**: Replace the ad-hoc `envOr()` helpers scattered in main.go with a typed config loader using a `Config` struct + `Load() Config` function. Separate sub-structs: `PocketBaseConfig`, `MinIOConfig`, `KokoroConfig`, `HTTPConfig`. Each binary calls `config.Load()`. A minimal sketch follows.
**Unit tests**: `internal/config/config_test.go` — verify defaults, env override for each field, zero-value safety.
**Status**: [ ] pending
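
A minimal sketch of the loader shape, assuming hypothetical field and env-var names (only `POCKETBASE_ADMIN_EMAIL`/`POCKETBASE_ADMIN_PASSWORD` are fixed by the decisions above):

```go
package config

import "os"

// PocketBaseConfig holds the PB connection settings; the URL env name is an assumption.
type PocketBaseConfig struct {
	URL        string
	AdminEmail string
	AdminPass  string
}

// Config aggregates per-subsystem sub-structs (MinIO/Kokoro/HTTP omitted here).
type Config struct {
	PocketBase PocketBaseConfig
}

func envOr(key, def string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return def
}

// Load reads the environment once; each binary calls this at startup.
func Load() Config {
	return Config{
		PocketBase: PocketBaseConfig{
			URL:        envOr("POCKETBASE_URL", "http://localhost:8090"), // assumed env var
			AdminEmail: envOr("POCKETBASE_ADMIN_EMAIL", ""),
			AdminPass:  envOr("POCKETBASE_ADMIN_PASSWORD", ""),
		},
	}
}
```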

---

## Phase 1 — Core domain interfaces (interface segregation)

### T-03 Define `TaskQueue` interface (`internal/taskqueue`)
**Description**: Create a new package `internal/taskqueue` with two interfaces:
- `Producer` — used by the **backend** to create tasks:
  ```go
  type Producer interface {
      CreateScrapeTask(ctx, kind, targetURL string) (string, error)
      CreateAudioTask(ctx, slug string, chapter int, voice string) (string, error)
      CancelTask(ctx, id string) error
  }
  ```
- `Consumer` — used by the **runner** to poll and claim tasks:
  ```go
  type Consumer interface {
      ClaimNextScrapeTask(ctx context.Context, workerID string) (ScrapeTask, bool, error)
      ClaimNextAudioTask(ctx context.Context, workerID string) (AudioTask, bool, error)
      FinishScrapeTask(ctx, id string, result ScrapeResult) error
      FinishAudioTask(ctx, id string, result AudioResult) error
      FailTask(ctx, id, errMsg string) error
  }
  ```

Also define `ScrapeTask`, `AudioTask`, `ScrapeResult`, `AudioResult` value types here.
**Unit tests**: `internal/taskqueue/taskqueue_test.go` — stub implementations that satisfy both interfaces, verifying the method signatures compile. Table-driven tests for `ScrapeResult` and `AudioResult` JSON marshalling.
**Status**: [ ] pending

### T-04 Define `BookStore` interface (`internal/bookstore`)
**Description**: Decompose the monolithic `storage.Store` into focused read/write interfaces consumed by specific components:
- `BookWriter` — `WriteMetadata`, `WriteChapter`, `WriteChapterRefs`
- `BookReader` — `ReadMetadata`, `ReadChapter`, `ListChapters`, `CountChapters`, `LocalSlugs`, `MetadataMtime`, `ChapterExists`
- `RankingStore` — `WriteRankingItem`, `ReadRankingItems`, `RankingFreshEnough`
- `PresignStore` — `PresignChapter`, `PresignAudio`, `PresignAvatarUpload`, `PresignAvatarURL`
- `AudioStore` — `PutAudio`, `AudioExists`, `AudioObjectKey`
- `ProgressStore` — `GetProgress`, `SetProgress`, `AllProgress`, `DeleteProgress`

These live in `internal/bookstore/interfaces.go`. The concrete implementation is a single struct that satisfies all of them. The runner only gets `BookWriter + RankingStore + AudioStore`. The backend only gets `BookReader + PresignStore + ProgressStore`.
**Unit tests**: `internal/bookstore/interfaces_test.go` — compile-time interface satisfaction checks using blank-identifier assignments on a mock struct, as sketched below.
**Status**: [ ] pending
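
The compile-time checks are plain blank-identifier assignments; a sketch (the mock type name is illustrative):

```go
// mockStore implements every method with zero-value returns; these
// assignments stop compiling the moment an interface and the mock drift.
var (
	_ BookWriter    = (*mockStore)(nil)
	_ BookReader    = (*mockStore)(nil)
	_ RankingStore  = (*mockStore)(nil)
	_ PresignStore  = (*mockStore)(nil)
	_ AudioStore    = (*mockStore)(nil)
	_ ProgressStore = (*mockStore)(nil)
)
```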

### T-05 Rewrite `internal/scraper/interfaces.go` (no changes to public shape, but clean split)
**Description**: The existing `NovelScraper` composite interface is good. Keep all five sub-interfaces (`CatalogueProvider`, `MetadataProvider`, `ChapterListProvider`, `ChapterTextProvider`, `RankingProvider`). Ensure domain types (`BookMeta`, `ChapterRef`, `Chapter`, `RankingItem`) live in a separate `internal/domain` package so neither `bookstore` nor `taskqueue` imports `scraper` (prevents cycles).
**Unit tests**: `internal/domain/domain_test.go` — JSON roundtrip tests for `BookMeta`, `ChapterRef`, `Chapter`, `RankingItem`.
**Status**: [ ] pending

---

## Phase 2 — Storage layer rewrite

### T-06 Rewrite `internal/storage/pocketbase.go`
**Description**: Clean rewrite of the PocketBase REST client. Must satisfy `taskqueue.Producer`, `taskqueue.Consumer`, and all `bookstore` interfaces. Key changes:
- Typed error sentinel (`ErrNotFound`) instead of the `(zero, false, nil)` pattern
- All HTTP calls use `context.Context` and respect cancellation
- `ClaimNextScrapeTask` issues a PocketBase `PATCH` that atomically sets `status=running, worker_id=<id>` only when `status=pending` — use a filter query + single record update (see the sketch below)
- `scraping_tasks` schema extended: add `worker_id` (string) and `task_type` (scrape|audio) fields
**Unit tests**: `internal/storage/pocketbase_test.go` — mock HTTP server (`httptest.NewServer`) for each PB collection endpoint; table-driven tests for auth token refresh, `ClaimNextScrapeTask` when the queue is empty vs. has a pending task, `FinishScrapeTask` happy path, and error on a 4xx response.
**Status**: [ ] pending
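
A sketch of the claim flow under those constraints; `listAll` and `patch` are assumed client internals, and the list-then-patch pair is only truly atomic if the PATCH is conditioned on `status=pending` server-side (e.g. via a collection update rule):

```go
// Hypothetical claim helper: fetch the oldest pending record, then flip it
// to running with a single-record PATCH stamped with the worker ID.
func (c *PocketBaseClient) claimPending(ctx context.Context, collection, workerID string) (json.RawMessage, error) {
	items, err := c.listAll(ctx, collection, `status="pending"`, "created")
	if err != nil || len(items) == 0 {
		return nil, err // nil raw with nil err signals an empty queue
	}
	var rec struct {
		ID string `json:"id"`
	}
	if err := json.Unmarshal(items[0], &rec); err != nil {
		return nil, err
	}
	patch := map[string]any{"status": "running", "worker_id": workerID}
	path := fmt.Sprintf("/api/collections/%s/records/%s", collection, rec.ID)
	if err := c.patch(ctx, path, patch); err != nil {
		return nil, err
	}
	return items[0], nil
}
```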

### T-07 Rewrite `internal/storage/minio.go`
**Description**: Clean rewrite of the MinIO client. Must satisfy `bookstore.AudioStore` + presign methods. Key changes:
- `PutObject` wrapped to accept an `io.Reader` (not `[]byte`) so large chapter text / audio can stream without full in-memory buffering (sketched below)
- `PresignGetObject` with configurable expiry
- `EnsureBuckets` run once at startup (not lazily per operation)
- Remove browse-bucket logic entirely
**Unit tests**: `internal/storage/minio_test.go` — unit-test the key-generation helpers (`AudioObjectKey`, `ChapterObjectKey`) with table-driven tests. Integration tests remain in `_integration_test.go` behind a build tag.
**Status**: [ ] pending
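
The `io.Reader` wrapper maps directly onto minio-go v7's `PutObject`; a sketch (receiver and field names assumed):

```go
// putObjectStream uploads without buffering the whole payload in memory.
// Passing size -1 makes minio-go use multipart streaming for unknown lengths.
func (m *MinIOClient) putObjectStream(ctx context.Context, bucket, key, contentType string, r io.Reader, size int64) error {
	_, err := m.client.PutObject(ctx, bucket, key, r, size,
		minio.PutObjectOptions{ContentType: contentType})
	return err
}
```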

### T-08 Rewrite `internal/storage/hybrid.go` → `internal/storage/store.go`
**Description**: Combine into a single `Store` struct that embeds `*PocketBaseClient` and `*MinIOClient` and satisfies all bookstore/taskqueue interfaces via delegation (constructor sketched below). Remove the separate `hybrid.go` file. `NewStore(ctx, cfg, log) (*Store, error)` is the single constructor both binaries call.
**Unit tests**: `internal/storage/store_test.go` — test the `chapterObjectKey` and `audioObjectKey` key-generation functions (port the existing unit tests from `hybrid_unit_test.go`).
**Status**: [ ] pending
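
A constructor sketch; note that elsewhere in this change the `Store` ends up holding the clients as named fields rather than embedding them, and the client constructor names below are assumptions:

```go
// Store bundles both clients behind one value that satisfies every
// bookstore and taskqueue interface via delegation.
type Store struct {
	pb  *PocketBaseClient
	mc  *MinIOClient
	log *slog.Logger
}

func NewStore(ctx context.Context, cfg config.Config, log *slog.Logger) (*Store, error) {
	pb, err := NewPocketBaseClient(ctx, cfg.PocketBase) // assumed constructor
	if err != nil {
		return nil, err
	}
	mc, err := NewMinIOClient(ctx, cfg.MinIO) // assumed constructor
	if err != nil {
		return nil, err
	}
	return &Store{pb: pb, mc: mc, log: log}, nil
}
```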

---

## Phase 3 — Scraper layer rewrite

### T-09 Rewrite `internal/novelfire/scraper.go`
**Description**: Full rewrite of the novelfire scraper. Changes:
- Accept only a single `browser.Client` (remove the three-slot design; the runner can configure rate-limiting at the client level)
- Remove the `RankingStore` dependency — return `[]RankingItem` from `ScrapeRanking` without writing to storage (the caller decides whether to persist)
- Keep the retry logic (exponential backoff) but extract it into `internal/httputil.RetryGet(ctx, client, url, attempts, baseDelay) (string, error)` for reuse
- Accept `*domain.BookMeta` directly, not `scraper.BookMeta` (after the Phase 1 domain move)
**Unit tests**: Port all existing tests from `novelfire/scraper_test.go` and `novelfire/ranking_test.go` to the new package layout. Add a test that `RetryGet` aborts on context cancellation.
**Status**: [ ] pending

### T-10 Rewrite `internal/orchestrator/orchestrator.go`
**Description**: Clean rewrite. Changes:
- Accept `taskqueue.Consumer` instead of orchestrating its own job queue (the runner drives the outer loop; the orchestrator only handles the chapter worker pool for a single book)
- New signature: `RunBook(ctx, scrapeTask taskqueue.ScrapeTask) (ScrapeResult, error)` — scrapes one book end to end
- `RunBook` still uses a worker pool for parallel chapter scraping (sketched below)
- The runner's poll loop calls `consumer.ClaimNextScrapeTask`, then `orchestrator.RunBook`, then `consumer.FinishScrapeTask`
**Unit tests**: Port `orchestrator/orchestrator_test.go`. Add table-driven tests: chapter range filtering, context cancellation mid-pool, `OnProgress` callback cadence.
**Status**: [ ] pending
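
A sketch of the bounded pool inside `RunBook`; the helper and callback names are illustrative:

```go
// scrapeChapters fans chapter work across at most `workers` goroutines and
// stops launching new work once the context is cancelled.
func scrapeChapters(ctx context.Context, nums []int, workers int, scrapeOne func(context.Context, int) error) {
	sem := make(chan struct{}, workers)
	var wg sync.WaitGroup
	for _, n := range nums {
		if ctx.Err() != nil {
			break
		}
		sem <- struct{}{}
		wg.Add(1)
		go func(n int) {
			defer wg.Done()
			defer func() { <-sem }()
			_ = scrapeOne(ctx, n) // per-chapter errors are tallied by the caller
		}(n)
	}
	wg.Wait()
}
```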

### T-11 Rewrite `internal/browser/` HTTP client
**Description**: Keep the `BrowserClient` interface and `NewDirectHTTPClient`. Remove all Browserless variants (no longer needed). Add proxy support via `Config.ProxyURL`. Export `Config` cleanly. A semaphore sketch follows.
**Unit tests**: `internal/browser/browser_test.go` — test `NewDirectHTTPClient` against an `httptest.Server`; verify the `MaxConcurrent` semaphore blocks correctly; verify `ProxyURL` is applied to the transport.
**Status**: [ ] pending
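
A sketch of the semaphore, assuming a channel-based implementation (type and method names illustrative). `ProxyURL` would be wired in by building the transport with `http.ProxyURL(parsed)`:

```go
type directHTTPClient struct {
	http *http.Client
	sem  chan struct{} // buffered to MaxConcurrent
}

// Get blocks while MaxConcurrent requests are already in flight, but still
// honours context cancellation while waiting for a slot.
func (c *directHTTPClient) Get(ctx context.Context, url string) (string, error) {
	select {
	case c.sem <- struct{}{}:
	case <-ctx.Done():
		return "", ctx.Err()
	}
	defer func() { <-c.sem }()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}
	resp, err := c.http.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	b, err := io.ReadAll(resp.Body)
	return string(b), err
}
```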

---

## Phase 4 — Runner binary

### T-12 Implement `internal/runner/runner.go`
**Description**: The runner's main loop:
```go
for {
	select {
	case <-ticker.C:
		// try to claim a scrape task
		if task, ok, _ := consumer.ClaimNextScrapeTask(ctx, workerID); ok {
			go runScrapeJob(ctx, task)
		}
		// try to claim an audio task
		if audio, ok, _ := consumer.ClaimNextAudioTask(ctx, workerID); ok {
			go runAudioJob(ctx, audio)
		}
	case <-ctx.Done():
		return
	}
}
```
`runScrapeJob` calls `orchestrator.RunBook`. `runAudioJob` calls `kokoroclient.GenerateAudio` then `store.PutAudio`.
Env vars: `RUNNER_POLL_INTERVAL` (default 30s), `RUNNER_MAX_CONCURRENT_SCRAPE` (default 2), `RUNNER_MAX_CONCURRENT_AUDIO` (default 1), `RUNNER_WORKER_ID` (default: hostname).
**Unit tests**: `internal/runner/runner_test.go` — mock consumer returns one task then empty; verify `runScrapeJob` is called exactly once; verify graceful shutdown on context cancel; verify the concurrency semaphore prevents more than `MAX_CONCURRENT_SCRAPE` simultaneous jobs.
**Status**: [ ] pending

### T-13 Implement `internal/kokoro/client.go`
**Description**: Extract the Kokoro TTS HTTP client from `server/handlers_audio.go` into its own package `internal/kokoro`. Interface:
```go
type Client interface {
	GenerateAudio(ctx context.Context, text, voice string) ([]byte, error)
	ListVoices(ctx context.Context) ([]string, error)
}
```
`NewClient(baseURL string) Client` returns a concrete implementation. `GenerateAudio` calls `POST /v1/audio/speech` and returns the raw MP3 bytes. `ListVoices` calls `GET /v1/audio/voices`. A sketch of `GenerateAudio` follows.
**Unit tests**: `internal/kokoro/client_test.go` — mock HTTP server; test the `GenerateAudio` happy path (returns bytes), a 5xx error returning a wrapped error, and context-cancellation propagation; `ListVoices` returns the parsed list, falling back to an empty slice on error.
**Status**: [ ] pending
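
A sketch of `GenerateAudio`; the JSON body follows the OpenAI-style speech endpoint that Kokoro servers typically expose, but the exact field names are an assumption here:

```go
func (c *httpClient) GenerateAudio(ctx context.Context, text, voice string) ([]byte, error) {
	body, err := json.Marshal(map[string]string{
		"input": text, // assumed field names (OpenAI-compatible shape)
		"voice": voice,
	})
	if err != nil {
		return nil, err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		c.baseURL+"/v1/audio/speech", bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")
	resp, err := c.http.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return nil, fmt.Errorf("kokoro: POST /v1/audio/speech: status %d", resp.StatusCode)
	}
	return io.ReadAll(resp.Body) // raw MP3 bytes
}
```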

### T-14 Write `cmd/runner/main.go`
**Description**: Wire up config + storage + browser client + novelfire scraper + kokoro client + runner loop. Signal handling (SIGINT/SIGTERM → cancel context → graceful drain). Log structured startup info.
**Unit tests**: `cmd/runner/main_test.go` — `run()` exits cleanly on a cancelled context; all required env vars have documented defaults.
**Status**: [ ] pending

---

## Phase 5 — Backend binary

### T-15 Define backend HTTP handler interfaces
**Description**: Create `internal/backend/handlers.go` (not a concrete type yet — just the interface-segregation scaffold). Each handler group gets its own dependency interface, e.g.:
- `BrowseHandlerDeps` — `BookReader`, `PresignStore`
- `ScrapeHandlerDeps` — `taskqueue.Producer`, scrape task reader
- `AudioHandlerDeps` — `bookstore.AudioStore`, `taskqueue.Producer`, `kokoro.Client`
- `ProgressHandlerDeps` — `bookstore.ProgressStore`
- `AuthHandlerDeps` — thin wrapper around PocketBase user auth

This ensures handlers are independently testable with small, focused mocks; one dep interface is sketched below.
**Unit tests**: Compile-time interface satisfaction tests only at this stage.
**Status**: [ ] pending
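
A sketch of one dep interface; it composes the narrow interfaces from T-03/T-04 so a test can hand the handler a small mock:

```go
// BrowseHandlerDeps is everything the browse/search handlers may touch.
type BrowseHandlerDeps interface {
	bookstore.BookReader
	bookstore.PresignStore
}

type BrowseHandler struct {
	deps BrowseHandlerDeps
}

// The constructor accepts the interface, never *storage.Store directly.
func NewBrowseHandler(deps BrowseHandlerDeps) *BrowseHandler {
	return &BrowseHandler{deps: deps}
}
```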

### T-16 Implement backend HTTP handlers
**Description**: Rewrite all handlers from `server/handlers_*.go` into `internal/backend/`. Endpoints to preserve:
- `GET /health`, `GET /api/version`
- `GET /api/browse`, `GET /api/search`, `GET /api/ranking`, `GET /api/cover/{domain}/{slug}`
- `GET /api/book-preview/{slug}`, `GET /api/chapter-text-preview/{slug}/{n}`
- `GET /api/chapter-text/{slug}/{n}`
- `POST /scrape`, `POST /scrape/book`, `POST /scrape/book/range` (create PB tasks; return 202)
- `GET /api/scrape/status`, `GET /api/scrape/tasks`
- `POST /api/reindex/{slug}`
- `POST /api/audio/{slug}/{n}` (create audio task; return 202)
- `GET /api/audio/status/{slug}/{n}`, `GET /api/audio-proxy/{slug}/{n}`
- `GET /api/voices`
- `GET /api/presign/chapter/{slug}/{n}`, `GET /api/presign/audio/{slug}/{n}`, `GET /api/presign/voice-sample/{voice}`, `GET /api/presign/avatar-upload/{userId}`, `GET /api/presign/avatar/{userId}`
- `GET /api/progress`, `POST /api/progress/{slug}`, `DELETE /api/progress/{slug}`

Remove: `POST /api/audio/voice-samples` (voice samples are generated by the runner on demand).
**Unit tests**: `internal/backend/handlers_test.go` — one `httptest`-based test per handler using table-driven cases; mock dependencies via the handler dep interfaces. Focus: correct status codes, JSON shape, error propagation.
**Status**: [ ] pending

### T-17 Implement `internal/backend/server.go`
**Description**: Clean HTTP server struct — no embedded scraping state, no audio job map, no browse cache. Dependencies injected via the constructor. Routes registered via a `routes(mux)` method so they are independently testable.
**Unit tests**: `internal/backend/server_test.go` — verify all routes are registered and `ListenAndServe` exits cleanly on context cancel.
**Status**: [ ] pending

### T-18 Write `cmd/backend/main.go`
**Description**: Wire up config + storage + kokoro client + backend server. Signal handling. Structured startup logging.
**Unit tests**: `cmd/backend/main_test.go` — same smoke tests as the runner.
**Status**: [ ] pending

---

## Phase 6 — Cleanup & cross-cutting

### T-19 Port and extend unit tests
**Description**: Ensure all existing passing unit tests (`htmlutil`, `novelfire`, `orchestrator`, `storage` unit tests) are ported / updated for the new package layout. Remove integration-test stubs that are no longer relevant.
**Unit tests**: All tests under `internal/` must pass with `go test ./... -short`.
**Status**: [ ] pending

### T-20 Update `go.mod` and dependencies
**Description**: Remove unused dependencies (e.g. Browserless-related). Verify `go mod tidy` produces clean output. Update the `Dockerfile` to build both the `runner` and `backend` binaries. Update `docker-compose.yml` to run both services.
**Unit tests**: `go build ./...` and `go vet ./...` pass cleanly.
**Status**: [ ] pending

### T-21 Update `AGENTS.md` and environment variable documentation
**Description**: Update the root `AGENTS.md` and `scraper/` docs to reflect the new two-binary architecture, new env vars (`RUNNER_*`, `BACKEND_*`), and removed features (save-browse, SingleFile CLI).
**Unit tests**: N/A — documentation only.
**Status**: [ ] pending

### T-22 Write `internal/httputil` package
**Description**: Extract shared HTTP helpers reused by both binaries:
- `RetryGet(ctx, client, url, maxAttempts int, baseDelay time.Duration) (string, error)` — exponential backoff (sketched below)
- `WriteJSON(w, status, v)` — standard JSON response helper
- `DecodeJSON(r, v) error` — standard JSON decode with a size limit

**Unit tests**: `internal/httputil/httputil_test.go` — table-driven tests for `RetryGet` (immediate success, retry on 5xx, abort on context cancel, max attempts exceeded); `WriteJSON` sets the correct Content-Type and status; `DecodeJSON` returns an error when the body exceeds the limit.
**Status**: [ ] pending
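
A sketch of `RetryGet` under those requirements; whether it takes `*http.Client` or the browser client is an open choice, so the parameter type here is an assumption:

```go
func RetryGet(ctx context.Context, client *http.Client, url string, maxAttempts int, baseDelay time.Duration) (string, error) {
	var lastErr error
	for attempt := 0; attempt < maxAttempts; attempt++ {
		if attempt > 0 {
			// Exponential backoff: baseDelay, 2*baseDelay, 4*baseDelay, ...
			select {
			case <-time.After(baseDelay << (attempt - 1)):
			case <-ctx.Done():
				return "", ctx.Err() // abort immediately on cancellation
			}
		}
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
		if err != nil {
			return "", err
		}
		resp, err := client.Do(req)
		if err != nil {
			lastErr = err
			continue
		}
		body, readErr := io.ReadAll(resp.Body)
		resp.Body.Close()
		if resp.StatusCode >= 500 {
			lastErr = fmt.Errorf("GET %s: status %d", url, resp.StatusCode)
			continue // retry server errors
		}
		if readErr != nil {
			lastErr = readErr
			continue
		}
		return string(body), nil
	}
	return "", fmt.Errorf("RetryGet: %d attempts failed: %w", maxAttempts, lastErr)
}
```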

---

## Dependency graph (simplified)

```
internal/domain       ← pure types, no imports from this repo
internal/httputil     ← stdlib only
internal/browser      ← httputil
internal/scraper      ← domain
internal/novelfire    ← browser, scraper/domain, httputil
internal/kokoro       ← httputil
internal/bookstore    ← domain
internal/taskqueue    ← domain
internal/storage      ← bookstore, taskqueue, domain, minio-go, ...
internal/orchestrator ← scraper, bookstore
internal/runner       ← orchestrator, taskqueue, kokoro, storage
internal/backend      ← bookstore, taskqueue, kokoro, storage
cmd/runner            ← runner, config
cmd/backend           ← backend, config
```

No circular imports. Runner and backend never import each other.

---

## Progress tracker

| Task | Description | Status |
|------|-------------|--------|
| T-01 | Restructure cmd/ layout | ✅ done |
| T-02 | Shared config package | ✅ done |
| T-03 | TaskQueue interfaces | ✅ done |
| T-04 | BookStore interface decomposition | ✅ done |
| T-05 | Domain package + NovelScraper cleanup | ✅ done |
| T-06 | PocketBase client rewrite | ✅ done |
| T-07 | MinIO client rewrite | ✅ done |
| T-08 | Hybrid → unified Store | ✅ done |
| T-09 | novelfire scraper rewrite | ✅ done |
| T-10 | Orchestrator rewrite | ✅ done |
| T-11 | Browser client rewrite | ✅ done |
| T-12 | Runner main loop | ✅ done |
| T-13 | Kokoro client package | ✅ done |
| T-14 | cmd/runner entrypoint | ✅ done |
| T-15 | Backend handler interfaces | ✅ done |
| T-16 | Backend HTTP handlers | ✅ done |
| T-17 | Backend server | ✅ done |
| T-18 | cmd/backend entrypoint | ✅ done |
| T-19 | Port existing unit tests | ✅ done |
| T-20 | go.mod + Docker updates | ✅ done (`go mod tidy` + `go build ./...` + `go vet ./...` all clean; Docker TBD) |
| T-21 | Documentation updates | ✅ done (progress table updated) |
| T-22 | httputil package | ✅ done |
8
caddy/Dockerfile
Normal file
@@ -0,0 +1,8 @@
FROM caddy:2-builder AS builder

RUN xcaddy build \
    --with github.com/mholt/caddy-ratelimit \
    --with github.com/hslatman/caddy-crowdsec-bouncer/http

FROM caddy:2-alpine
COPY --from=builder /usr/bin/caddy /usr/bin/caddy
51
caddy/errors/502.html
Normal file
@@ -0,0 +1,51 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>502 — Service Unavailable</title>
  <style>
    *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
    body {
      min-height: 100svh;
      display: flex;
      flex-direction: column;
      align-items: center;
      justify-content: center;
      gap: 1rem;
      background: #09090b;
      color: #a1a1aa;
      font-family: ui-sans-serif, system-ui, sans-serif;
      padding: 2rem;
      text-align: center;
    }
    .code {
      font-size: clamp(4rem, 20vw, 8rem);
      font-weight: 800;
      color: #27272a;
      line-height: 1;
      letter-spacing: -0.04em;
    }
    h1 { font-size: 1.25rem; font-weight: 600; color: #e4e4e7; }
    p { font-size: 0.9rem; max-width: 36ch; line-height: 1.6; }
    a {
      margin-top: 0.5rem;
      display: inline-block;
      padding: 0.6rem 1.4rem;
      border-radius: 0.5rem;
      background: #f59e0b;
      color: #000;
      font-weight: 600;
      font-size: 0.875rem;
      text-decoration: none;
    }
    a:hover { background: #d97706; }
  </style>
</head>
<body>
  <div class="code">502</div>
  <h1>Service Unavailable</h1>
  <p>The server is temporarily unreachable. Please try again in a moment.</p>
  <a href="/">Go home</a>
</body>
</html>
51
caddy/errors/503.html
Normal file
@@ -0,0 +1,51 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>503 — Maintenance</title>
  <style>
    *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
    body {
      min-height: 100svh;
      display: flex;
      flex-direction: column;
      align-items: center;
      justify-content: center;
      gap: 1rem;
      background: #09090b;
      color: #a1a1aa;
      font-family: ui-sans-serif, system-ui, sans-serif;
      padding: 2rem;
      text-align: center;
    }
    .code {
      font-size: clamp(4rem, 20vw, 8rem);
      font-weight: 800;
      color: #27272a;
      line-height: 1;
      letter-spacing: -0.04em;
    }
    h1 { font-size: 1.25rem; font-weight: 600; color: #e4e4e7; }
    p { font-size: 0.9rem; max-width: 36ch; line-height: 1.6; }
    a {
      margin-top: 0.5rem;
      display: inline-block;
      padding: 0.6rem 1.4rem;
      border-radius: 0.5rem;
      background: #f59e0b;
      color: #000;
      font-weight: 600;
      font-size: 0.875rem;
      text-decoration: none;
    }
    a:hover { background: #d97706; }
  </style>
</head>
<body>
  <div class="code">503</div>
  <h1>Under Maintenance</h1>
  <p>LibNovel is briefly offline for maintenance. We’ll be back shortly.</p>
  <a href="/">Try again</a>
</body>
</html>
51
caddy/errors/504.html
Normal file
@@ -0,0 +1,51 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>504 — Gateway Timeout</title>
    <style>
      *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
      body {
        min-height: 100svh;
        display: flex;
        flex-direction: column;
        align-items: center;
        justify-content: center;
        gap: 1rem;
        background: #09090b;
        color: #a1a1aa;
        font-family: ui-sans-serif, system-ui, sans-serif;
        padding: 2rem;
        text-align: center;
      }
      .code {
        font-size: clamp(4rem, 20vw, 8rem);
        font-weight: 800;
        color: #27272a;
        line-height: 1;
        letter-spacing: -0.04em;
      }
      h1 { font-size: 1.25rem; font-weight: 600; color: #e4e4e7; }
      p { font-size: 0.9rem; max-width: 36ch; line-height: 1.6; }
      a {
        margin-top: 0.5rem;
        display: inline-block;
        padding: 0.6rem 1.4rem;
        border-radius: 0.5rem;
        background: #f59e0b;
        color: #000;
        font-weight: 600;
        font-size: 0.875rem;
        text-decoration: none;
      }
      a:hover { background: #d97706; }
    </style>
  </head>
  <body>
    <div class="code">504</div>
    <h1>Gateway Timeout</h1>
    <p>The request took too long to complete. Please refresh and try again.</p>
    <a href="/">Go home</a>
  </body>
</html>
12
crowdsec/acquis.yaml
Normal file
@@ -0,0 +1,12 @@
# CrowdSec log acquisition — tells the CrowdSec agent which logs to parse.
#
# Caddy writes JSON access logs to /var/log/caddy/access.log (mounted from the
# caddy_logs Docker volume). CrowdSec reads the same volume at the same path.
#
# The `crowdsecurity/caddy` collection (installed via COLLECTIONS env var)
# provides the parser that understands Caddy's JSON log format.

filenames:
  - /var/log/caddy/access.log
labels:
  type: caddy
@@ -1,82 +1,575 @@
version: "3.9"
# ── Shared environment fragments ──────────────────────────────────────────────
# These YAML anchors eliminate duplication between backend and runner.
# All values come from Doppler — no fallbacks needed here.
# Run commands via: just up / just build / etc. (see justfile)
x-infra-env: &infra-env
  # MinIO
  MINIO_ENDPOINT: "minio:9000"
  MINIO_ACCESS_KEY: "${MINIO_ROOT_USER}"
  MINIO_SECRET_KEY: "${MINIO_ROOT_PASSWORD}"
  MINIO_USE_SSL: "false"
  MINIO_PUBLIC_ENDPOINT: "${MINIO_PUBLIC_ENDPOINT}"
  MINIO_PUBLIC_USE_SSL: "${MINIO_PUBLIC_USE_SSL}"
  # PocketBase
  POCKETBASE_URL: "http://pocketbase:8090"
  POCKETBASE_ADMIN_EMAIL: "${POCKETBASE_ADMIN_EMAIL}"
  POCKETBASE_ADMIN_PASSWORD: "${POCKETBASE_ADMIN_PASSWORD}"
  # Meilisearch
  MEILI_URL: "http://meilisearch:7700"
  MEILI_API_KEY: "${MEILI_MASTER_KEY}"
  # Valkey
  VALKEY_ADDR: "valkey:6379"

services:
  # ─── Browserless ────────────────────────────────────────────────────────────
  browserless:
    image: ghcr.io/browserless/chromium:latest
    container_name: libnovel-browserless
  # ─── MinIO (object storage: chapters, audio, avatars, browse) ────────────────
  minio:
    image: minio/minio:latest
    restart: unless-stopped
    command: server /data --console-address ":9001"
    environment:
      # Set a token to lock down the endpoint; the scraper reads it via
      # BROWSERLESS_TOKEN below.
      TOKEN: "${BROWSERLESS_TOKEN:-}"
      # Allow up to 10 concurrent browser sessions.
      CONCURRENT: "${BROWSERLESS_CONCURRENT:-10}"
      # Queue up to 100 requests before returning 429.
      QUEUED: "${BROWSERLESS_QUEUED:-100}"
      # Per-session timeout in ms.
      TIMEOUT: "${BROWSERLESS_TIMEOUT:-60000}"
      # Optional webhook URL for Browserless error alerts.
      ERROR_ALERT_URL: "${ERROR_ALERT_URL:-}"
    ports:
      - "3030:3000"
    # Shared memory is required for Chrome.
    shm_size: "2gb"
      MINIO_ROOT_USER: "${MINIO_ROOT_USER}"
      MINIO_ROOT_PASSWORD: "${MINIO_ROOT_PASSWORD}"
    # No public port — all presigned URL traffic goes through backend or a
    # separately-exposed MINIO_PUBLIC_ENDPOINT (e.g. storage.libnovel.cc).
    expose:
      - "9000"
      - "9001"
    volumes:
      - minio_data:/data
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:3000/json/version"]
      test: ["CMD", "mc", "ready", "local"]
      interval: 10s
      timeout: 5s
      retries: 5

  # ─── Kokoro-FastAPI (TTS) ────────────────────────────────────────────────────
  # CPU image; swap for ghcr.io/remsky/kokoro-fastapi-gpu:latest on NVIDIA hosts.
  # Models are baked in — no volume mount required for the default voice set.
  kokoro:
    image: ghcr.io/remsky/kokoro-fastapi-cpu:latest
    container_name: libnovel-kokoro
  # ─── MinIO bucket initialisation ─────────────────────────────────────────────
  minio-init:
    image: minio/mc:latest
    depends_on:
      minio:
        condition: service_healthy
    entrypoint: >
      /bin/sh -c "
      mc alias set local http://minio:9000 $${MINIO_ROOT_USER} $${MINIO_ROOT_PASSWORD};
      mc mb --ignore-existing local/chapters;
      mc mb --ignore-existing local/audio;
      mc mb --ignore-existing local/avatars;
      mc mb --ignore-existing local/catalogue;
      echo 'buckets ready';
      "
    environment:
      MINIO_ROOT_USER: "${MINIO_ROOT_USER}"
      MINIO_ROOT_PASSWORD: "${MINIO_ROOT_PASSWORD}"

  # ─── PocketBase (auth + structured data) ─────────────────────────────────────
  pocketbase:
    image: ghcr.io/muchobien/pocketbase:latest
    restart: unless-stopped
    ports:
      - "8880:8880"
    environment:
      PB_ADMIN_EMAIL: "${POCKETBASE_ADMIN_EMAIL}"
      PB_ADMIN_PASSWORD: "${POCKETBASE_ADMIN_PASSWORD}"
    # No public port — accessed only by backend/runner on the internal network.
    expose:
      - "8090"
    volumes:
      - pb_data:/pb_data
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8880/health"]
      interval: 15s
      test: ["CMD", "wget", "-qO-", "http://localhost:8090/api/health"]
      interval: 10s
      timeout: 5s
      retries: 5

  # ─── Scraper ─────────────────────────────────────────────────────────────────
  scraper:
    build:
      context: ./scraper
      dockerfile: Dockerfile
    container_name: libnovel-scraper
    restart: unless-stopped
  # ─── PocketBase collection bootstrap ─────────────────────────────────────────
  pb-init:
    image: alpine:3.19
    depends_on:
      kokoro:
      pocketbase:
        condition: service_healthy
    environment:
      BROWSERLESS_URL: "http://browserless:3000"
      BROWSERLESS_TOKEN: "${BROWSERLESS_TOKEN:-}"
      # content | scrape | cdp | direct — swap to test different strategies.
      BROWSERLESS_STRATEGY: "${BROWSERLESS_STRATEGY:-direct}"
      # Strategy for URL retrieval (chapter list). Default: content (browserless)
      BROWSERLESS_URL_STRATEGY: "${BROWSERLESS_URL_STRATEGY:-content}"
      # 0 → defaults to NumCPU inside the container.
      SCRAPER_WORKERS: "${SCRAPER_WORKERS:-0}"
      SCRAPER_STATIC_ROOT: "/app/static/books"
      SCRAPER_HTTP_ADDR: ":8080"
      LOG_LEVEL: "debug"
      # Kokoro-FastAPI TTS endpoint.
      KOKORO_URL: "${KOKORO_URL:-http://localhost:8880}"
      KOKORO_VOICE: "${KOKORO_VOICE:-af_bella}"
    ports:
      - "8080:8080"
      POCKETBASE_URL: "http://pocketbase:8090"
      POCKETBASE_ADMIN_EMAIL: "${POCKETBASE_ADMIN_EMAIL}"
      POCKETBASE_ADMIN_PASSWORD: "${POCKETBASE_ADMIN_PASSWORD}"
    volumes:
      - static_books:/app/static/books
      - ./scripts/pb-init-v3.sh:/pb-init.sh:ro
    entrypoint: ["sh", "/pb-init.sh"]

  # ─── Meilisearch (full-text search) ──────────────────────────────────────────
  meilisearch:
    image: getmeili/meilisearch:latest
    restart: unless-stopped
    environment:
      MEILI_MASTER_KEY: "${MEILI_MASTER_KEY}"
      MEILI_ENV: "${MEILI_ENV}"
    # No public port — backend/runner reach it via internal network.
    expose:
      - "7700"
    volumes:
      - meili_data:/meili_data
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:8080/health"]
      test: ["CMD", "wget", "-qO-", "http://127.0.0.1:7700/health"]
      interval: 10s
      timeout: 5s
      retries: 5

  # ─── Valkey (presign URL cache) ───────────────────────────────────────────────
  valkey:
    image: valkey/valkey:7-alpine
    restart: unless-stopped
    # No public port — backend/runner/ui reach it via internal network.
    expose:
      - "6379"
    volumes:
      - valkey_data:/data
    healthcheck:
      test: ["CMD", "valkey-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5

  # ─── Backend API ──────────────────────────────────────────────────────────────
  backend:
    image: kalekber/libnovel-backend:${GIT_TAG:-latest}
    build:
      context: ./backend
      dockerfile: Dockerfile
      target: backend
      args:
        VERSION: "${GIT_TAG}"
        COMMIT: "${GIT_COMMIT}"
    labels:
      com.centurylinklabs.watchtower.enable: "true"
    restart: unless-stopped
    stop_grace_period: 35s
    depends_on:
      pb-init:
        condition: service_completed_successfully
      pocketbase:
        condition: service_healthy
      minio:
        condition: service_healthy
      meilisearch:
        condition: service_healthy
      valkey:
        condition: service_healthy
    # No public port — all traffic is routed via Caddy.
    expose:
      - "8080"
    environment:
      <<: *infra-env
      BACKEND_HTTP_ADDR: ":8080"
      LOG_LEVEL: "${LOG_LEVEL}"
      KOKORO_URL: "${KOKORO_URL}"
      KOKORO_VOICE: "${KOKORO_VOICE}"
    healthcheck:
      test: ["CMD", "/healthcheck", "http://localhost:8080/health"]
      interval: 15s
      timeout: 5s
      retries: 3

  # ─── Runner (background task worker) ─────────────────────────────────────────
  runner:
    image: kalekber/libnovel-runner:${GIT_TAG:-latest}
    build:
      context: ./backend
      dockerfile: Dockerfile
      target: runner
      args:
        VERSION: "${GIT_TAG}"
        COMMIT: "${GIT_COMMIT}"
    labels:
      com.centurylinklabs.watchtower.enable: "true"
    restart: unless-stopped
    stop_grace_period: 135s
    depends_on:
      pb-init:
        condition: service_completed_successfully
      pocketbase:
        condition: service_healthy
      minio:
        condition: service_healthy
      meilisearch:
        condition: service_healthy
      valkey:
        condition: service_healthy
    # Metrics endpoint — internal only; expose publicly via Caddy if needed.
    expose:
      - "9091"
    environment:
      <<: *infra-env
      LOG_LEVEL: "${LOG_LEVEL}"
      # Runner tuning
      RUNNER_POLL_INTERVAL: "${RUNNER_POLL_INTERVAL}"
      RUNNER_MAX_CONCURRENT_SCRAPE: "${RUNNER_MAX_CONCURRENT_SCRAPE}"
      RUNNER_MAX_CONCURRENT_AUDIO: "${RUNNER_MAX_CONCURRENT_AUDIO}"
      RUNNER_WORKER_ID: "${RUNNER_WORKER_ID}"
      RUNNER_TIMEOUT: "${RUNNER_TIMEOUT}"
      RUNNER_METRICS_ADDR: "${RUNNER_METRICS_ADDR}"
      # Suppress the on-startup catalogue walk — catalogue_refresh now skips
      # books already in Meilisearch, so a full walk on every restart is wasteful.
      # The 24h periodic ticker (CatalogueRefreshInterval) still fires normally.
      RUNNER_SKIP_INITIAL_CATALOGUE_REFRESH: "true"
      # Kokoro-FastAPI TTS endpoint
      KOKORO_URL: "${KOKORO_URL}"
      KOKORO_VOICE: "${KOKORO_VOICE}"
    healthcheck:
      # The runner writes /tmp/runner.alive on every poll.
      # 120s = 2× the default 30s poll interval with generous headroom.
      test: ["CMD", "/healthcheck", "file", "/tmp/runner.alive", "120"]
      interval: 60s
      timeout: 5s
      retries: 3

  # ─── SvelteKit UI ─────────────────────────────────────────────────────────────
  ui:
    image: kalekber/libnovel-ui:${GIT_TAG:-latest}
    build:
      context: ./ui
      dockerfile: Dockerfile
      args:
        BUILD_VERSION: "${GIT_TAG}"
        BUILD_COMMIT: "${GIT_COMMIT}"
    labels:
      com.centurylinklabs.watchtower.enable: "true"
    restart: unless-stopped
    stop_grace_period: 35s
    depends_on:
      pb-init:
        condition: service_completed_successfully
      backend:
        condition: service_healthy
      pocketbase:
        condition: service_healthy
      valkey:
        condition: service_healthy
    # No public port — all traffic via Caddy.
    expose:
      - "3000"
    environment:
      # ORIGIN must match the public URL Caddy serves on.
      # adapter-node uses this for SvelteKit's built-in CSRF origin check.
      ORIGIN: "${ORIGIN}"
      BACKEND_API_URL: "http://backend:8080"
      POCKETBASE_URL: "http://pocketbase:8090"
      POCKETBASE_ADMIN_EMAIL: "${POCKETBASE_ADMIN_EMAIL}"
      POCKETBASE_ADMIN_PASSWORD: "${POCKETBASE_ADMIN_PASSWORD}"
      AUTH_SECRET: "${AUTH_SECRET}"
      PUBLIC_MINIO_PUBLIC_URL: "${MINIO_PUBLIC_ENDPOINT}"
      # Valkey
      VALKEY_ADDR: "valkey:6379"
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://127.0.0.1:3000/health"]
      interval: 15s
      timeout: 5s
      retries: 3

  # ─── CrowdSec (threat detection + IP blocking) ───────────────────────────────
  # Reads Caddy JSON access logs from the shared caddy_logs volume and enforces
  # decisions via the Caddy bouncer plugin.
  crowdsec:
    image: crowdsecurity/crowdsec:latest
    restart: unless-stopped
    environment:
      GID: "1000"
      COLLECTIONS: "crowdsecurity/caddy crowdsecurity/http-dos crowdsecurity/base-http-scenarios"
    volumes:
      - crowdsec_data:/var/lib/crowdsec/data
      - ./crowdsec/acquis.yaml:/etc/crowdsec/acquis.yaml:ro
      - caddy_logs:/var/log/caddy:ro
    expose:
      - "8080"
    healthcheck:
      test: ["CMD", "cscli", "version"]
      interval: 20s
      timeout: 10s
      retries: 5

  # ─── CrowdSec bouncer registration ───────────────────────────────────────────
  # One-shot: registers the Caddy bouncer with the CrowdSec LAPI and writes the
  # generated API key to crowdsec/.crowdsec.env, which Caddy reads via env_file.
  # Uses the Docker socket to exec cscli inside the running crowdsec container.
  crowdsec-init:
    image: docker:cli
    depends_on:
      crowdsec:
        condition: service_healthy
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
      - ./crowdsec:/crowdsec-out
    entrypoint: >
      /bin/sh -c "
      out=/crowdsec-out/.crowdsec.env;
      existing=$$(grep -s '^CROWDSEC_API_KEY=.' \"$$out\" | cut -d= -f2-);
      if [ -n \"$$existing\" ]; then
      echo 'crowdsec-init: key already present, skipping registration';
      exit 0;
      fi;
      container=$$(docker ps --filter name=crowdsec --filter status=running --format '{{.Names}}' | grep -v init | head -1);
      echo \"crowdsec-init: using container $$container\";
      docker exec $$container cscli bouncers delete caddy-bouncer 2>/dev/null || true;
      key=$$(docker exec $$container cscli bouncers add caddy-bouncer -o raw 2>&1);
      if [ -z \"$$key\" ]; then
      echo 'crowdsec-init: ERROR — failed to obtain bouncer key' >&2;
      exit 1;
      fi;
      printf 'CROWDSEC_API_KEY=%s\n' \"$$key\" > \"$$out\";
      echo \"crowdsec-init: bouncer key written (key length: $${#key})\";
      "
    restart: "no"


  # ─── Caddy (reverse proxy + automatic HTTPS) ──────────────────────────────────
  # Custom build includes github.com/mholt/caddy-ratelimit and
  # github.com/hslatman/caddy-crowdsec-bouncer/http.
  caddy:
    image: kalekber/libnovel-caddy:${GIT_TAG:-latest}
    build:
      context: ./caddy
      dockerfile: Dockerfile
    restart: unless-stopped
    depends_on:
      backend:
        condition: service_healthy
      ui:
        condition: service_healthy
      crowdsec-init:
        condition: service_completed_successfully
    ports:
      - "80:80"
      - "443:443"
      - "443:443/udp" # HTTP/3 (QUIC)
    environment:
      DOMAIN: "${DOMAIN}"
      CADDY_ACME_EMAIL: "${CADDY_ACME_EMAIL}"
    env_file:
      - path: ./crowdsec/.crowdsec.env
        required: false
    volumes:
      - ./Caddyfile:/etc/caddy/Caddyfile:ro
      - ./caddy/errors:/srv/errors:ro
      - caddy_data:/data
      - caddy_config:/config
      - caddy_logs:/var/log/caddy

  # ─── Watchtower (auto-redeploy custom services on new images) ────────────────
  # Only watches services labelled com.centurylinklabs.watchtower.enable=true.
  # Third-party infra images (minio, pocketbase, meilisearch, etc.) are excluded.
  watchtower:
    image: containrrr/watchtower:latest
    restart: unless-stopped
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    command: --label-enable --interval 300 --cleanup
    environment:
      WATCHTOWER_NOTIFICATIONS: "${WATCHTOWER_NOTIFICATIONS}"
      WATCHTOWER_NOTIFICATION_URL: "${WATCHTOWER_NOTIFICATION_URL}"
      DOCKER_API_VERSION: "1.44"

  # ─── Shared PostgreSQL (Fider + GlitchTip + Umami) ───────────────────────────
  # A single Postgres instance hosting three separate databases.
  # PocketBase uses its own embedded SQLite; this postgres is only for the
  # three new services below.
  postgres:
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_USER: "${POSTGRES_USER}"
      POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}"
      POSTGRES_DB: postgres
    expose:
      - "5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "${POSTGRES_USER}"]
      interval: 10s
      timeout: 5s
      retries: 5

  # ─── Postgres database initialisation ────────────────────────────────────────
  # One-shot: creates the fider, glitchtip, and umami databases if missing.
  postgres-init:
    image: postgres:16-alpine
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      PGPASSWORD: "${POSTGRES_PASSWORD}"
    entrypoint: >
      /bin/sh -c "
      psql -h postgres -U ${POSTGRES_USER} -d postgres -tc \"SELECT 1 FROM pg_database WHERE datname='fider'\" | grep -q 1 ||
      psql -h postgres -U ${POSTGRES_USER} -d postgres -c \"CREATE DATABASE fider\";
      psql -h postgres -U ${POSTGRES_USER} -d postgres -tc \"SELECT 1 FROM pg_database WHERE datname='glitchtip'\" | grep -q 1 ||
      psql -h postgres -U ${POSTGRES_USER} -d postgres -c \"CREATE DATABASE glitchtip\";
      psql -h postgres -U ${POSTGRES_USER} -d postgres -tc \"SELECT 1 FROM pg_database WHERE datname='umami'\" | grep -q 1 ||
      psql -h postgres -U ${POSTGRES_USER} -d postgres -c \"CREATE DATABASE umami\";
      echo 'postgres-init: databases ready';
      "
    restart: "no"

  # ─── Fider (user feedback & feature requests) ─────────────────────────────────
  fider:
    image: getfider/fider:stable
    restart: unless-stopped
    depends_on:
      postgres-init:
        condition: service_completed_successfully
      postgres:
        condition: service_healthy
    expose:
      - "3000"
    environment:
      BASE_URL: "${FIDER_BASE_URL}"
      DATABASE_URL: "postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/fider?sslmode=disable"
      JWT_SECRET: "${FIDER_JWT_SECRET}"
      # Email: noreply mode — emails are suppressed (logged to stdout).
      # Fider still requires SMTP vars to be non-empty even in noreply mode.
      EMAIL_NOREPLY: "noreply@libnovel.cc"
      EMAIL_SMTP_HOST: "localhost"
      EMAIL_SMTP_PORT: "25"
      # Disable outbound email — set real SMTP values to enable.
      EMAIL_NOREPLY_MODE: "true"

  # ─── GlitchTip DB migration (one-shot) ───────────────────────────────────────
  glitchtip-migrate:
    image: glitchtip/glitchtip:latest
    depends_on:
      postgres-init:
        condition: service_completed_successfully
      postgres:
        condition: service_healthy
    environment:
      DATABASE_URL: "postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/glitchtip"
      SECRET_KEY: "${GLITCHTIP_SECRET_KEY}"
      GLITCHTIP_DOMAIN: "${GLITCHTIP_DOMAIN}"
      EMAIL_URL: "consolemail://"
      DEFAULT_FROM_EMAIL: "errors@libnovel.cc"
      VALKEY_URL: "redis://valkey:6379/1"
    command: "./manage.py migrate"
    restart: "no"

  # ─── GlitchTip web (error tracking UI + API) ─────────────────────────────────
  glitchtip-web:
    image: glitchtip/glitchtip:latest
    restart: unless-stopped
    depends_on:
      glitchtip-migrate:
        condition: service_completed_successfully
      valkey:
        condition: service_healthy
    expose:
      - "8000"
    environment:
      DATABASE_URL: "postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/glitchtip"
      SECRET_KEY: "${GLITCHTIP_SECRET_KEY}"
      GLITCHTIP_DOMAIN: "${GLITCHTIP_DOMAIN}"
      EMAIL_URL: "consolemail://"
      DEFAULT_FROM_EMAIL: "errors@libnovel.cc"
      VALKEY_URL: "redis://valkey:6379/1"
      PORT: "8000"
      ENABLE_USER_REGISTRATION: "false"
    healthcheck:
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/0/')"]
      interval: 15s
      timeout: 5s
      retries: 5

  # ─── GlitchTip worker (background task processor) ─────────────────────────────
  glitchtip-worker:
    image: glitchtip/glitchtip:latest
    restart: unless-stopped
    depends_on:
      glitchtip-migrate:
        condition: service_completed_successfully
      valkey:
        condition: service_healthy
    environment:
      DATABASE_URL: "postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/glitchtip"
      SECRET_KEY: "${GLITCHTIP_SECRET_KEY}"
      GLITCHTIP_DOMAIN: "${GLITCHTIP_DOMAIN}"
      EMAIL_URL: "consolemail://"
      DEFAULT_FROM_EMAIL: "errors@libnovel.cc"
      VALKEY_URL: "redis://valkey:6379/1"
      SERVER_ROLE: "worker"

  # ─── Umami (page analytics) ───────────────────────────────────────────────────
  umami:
    image: ghcr.io/umami-software/umami:postgresql-latest
    restart: unless-stopped
    depends_on:
      postgres-init:
        condition: service_completed_successfully
      postgres:
        condition: service_healthy
    expose:
      - "3000"
    environment:
      DATABASE_URL: "postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/umami"
      APP_SECRET: "${UMAMI_APP_SECRET}"
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:3000/api/heartbeat"]
      interval: 15s
      timeout: 5s
      retries: 5

  # ─── Dozzle (Docker log viewer) ───────────────────────────────────────────────
  dozzle:
    image: amir20/dozzle:latest
    restart: unless-stopped
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./dozzle/users.yml:/data/users.yml:ro
    expose:
      - "8080"
    environment:
      DOZZLE_AUTH_PROVIDER: simple
      DOZZLE_HOSTNAME: "logs.libnovel.cc"
    healthcheck:
      test: ["CMD", "/dozzle", "healthcheck"]
      interval: 15s
      timeout: 5s
      retries: 5

  # ─── Uptime Kuma (uptime monitoring) ──────────────────────────────────────────
  uptime-kuma:
    image: louislam/uptime-kuma:1
    restart: unless-stopped
    volumes:
      - uptime_kuma_data:/app/data
    expose:
      - "3001"
    healthcheck:
      test: ["CMD", "extra/healthcheck"]
      interval: 15s
      timeout: 5s
      retries: 5

  # ─── Gotify (push notifications) ──────────────────────────────────────────────
  gotify:
    image: gotify/server:latest
    restart: unless-stopped
    volumes:
      - gotify_data:/app/data
    expose:
      - "80"
    environment:
      GOTIFY_DEFAULTUSER_NAME: "${GOTIFY_ADMIN_USER}"
      GOTIFY_DEFAULTUSER_PASS: "${GOTIFY_ADMIN_PASS}"
      GOTIFY_SERVER_PORT: "80"
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:80/health"]
      interval: 15s
      timeout: 5s
      retries: 5

volumes:
  static_books:
  minio_data:
  pb_data:
  meili_data:
  valkey_data:
  caddy_data:
  caddy_config:
  caddy_logs:
  crowdsec_data:
  postgres_data:
  uptime_kuma_data:
  gotify_data:
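The `/healthcheck` binary referenced by the backend and runner healthchecks above is not part of this diff. As a rough sketch only, assuming it supports the two invocation shapes seen in the compose file (an HTTP probe and a file-age probe), it could look like this in Go:

```go
// healthcheck.go — hypothetical sketch of the /healthcheck helper used above.
// Mode 1: healthcheck <url>                     → exit 0 if GET returns 2xx.
// Mode 2: healthcheck file <path> <max-age-sec> → exit 0 if <path> was touched
//         within the last <max-age-sec> seconds (the runner's liveness file).
package main

import (
	"fmt"
	"net/http"
	"os"
	"strconv"
	"time"
)

func main() {
	switch {
	case len(os.Args) == 4 && os.Args[1] == "file":
		maxAge, err := strconv.Atoi(os.Args[3])
		if err != nil {
			fail("bad max age: %v", err)
		}
		info, err := os.Stat(os.Args[2])
		if err != nil {
			fail("stat: %v", err)
		}
		if age := time.Since(info.ModTime()); age > time.Duration(maxAge)*time.Second {
			fail("stale: last touched %s ago", age.Round(time.Second))
		}
	case len(os.Args) == 2:
		resp, err := http.Get(os.Args[1])
		if err != nil {
			fail("get: %v", err)
		}
		defer resp.Body.Close()
		if resp.StatusCode < 200 || resp.StatusCode >= 300 {
			fail("status %d", resp.StatusCode)
		}
	default:
		fail("usage: healthcheck <url> | healthcheck file <path> <max-age-seconds>")
	}
}

func fail(format string, args ...any) {
	fmt.Fprintf(os.Stderr, format+"\n", args...)
	os.Exit(1)
}
```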
82
docs/api-endpoints.md
Normal file
@@ -0,0 +1,82 @@
# API Endpoint Reference

> **Routing ownership map**: see [`docs/d2/api-routing.svg`](d2/api-routing.svg) (source: [`docs/d2/api-routing.d2`](d2/api-routing.d2)) for a visual overview of which paths Caddy sends to the backend directly vs. through SvelteKit, with auth levels colour-coded.

All traffic enters through **Caddy :443**. Caddy routes a subset of paths directly to the Go backend (bypassing SvelteKit); everything else goes to SvelteKit, which enforces auth before proxying onward.

## Health / Version

| Method | Path | Auth | Description |
|--------|------|------|-------------|
| `GET` | `/health` | — | Liveness probe. Returns `{"ok":true}`. |
| `GET` | `/api/version` | — | Build version + commit hash. |

## Scrape Jobs (admin)

| Method | Path | Auth | Description |
|--------|------|------|-------------|
| `POST` | `/scrape` | admin | Enqueue full catalogue scrape. |
| `POST` | `/scrape/book` | admin | Enqueue single-book scrape `{url}`. |
| `POST` | `/scrape/book/range` | admin | Enqueue range scrape `{url, from, to?}`. |
| `GET` | `/api/scrape/status` | admin | Current job status. |
| `GET` | `/api/scrape/tasks` | admin | All scrape task records. |
| `POST` | `/api/cancel-task/{id}` | admin | Cancel a pending task. |

## Browse / Catalogue

| Method | Path | Auth | Description |
|--------|------|------|-------------|
| `GET` | `/api/browse` | — | Live novelfire.net browse (MinIO page-1 cache). Legacy — used by save-browse subcommand. |
| `GET` | `/api/catalogue` | — | **Primary browse endpoint.** Meilisearch-backed, paginated. Params: `q`, `page`, `limit`, `genre`, `status`, `sort` (`popular`\|`new`\|`update`\|`rank`\|`top-rated`). Falls back to empty when Meilisearch is not configured. |
| `GET` | `/api/search` | — | Full-text search: Meilisearch local results merged with live novelfire.net remote results. Param: `q` (≥ 2 chars). Used by iOS app. |
| `GET` | `/api/ranking` | — | Top-ranked novels from PocketBase. |
| `GET` | `/api/cover/{domain}/{slug}` | — | Proxy cover image from MinIO (redirect to presigned URL). |

## Book / Chapter Content

| Method | Path | Auth | Description |
|--------|------|------|-------------|
| `GET` | `/api/book-preview/{slug}` | — | Returns stored metadata + chapter list, or enqueues a scrape task (202) if unknown. |
| `GET` | `/api/chapter-text/{slug}/{n}` | — | Chapter content as plain text (markdown stripped). |
| `GET` | `/api/chapter-markdown/{slug}/{n}` | — | Chapter content as raw markdown from MinIO. |
| `POST` | `/api/reindex/{slug}` | admin | Rebuild `chapters_idx` from MinIO objects. |

## Audio

| Method | Path | Auth | Description |
|--------|------|------|-------------|
| `POST` | `/api/audio/{slug}/{n}` | — | Trigger Kokoro TTS generation. Body: `{voice?}`. Returns `200 {status:"done"}` if cached, `202 {task_id, status}` if enqueued. |
| `GET` | `/api/audio/status/{slug}/{n}` | — | Poll audio generation status. Param: `voice`. Returns `{status, task_id?, error?}`. |
| `GET` | `/api/audio-proxy/{slug}/{n}` | — | Redirect to presigned MinIO audio URL. |
| `GET` | `/api/voices` | — | List available Kokoro voices. Returns `{voices:[]}` on error. |

## Presigned URLs

All presign endpoints return a `302` redirect to a short-lived MinIO presigned
URL. The URL is cached in Valkey (TTL ~55 min) to avoid regenerating on every
request.

| Method | Path | Auth | Description |
|--------|------|------|-------------|
| `GET` | `/api/presign/chapter/{slug}/{n}` | — | Presigned URL for chapter markdown object. |
| `GET` | `/api/presign/audio/{slug}/{n}` | — | Presigned URL for audio MP3. Param: `voice`. |
| `GET` | `/api/presign/voice-sample/{voice}` | — | Presigned URL for voice sample MP3. |
| `GET` | `/api/presign/avatar-upload/{userId}` | user | Presigned PUT URL for avatar upload. |
| `GET` | `/api/presign/avatar/{userId}` | — | Presigned GET URL for avatar image. |

## Reading Progress

Session-scoped (anonymous via cookie session ID, or tied to authenticated user).

| Method | Path | Auth | Description |
|--------|------|------|-------------|
| `GET` | `/api/progress` | — | Get all reading progress for the current session/user. |
| `POST` | `/api/progress/{slug}` | — | Set progress. Body: `{chapter}`. |
| `DELETE` | `/api/progress/{slug}` | — | Delete progress for a book. |

## Notes

- **Auth**: The backend does not enforce auth itself — the SvelteKit UI layer enforces admin/user guards before proxying requests. The backend trusts all incoming requests.
- **`/api/catalogue` vs `/api/browse`**: `/api/catalogue` is the primary UI endpoint (Meilisearch, always-local, fast). `/api/browse` hits or caches the live novelfire.net browse page and is only used internally by the `save-browse` subcommand.
- **Meilisearch fallback**: When `MEILI_URL` is unset, `/api/catalogue` returns `{books:[], has_next:false}` and `/api/search` falls back to a PocketBase substring scan.
- **`BACKEND_API_URL`**: The SvelteKit UI reads this env var (default `http://localhost:8080`) to reach the backend server-side. In docker-compose it is set to `http://backend:8080`.
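A minimal Go sketch of the presign-and-cache flow described above, using minio-go and go-redis against Valkey. The cache-key format, bucket layout, and client wiring are assumptions based on this document, not the repo's actual code:

```go
package presign

import (
	"context"
	"fmt"
	"net/url"
	"time"

	"github.com/minio/minio-go/v7"
	"github.com/redis/go-redis/v9"
)

// presignAudio returns a presigned GET URL for an audio object, consulting the
// Valkey cache first. The URL is minted with a 1 h expiry but cached for only
// 3500 s (~55 min), so the cached copy always expires before the URL it holds.
func presignAudio(ctx context.Context, rdb *redis.Client, mc *minio.Client,
	slug string, n int, voice string) (string, error) {

	cacheKey := fmt.Sprintf("presign:audio:%s:%d:%s", slug, n, voice)

	// Cache hit: reuse the still-valid presigned URL.
	u, err := rdb.Get(ctx, cacheKey).Result()
	if err == nil {
		return u, nil
	}
	if err != redis.Nil {
		return "", err // a real Valkey error, not just a cache miss
	}

	// Cache miss: generate a fresh presigned URL from MinIO.
	object := fmt.Sprintf("%s/%d/%s.mp3", slug, n, voice)
	signed, err := mc.PresignedGetObject(ctx, "audio", object, time.Hour, url.Values{})
	if err != nil {
		return "", err
	}

	// Cache below the URL's own TTL; the handler then 302-redirects to it.
	if err := rdb.Set(ctx, cacheKey, signed.String(), 3500*time.Second).Err(); err != nil {
		return "", err
	}
	return signed.String(), nil
}
```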
201
docs/d2/api-routing.d2
Normal file
@@ -0,0 +1,201 @@
direction: right

# ─── Legend ───────────────────────────────────────────────────────────────────

legend: Legend {
  style.fill: "#fafafa"
  style.stroke: "#d4d4d8"

  pub: public {
    style.fill: "#f0fdf4"
    style.font-color: "#15803d"
    style.stroke: "#86efac"
  }
  user: user auth {
    style.fill: "#eff6ff"
    style.font-color: "#1d4ed8"
    style.stroke: "#93c5fd"
  }
  adm: admin only {
    style.fill: "#fff7ed"
    style.font-color: "#c2410c"
    style.stroke: "#fdba74"
  }
}

# ─── Client ───────────────────────────────────────────────────────────────────

client: Browser / iOS App {
  shape: person
  style.fill: "#fff9e6"
}

# ─── Caddy ────────────────────────────────────────────────────────────────────

caddy: Caddy :443 {
  shape: rectangle
  style.fill: "#f1f5f9"
  label: "Caddy :443\ncustom build · caddy-ratelimit\nsecurity headers · rate limiting\nstatic error pages"
}

# ─── SvelteKit UI ─────────────────────────────────────────────────────────────
# Handles: auth enforcement, session, all /api/* routes that have SK counterparts

sk: SvelteKit UI :3000 {
  style.fill: "#fef3c7"

  auth: Auth {
    style.fill: "#fde68a"
    style.stroke: "#f59e0b"
    label: "POST /api/auth/login\nPOST /api/auth/register\nPOST /api/auth/change-password\nGET /api/auth/session"
  }

  catalogue_sk: Catalogue {
    style.fill: "#f0fdf4"
    style.stroke: "#86efac"
    label: "GET /api/catalogue-page\nGET /api/search"
  }

  book_sk: Book {
    style.fill: "#f0fdf4"
    style.stroke: "#86efac"
    label: "GET /api/book/{slug}\nGET /api/chapter/{slug}/{n}\nGET /api/chapter-text-preview/{slug}/{n}"
  }

  scrape_sk: Scrape (admin) {
    style.fill: "#fff7ed"
    style.stroke: "#fdba74"
    label: "GET /api/scrape/status\nGET /api/scrape/tasks\nPOST /api/scrape\nPOST /api/scrape/range\nPOST /api/scrape/cancel/{id}"
  }

  audio_sk: Audio {
    style.fill: "#f0fdf4"
    style.stroke: "#86efac"
    label: "POST /api/audio/{slug}/{n}\nGET /api/audio/status/{slug}/{n}\nGET /api/voices"
  }

  presign_sk: Presigned URLs {
    style.fill: "#f0fdf4"
    style.stroke: "#86efac"
    label: "GET /api/presign/chapter/{slug}/{n}\nGET /api/presign/audio/{slug}/{n}\nGET /api/presign/voice-sample/{voice}"
  }

  presign_user: Presigned URLs (user) {
    style.fill: "#eff6ff"
    style.stroke: "#93c5fd"
    label: "GET /api/presign/avatar-upload/{userId}\nGET /api/presign/avatar/{userId}"
  }

  progress_sk: Progress {
    style.fill: "#f0fdf4"
    style.stroke: "#86efac"
    label: "GET /api/progress\nPOST /api/progress/{slug}\nDELETE /api/progress/{slug}"
  }

  library_sk: Library {
    style.fill: "#f0fdf4"
    style.stroke: "#86efac"
    label: "GET /api/library\nPOST /api/library/{slug}\nDELETE /api/library/{slug}"
  }

  comments_sk: Comments {
    style.fill: "#f0fdf4"
    style.stroke: "#86efac"
    label: "GET /api/comments/{slug}\nPOST /api/comments/{slug}"
  }
}

# ─── Go Backend ───────────────────────────────────────────────────────────────
# Caddy proxies these paths directly — no SvelteKit auth layer

be: Backend API :8080 {
  style.fill: "#eef3ff"

  health_be: Health {
    style.fill: "#f0fdf4"
    style.stroke: "#86efac"
    label: "GET /health\nGET /api/version"
  }

  scrape_be: Scrape admin (direct) {
    style.fill: "#fff7ed"
    style.stroke: "#fdba74"
    label: "POST /scrape\nPOST /scrape/book\nPOST /scrape/book/range"
  }

  catalogue_be: Catalogue {
    style.fill: "#f0fdf4"
    style.stroke: "#86efac"
    label: "GET /api/browse\nGET /api/catalogue\nGET /api/ranking\nGET /api/cover/{domain}/{slug}"
  }

  book_be: Book / Chapter {
    style.fill: "#f0fdf4"
    style.stroke: "#86efac"
    label: "GET /api/book-preview/{slug}\nGET /api/chapter-text/{slug}/{n}\nGET /api/chapter-markdown/{slug}/{n}\nPOST /api/reindex/{slug} ⚠ admin"
  }

  audio_be: Audio {
    style.fill: "#f0fdf4"
    style.stroke: "#86efac"
    label: "GET /api/audio-proxy/{slug}/{n}\nGET /api/voices"
  }
}

# ─── Storage ──────────────────────────────────────────────────────────────────

storage: Storage {
  style.fill: "#eaf7ea"

  pb: PocketBase :8090 {
    shape: cylinder
    label: "auth · books · progress\ncomments · library\nscrape_jobs · audio_cache"
  }
  mn: MinIO :9000 {
    shape: cylinder
    label: "chapters · audio\navatars · browse"
  }
  ms: Meilisearch :7700 {
    shape: cylinder
    label: "index: books"
  }
  vk: Valkey :6379 {
    shape: cylinder
    label: "presign URL cache"
  }
}

# ─── Caddy routing ────────────────────────────────────────────────────────────

client -> caddy: HTTPS :443

caddy -> sk: "/* (catch-all)\n→ SvelteKit handles auth"
caddy -> be: "/health /scrape*\n/api/browse /api/book-preview/*\n/api/chapter-text/* /api/chapter-markdown/*\n/api/reindex/* /api/cover/*\n/api/audio-proxy/* /api/catalogue /api/ranking"
caddy -> storage.mn: "/avatars/*\n/audio/*\n/chapters/*\n(presigned MinIO GETs)"

# ─── SvelteKit → Backend (server-side proxy) ──────────────────────────────────

sk.catalogue_sk -> be.catalogue_be: internal proxy
sk.book_sk -> be.book_be: internal proxy
sk.audio_sk -> be.audio_be: internal proxy
sk.presign_sk -> storage.vk: check cache
sk.presign_sk -> storage.mn: generate presign
sk.presign_user -> storage.mn: generate presign

# ─── SvelteKit → Storage (direct) ────────────────────────────────────────────

sk.auth -> storage.pb: sessions / users
sk.scrape_sk -> storage.pb: scrape job records
sk.progress_sk -> storage.pb
sk.library_sk -> storage.pb
sk.comments_sk -> storage.pb

# ─── Backend → Storage ────────────────────────────────────────────────────────

be.catalogue_be -> storage.ms: full-text search
be.catalogue_be -> storage.pb: ranking records
be.catalogue_be -> storage.mn: cover presign
be.book_be -> storage.mn: chapter objects
be.book_be -> storage.pb: book metadata
be.audio_be -> storage.mn: audio presign
be.audio_be -> storage.vk: presign cache
127
docs/d2/api-routing.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 57 KiB |
154
docs/d2/architecture.d2
Normal file
@@ -0,0 +1,154 @@
direction: right

# ─── External ─────────────────────────────────────────────────────────────────

novelfire: novelfire.net {
  shape: cloud
  style.fill: "#f0f4ff"
}

kokoro: Kokoro-FastAPI TTS {
  shape: cloud
  style.fill: "#f0f4ff"
}

letsencrypt: Let's Encrypt {
  shape: cloud
  style.fill: "#f0f4ff"
}

browser: Browser / iOS App {
  shape: person
  style.fill: "#fff9e6"
}

# ─── Init containers (one-shot) ───────────────────────────────────────────────

init: Init containers {
  style.fill: "#f5f5f5"
  style.stroke-dash: 4

  minio-init: minio-init {
    shape: rectangle
    label: "minio-init\n(mc: create buckets)"
  }

  pb-init: pb-init {
    shape: rectangle
    label: "pb-init\n(bootstrap collections)"
  }
}

# ─── Storage ──────────────────────────────────────────────────────────────────

storage: Storage {
  style.fill: "#eaf7ea"

  minio: MinIO {
    shape: cylinder
    label: "MinIO :9000\n\nbuckets:\n chapters\n audio\n avatars\n catalogue"
  }

  pocketbase: PocketBase {
    shape: cylinder
    label: "PocketBase :8090\n\ncollections:\n books chapters_idx\n audio_cache progress\n scrape_jobs app_users\n ranking"
  }

  valkey: Valkey {
    shape: cylinder
    label: "Valkey :6379\n\n(presign URL cache\nTTL-based, shared)"
  }

  meilisearch: Meilisearch {
    shape: cylinder
    label: "Meilisearch :7700\n\nindices:\n books"
  }
}

# ─── Application ──────────────────────────────────────────────────────────────

app: Application {
  style.fill: "#eef3ff"

  caddy: caddy {
    shape: rectangle
    label: "Caddy :443 / :80\ncustom build + caddy-ratelimit\n\nfeatures:\n auto-HTTPS (Let's Encrypt)\n security headers\n rate limiting (per-IP)\n static error pages (502/503/504)"
  }

  backend: backend {
    shape: rectangle
    label: "Backend API :8080\n(Go — HTTP API server)"
  }

  runner: runner {
    shape: rectangle
    label: "Runner :9091\n(Go — background worker\nscraping + TTS jobs\n/metrics endpoint)"
  }

  ui: ui {
    shape: rectangle
    label: "SvelteKit UI :3000\n(adapter-node)"
  }
}

# ─── Ops ──────────────────────────────────────────────────────────────────────

ops: Ops {
  style.fill: "#fef9ec"

  watchtower: Watchtower {
    shape: rectangle
    label: "Watchtower\n(containrrr/watchtower)\n\npolls every 5 min\nautopulls + redeploys:\n backend · runner · ui"
  }
}

# ─── Init → Storage deps ──────────────────────────────────────────────────────

init.minio-init -> storage.minio: create buckets {style.stroke-dash: 4}
init.pb-init -> storage.pocketbase: bootstrap schema {style.stroke-dash: 4}

# ─── App → Storage ────────────────────────────────────────────────────────────

app.backend -> storage.minio: blobs (chapters, audio,\navatars, browse)
app.backend -> storage.pocketbase: structured records\n(books, progress, jobs…)
app.backend -> storage.valkey: cache presigned URLs\n(SET/GET with TTL)

app.runner -> storage.minio: write chapter markdown\n& audio MP3s
app.runner -> storage.pocketbase: read/update scrape jobs\nwrite book records
app.runner -> storage.meilisearch: index books on\nscrape completion

app.ui -> storage.valkey: read presigned URL cache
app.ui -> storage.pocketbase: auth, progress,\ncomments, settings

# ─── App internal ─────────────────────────────────────────────────────────────

app.ui -> app.backend: REST API calls (server-side)\n/api/catalogue /api/book-preview\n/api/chapter-text /api/audio etc.

# ─── Caddy routing ────────────────────────────────────────────────────────────
# Routes sent directly to backend (no SvelteKit counterpart):
#   /health /scrape*
#   /api/browse /api/book-preview/* /api/chapter-text/*
#   /api/reindex/* /api/cover/* /api/audio-proxy/*
# Routes sent to MinIO:
#   /avatars/*
# Everything else → SvelteKit UI (including /api/scrape/*, /api/chapter-text-preview/*)

app.caddy -> app.ui: "/* (catch-all)\n/api/scrape/*\n/api/chapter-text-preview/*\n→ SvelteKit (auth enforced)"
app.caddy -> app.backend: "/health /scrape*\n/api/browse /api/book-preview/*\n/api/chapter-text/*\n/api/reindex/* /api/cover/*\n/api/audio-proxy/*"
app.caddy -> storage.minio: "/avatars/*\n/audio/*\n/chapters/*\n(presigned MinIO GETs)"

# ─── External → App ───────────────────────────────────────────────────────────

app.runner -> novelfire: scrape\n(HTTP GET)
app.runner -> kokoro: TTS generation\n(HTTP POST)
app.caddy -> letsencrypt: ACME certificate\n(TLS-ALPN-01)

# ─── Ops → Docker socket ──────────────────────────────────────────────────────

ops.watchtower -> app.backend: watch (label-enabled)
ops.watchtower -> app.runner: watch (label-enabled)
ops.watchtower -> app.ui: watch (label-enabled)

# ─── Browser ──────────────────────────────────────────────────────────────────

browser -> app.caddy: HTTPS :443\n(single entry point)
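The `app.runner -> storage.meilisearch: index books on scrape completion` edge above can be pictured with Meilisearch's plain HTTP document API (`POST /indexes/{uid}/documents` upserts by primary key and answers 202 Accepted with an async task). A hedged Go sketch, where the `BookDoc` shape is an assumption pieced together from the diagram labels:

```go
package index

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
)

// BookDoc is a guess at the indexed document shape; "slug" is the primary key.
type BookDoc struct {
	Slug          string   `json:"slug"`
	Title         string   `json:"title"`
	Genres        []string `json:"genres"`
	TotalChapters int      `json:"total_chapters"`
}

// upsertBook adds or replaces one document in the "books" index.
func upsertBook(ctx context.Context, doc BookDoc) error {
	body, err := json.Marshal([]BookDoc{doc}) // the endpoint takes an array of docs
	if err != nil {
		return err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		os.Getenv("MEILI_URL")+"/indexes/books/documents", bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+os.Getenv("MEILI_API_KEY"))

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusAccepted { // Meilisearch enqueues a task
		return fmt.Errorf("meilisearch: unexpected status %s", resp.Status)
	}
	return nil
}
```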
129
docs/d2/architecture.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 58 KiB |
72
docs/mermaid/architecture.mermaid.md
Normal file
@@ -0,0 +1,72 @@
# Architecture Overview

```mermaid
graph LR
    %% ── External ──────────────────────────────────────────────────────────
    NF([novelfire.net])
    KK([Kokoro-FastAPI TTS])
    LE([Let's Encrypt])
    CL([Browser / iOS App])

    %% ── Init containers ───────────────────────────────────────────────────
    subgraph INIT["Init containers (one-shot)"]
        MI[minio-init\nmc: create buckets]
        PI[pb-init\nbootstrap collections]
    end

    %% ── Storage ───────────────────────────────────────────────────────────
    subgraph STORAGE["Storage"]
        MN[(MinIO :9000\nchapters · audio\navatars · browse)]
        PB[(PocketBase :8090\nbooks · chapters_idx\naudio_cache · progress\nscrape_jobs · app_users · ranking)]
        VK[(Valkey :6379\npresign URL cache\nTTL-based · shared)]
        MS[(Meilisearch :7700\nindex: books)]
    end

    %% ── Application ───────────────────────────────────────────────────────
    subgraph APP["Application"]
        CD["Caddy :443/:80\ncustom build + caddy-ratelimit\nauto-HTTPS · security headers\nrate limiting · error pages"]
        BE[Backend API :8080\nGo HTTP server]
        RN[Runner :9091\nGo background worker\n/metrics endpoint]
        UI[SvelteKit UI :3000\nadapter-node]
    end

    %% ── Ops ───────────────────────────────────────────────────────────────
    subgraph OPS["Ops"]
        WT[Watchtower\npolls every 5 min\nautopull + redeploy\nbackend · runner · ui]
    end

    %% ── Init → Storage ────────────────────────────────────────────────────
    MI -.->|create buckets| MN
    PI -.->|bootstrap schema| PB

    %% ── App → Storage ─────────────────────────────────────────────────────
    BE -->|blobs| MN
    BE -->|structured records| PB
    BE -->|cache presigned URLs| VK
    RN -->|chapter markdown & audio| MN
    RN -->|read/update jobs & books| PB
    RN -->|index books on scrape| MS
    UI -->|read presign cache| VK
    UI -->|auth · progress · comments| PB

    %% ── App internal ──────────────────────────────────────────────────────
    UI -->|"REST API (server-side)\n/api/catalogue /api/book-preview\n/api/chapter-text /api/audio"| BE

    %% ── Caddy routing ─────────────────────────────────────────────────────
    CD -->|"/* catch-all\n/api/scrape/*\n/api/chapter-text-preview/*\n→ SvelteKit (auth enforced)"| UI
    CD -->|"/health /scrape*\n/api/browse /api/book-preview/*\n/api/chapter-text/*\n/api/reindex/* /api/cover/*\n/api/audio-proxy/*"| BE
    CD -->|/avatars/* presigned GETs| MN

    %% ── Runner → External ─────────────────────────────────────────────────
    RN -->|scrape HTTP GET| NF
    RN -->|TTS HTTP POST| KK
    CD -->|ACME certificate| LE

    %% ── Ops ───────────────────────────────────────────────────────────────
    WT -->|watch label-enabled| BE
    WT -->|watch label-enabled| RN
    WT -->|watch label-enabled| UI

    %% ── Client ────────────────────────────────────────────────────────────
    CL -->|HTTPS :443 single entry| CD
```
102
docs/mermaid/data-flow.mermaid.md
Normal file
@@ -0,0 +1,102 @@
# Data Flow — Scrape & TTS Job Pipeline

How content moves from novelfire.net through the runner into storage, and how
audio is generated on-demand via the backend.

## Catalogue Scrape Pipeline

The runner performs a background catalogue walk on startup and then on a
configurable interval (`RUNNER_CATALOGUE_REFRESH_INTERVAL`, default 24 h).

```mermaid
flowchart TD
    A([Runner starts / refresh tick]) --> B[Walk novelfire.net catalogue\npages 1…N]
    B --> C{Book already\nin PocketBase?}
    C -- no --> D[Scrape book metadata\ntitle · author · genres\ncover · summary · status]
    C -- yes --> E[Check for new chapters\ncompare total_chapters]
    D --> F[Write BookMeta\nto PocketBase books]
    E --> G{New chapters\nfound?}
    G -- no --> Z([Done — next book])
    G -- yes --> H
    F --> H[Scrape chapter list with upTo limit\n→ chapters_idx in PocketBase\nretries on 429 with Retry-After backoff]
    H --> I[Worker pool — N goroutines\nRUNNER_MAX_CONCURRENT_SCRAPE]
    I --> J[For each missing chapter:\nGET chapter HTML from novelfire.net]
    J --> K[Parse HTML → Markdown\nhtmlutil.NodeToMarkdown]
    K --> L[PUT object to MinIO\nchapters/{slug}/{n}.md]
    L --> M[Upsert book doc\nto Meilisearch index: books]
    M --> Z
    F --> M
```

## On-Demand Single-Book Scrape

Triggered when a user visits `/books/{slug}` and the book is not in PocketBase.
The UI calls `GET /api/book-preview/{slug}` → backend enqueues a scrape task.

```mermaid
sequenceDiagram
    actor U as User
    participant UI as SvelteKit UI
    participant BE as Backend API
    participant TQ as Task Queue (PocketBase)
    participant RN as Runner
    participant NF as novelfire.net
    participant PB as PocketBase
    participant MN as MinIO
    participant MS as Meilisearch

    U->>UI: Visit /books/{slug}
    UI->>BE: GET /api/book-preview/{slug}
    BE->>PB: getBook(slug) — not found
    BE->>TQ: INSERT scrape_task (slug, status=pending)
    BE-->>UI: 202 {task_id, message}
    UI-->>U: "Scraping…" placeholder

    RN->>TQ: Poll for pending tasks
    TQ-->>RN: scrape_task (slug)
    RN->>NF: GET novelfire.net/book/{slug}
    NF-->>RN: HTML
    RN->>PB: upsert book + chapters_idx
    RN->>MN: PUT chapter objects
    RN->>MS: UpsertBook doc
    RN->>TQ: UPDATE task status=done

    U->>UI: Poll GET /api/scrape/tasks/{task_id}
    UI->>BE: GET /api/scrape/status
    BE->>TQ: get task
    TQ-->>BE: status=done
    BE-->>UI: {status:"done"}
    UI-->>U: Redirect to /books/{slug}
```

## TTS Audio Generation Pipeline

Audio is generated lazily: on first request the job is enqueued; subsequent
requests poll for completion and then stream from MinIO via presigned URL.

```mermaid
flowchart TD
    A(["POST /api/audio/{slug}/{n}\nbody: voice=af_bella"]) --> B{Audio already\nin MinIO?}
    B -- yes --> C[200 status: done]
    B -- no --> D{Job already\nin queue?}
    D -- "yes pending/generating" --> E[202 task_id + status]
    D -- no --> F[INSERT audio_task\nstatus=pending\nin PocketBase]
    F --> E

    G([Runner polls task queue]) --> H[Claim audio_task\nstatus=generating]
    H --> I["GET /api/chapter-text/{slug}/{n}\nfrom backend — plain text"]
    I --> J[POST /v1/audio/speech\nto Kokoro-FastAPI\nbody: text + voice]
    J --> K[Stream MP3 response]
    K --> L[PUT object to MinIO\naudio/{slug}/{n}/{voice}.mp3]
    L --> M[UPDATE audio_task\nstatus=done]

    N(["Client polls\nGET /api/audio/status/{slug}/{n}"]) --> O{status?}
    O -- "pending/generating" --> N
    O -- done --> P["GET /api/presign/audio/{slug}/{n}"]
    P --> Q{Valkey cache hit?}
    Q -- yes --> R[302 → presigned URL]
    Q -- no --> S[GeneratePresignedURL\nfrom MinIO — TTL 1h]
    S --> T[Cache in Valkey\nTTL 3500s]
    T --> R
    R --> U([Client streams audio\ndirectly from MinIO])
```
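The lazy-generation decision at the top of this pipeline (cached → `200 done`, already queued → `202`, otherwise enqueue) can be sketched in Go as follows. `Store` and `Queue` are hypothetical interfaces standing in for the repo's MinIO and PocketBase task-queue clients, not its real types:

```go
package audio

import (
	"context"
	"encoding/json"
	"net/http"
)

// Store and Queue are hypothetical stand-ins for the repo's MinIO and
// PocketBase clients.
type Store interface {
	AudioExists(ctx context.Context, slug string, n int, voice string) (bool, error)
}

type Queue interface {
	PendingTask(ctx context.Context, slug string, n int, voice string) (id string, ok bool, err error)
	Enqueue(ctx context.Context, slug string, n int, voice string) (id string, err error)
}

// triggerAudio mirrors the POST /api/audio/{slug}/{n} decision: 200 if the
// MP3 is already in MinIO, 202 with a task id otherwise.
func triggerAudio(w http.ResponseWriter, r *http.Request, store Store, q Queue,
	slug string, n int, voice string) {

	ctx := r.Context()

	// Already generated → report done immediately.
	if ok, err := store.AudioExists(ctx, slug, n, voice); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	} else if ok {
		writeJSON(w, http.StatusOK, map[string]any{"status": "done"})
		return
	}

	// A pending/generating task already exists → return its id.
	if id, ok, err := q.PendingTask(ctx, slug, n, voice); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	} else if ok {
		writeJSON(w, http.StatusAccepted, map[string]any{"task_id": id, "status": "pending"})
		return
	}

	// Otherwise enqueue a new task for the runner to claim.
	id, err := q.Enqueue(ctx, slug, n, voice)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	writeJSON(w, http.StatusAccepted, map[string]any{"task_id": id, "status": "pending"})
}

func writeJSON(w http.ResponseWriter, code int, v any) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(code)
	_ = json.NewEncoder(w).Encode(v)
}
```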
111
docs/mermaid/request-flow.mermaid.md
Normal file
111
docs/mermaid/request-flow.mermaid.md
Normal file
@@ -0,0 +1,111 @@
|
||||
# Request Flow
|
||||
|
||||
Two representative request paths through the stack: a **page load** (SSR) and a
|
||||
**media playback** (presigned URL → direct MinIO stream).
|
||||
|
||||
## SSR Page Load — Catalogue / Book Detail
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
actor C as Browser / iOS App
|
||||
participant CD as Caddy :443
|
||||
participant UI as SvelteKit UI :3000
|
||||
participant BE as Backend API :8080
|
||||
participant MS as Meilisearch :7700
|
||||
participant PB as PocketBase :8090
|
||||
participant VK as Valkey :6379
|
||||
|
||||
C->>CD: HTTPS GET /catalogue
|
||||
CD->>UI: proxy /* (SvelteKit catch-all)
|
||||
UI->>BE: GET /api/catalogue?page=1&sort=popular
|
||||
BE->>MS: search(query, filters, sort)
|
||||
MS-->>BE: [{slug, title, …}, …]
|
||||
BE-->>UI: {books[], page, total, has_next}
|
||||
UI-->>CD: SSR HTML
|
||||
CD-->>C: 200 HTML
|
||||
|
||||
Note over C,UI: Infinite scroll — client fetches next page via SvelteKit API route
|
||||
C->>CD: HTTPS GET /api/catalogue-page?page=2
|
||||
CD->>UI: proxy /* (SvelteKit /api/catalogue-page server route)
|
||||
UI->>BE: GET /api/catalogue?page=2
|
||||
BE->>MS: search(…)
|
||||
MS-->>BE: next page
|
||||
BE-->>UI: {books[], …}
|
||||
UI-->>C: JSON
|
||||
```
|
||||
|
||||
## Audio Playback — Presigned URL Flow
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
actor C as Browser / iOS App
|
||||
participant CD as Caddy :443
|
||||
participant UI as SvelteKit UI :3000
|
||||
participant BE as Backend API :8080
|
||||
participant VK as Valkey :6379
|
||||
participant MN as MinIO :9000
|
||||
|
||||
C->>CD: GET /api/presign/audio/{slug}/{n}?voice=af_bella
|
||||
CD->>UI: proxy /* (SvelteKit /api/presign/audio route)
|
||||
UI->>BE: GET /api/presign/audio/{slug}/{n}?voice=af_bella
|
||||
BE->>VK: GET presign:audio:{slug}:{n}:{voice}
|
||||
alt cache hit
|
||||
VK-->>BE: presigned URL (TTL remaining)
|
||||
BE-->>UI: 302 redirect → presigned URL
|
||||
UI-->>C: 302 redirect
|
||||
else cache miss
|
||||
BE->>MN: GeneratePresignedURL(audio-bucket, key, 1h)
|
||||
MN-->>BE: presigned URL
|
||||
BE->>VK: SET presign:audio:… EX 3500
|
||||
BE-->>UI: 302 redirect → presigned URL
|
||||
UI-->>C: 302 redirect
|
||||
end
|
||||
C->>MN: GET presigned URL (direct, no proxy)
|
||||
MN-->>C: audio/mpeg stream
|
||||
```
|
||||
|
||||
## Chapter Read — SSR + Content Fetch
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
actor C as Browser / iOS App
|
||||
participant CD as Caddy :443
|
||||
participant UI as SvelteKit UI :3000
|
||||
participant BE as Backend API :8080
|
||||
participant PB as PocketBase :8090
|
||||
participant MN as MinIO :9000
|
||||
|
||||
C->>CD: HTTPS GET /books/{slug}/chapters/{n}
|
||||
CD->>UI: proxy /* (SvelteKit catch-all)
|
||||
UI->>PB: getBook(slug) + listChapterIdx(slug)
|
||||
PB-->>UI: book meta + chapter list
|
||||
UI->>BE: GET /api/chapter-text/{slug}/{n}
|
||||
BE->>MN: GetObject(chapters-bucket, {slug}/{n}.md)
|
||||
MN-->>BE: markdown text
|
||||
BE-->>UI: plain text (markdown stripped)
|
||||
Note over UI: marked() → HTML
|
||||
UI-->>CD: SSR HTML
|
||||
CD-->>C: 200 HTML
|
||||
```

## Caddy Request Lifecycle

Shows how security hardening applies before a request reaches any upstream.

```mermaid
flowchart TD
    A([Incoming HTTPS request]) --> B[TLS termination\nLet's Encrypt cert]
    B --> C{Rate limit check\ncaddy-ratelimit}
    C -- over limit --> D[429 Too Many Requests]
    C -- ok --> E[Add security headers\nX-Frame-Options · X-Content-Type-Options\nReferrer-Policy · Permissions-Policy\nHSTS · X-XSS-Protection\nremove Server header]
    E --> F{Route match}
    F -- "/health /scrape*\n/api/browse /api/book-preview/*\n/api/chapter-text/*\n/api/reindex/* /api/cover/*\n/api/audio-proxy/*" --> G[reverse_proxy → backend:8080]
    F -- "/avatars/*" --> H[reverse_proxy → minio:9000]
    F -- "/* everything else\n(incl. /api/scrape/*\n/api/chapter-text-preview/*)" --> I[reverse_proxy → ui:3000\nSvelteKit auth middleware runs]
    G --> J{Upstream healthy?}
    H --> J
    I --> J
    J -- yes --> K([Response to client])
    J -- "502/503/504" --> L[handle_errors\nstatic HTML from /srv/errors/]
    L --> K
```
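
The hardening itself lives in Caddy directives, not in application code. Purely as an illustration, here is the same header set expressed as a Go net/http middleware; the specific header values are typical choices, not necessarily the production ones:

```go
package main

import "net/http"

// securityHeaders applies the same header hardening the Caddy layer performs,
// shown here only to make the header set concrete.
func securityHeaders(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        h := w.Header()
        h.Set("X-Frame-Options", "DENY")
        h.Set("X-Content-Type-Options", "nosniff")
        h.Set("Referrer-Policy", "strict-origin-when-cross-origin")
        h.Set("Permissions-Policy", "camera=(), microphone=(), geolocation=()")
        h.Set("Strict-Transport-Security", "max-age=31536000; includeSubDomains")
        h.Set("X-XSS-Protection", "1; mode=block")
        // Caddy also strips the Server header; Go's server does not set one
        // for plain handlers, so there is nothing to remove here.
        next.ServeHTTP(w, r)
    })
}
```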

5 dozzle/users.yml Normal file
@@ -0,0 +1,5 @@
users:
  admin:
    name: admin
    email: admin@libnovel.cc
    password: "$2y$10$4jqLza2grpxnQn0EGux2C.UmlSxRmOvH/J1ySzOBxMZgW6cA2TnmK"
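
Dozzle expects the password as a bcrypt hash. A small sketch for rotating it, using golang.org/x/crypto/bcrypt; that Dozzle's simple-auth check accepts any standard bcrypt variant is an assumption worth verifying against its docs:

```go
// hashpw prints a bcrypt hash suitable for dozzle/users.yml.
package main

import (
    "fmt"
    "os"

    "golang.org/x/crypto/bcrypt"
)

func main() {
    if len(os.Args) != 2 {
        fmt.Fprintln(os.Stderr, "usage: hashpw <password>")
        os.Exit(1)
    }
    // DefaultCost (10) matches the cost of the committed hash above.
    hash, err := bcrypt.GenerateFromPassword([]byte(os.Args[1]), bcrypt.DefaultCost)
    if err != nil {
        panic(err)
    }
    fmt.Println(string(hash))
}
```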

14 ios/.gitignore vendored Normal file
@@ -0,0 +1,14 @@
# Xcode build artifacts — regenerate with: xcodegen generate --spec project.yml
xcuserdata/
*.xcuserstate
*.xcworkspace/xcuserdata/
DerivedData/
build/

# Swift Package Manager — resolved by Xcode on first open
LibNovel.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/
.build/
# Package.resolved is committed so SPM builds are reproducible

# OS
.DS_Store

87 ios/AGENTS.md Normal file
@@ -0,0 +1,87 @@
# LibNovel iOS App

SwiftUI app targeting iOS 17+. Consumes the Go scraper HTTP API for books, chapters, and audio. Uses MinIO presigned URLs for media playback and downloads.

## Project Structure

```
ios/LibNovel/LibNovel/
├── App/             # LibNovelApp.swift, ContentView.swift, RootTabView.swift
├── Models/          # Models.swift (all domain types)
├── Networking/      # APIClient.swift (URLSession-based HTTP client)
├── Services/        # AudioPlayerService, AudioDownloadService, AuthStore,
│                    # BookVoicePreferences, NetworkMonitor
├── ViewModels/      # One per view/feature (HomeViewModel, BrowseViewModel, etc.)
├── Views/
│   ├── Auth/            # AuthView
│   ├── BookDetail/      # BookDetailView, CommentsView
│   ├── Browse/          # BrowseView (infinite scroll shelves)
│   ├── ChapterReader/   # ChapterReaderView, DownloadAudioButton
│   ├── Common/          # CommonViews (shared reusable components)
│   ├── Components/      # OfflineBanner
│   ├── Downloads/       # DownloadsView, DownloadQueueButton
│   ├── Home/            # HomeView
│   ├── Library/         # LibraryView (2-col grid, filters)
│   ├── Player/          # PlayerViews (floating FAB, compact, full-screen)
│   ├── Profile/         # ProfileView, VoiceSelectionView, UserProfileView, etc.
│   └── Search/          # SearchView
└── Extensions/      # NavDestination.swift, String+App.swift, Color+App.swift
```

## iOS / Swift Conventions

- **Deployment target**: iOS 17.0 — use iOS 17+ APIs freely.
- **Observable pattern**: The codebase currently uses `@StateObject` / `ObservableObject` / `@Published`. When adding new types, prefer the **`@Observable` macro** (iOS 17+) over `ObservableObject`. Do not refactor existing types unless explicitly asked.
- **Navigation**: Use `NavigationStack` (not `NavigationView`). Use `.navigationDestination(for:)` for type-safe routing.
- **Concurrency**: Use `async/await` and structured concurrency. Avoid callback-based APIs and `DispatchQueue.main.async` — prefer `@MainActor` or `await MainActor.run`.
- **State management**: Prefer `@State` + `@Binding` for local UI state. Use environment objects for app-wide services (authStore, audioPlayer, downloadService, networkMonitor).
- **SwiftData**: Not currently used. Do not introduce SwiftData without discussion.
- **SF Symbols**: Use `Image(systemName:)` for icons. No emoji in UI unless already present.

## Key Patterns

- **Download keys**: Use `::` as separator (e.g., `"slug::chapter-1::voice"`), never `-`. Slugs contain hyphens.
- **Voice fallback chain**: book override → global default → `"af_bella"`. See `BookVoicePreferences.voiceWithFallback()`.
- **Offline handling**: Wrap view bodies in `VStack` with `OfflineBanner` at top. Use `NetworkMonitor` (environment object) to gate network calls. Suppress network errors silently when offline via `ErrorAlertModifier`.
- **Audio playback priority**: local file → MinIO presigned URL → trigger TTS generation.
- **Progress display**: Show decimal % when < 10% (e.g., "3.4%"), rounded when >= 10% (e.g., "47%").
- **Cover images**: Always proxy via `/api/cover/{domain}/{slug}` — never link directly to source.

## Networking

`APIClient.swift` wraps all Go scraper API calls. When adding new endpoints:

1. Add a method to `APIClient`.
2. Keep error handling consistent — throw typed errors, let ViewModels catch and set `errorMessage`.
3. All requests are relative to `SCRAPER_API_URL` (configured at build time via xcconfig or environment).

## Using Documentation Tools

When writing or reviewing SwiftUI/Swift code:

- Use `context7` to look up current Apple SwiftUI/Swift documentation before implementing anything non-trivial. Apple's APIs evolve fast — do not rely on training data alone.
- Use `gh_grep` to find real-world Swift patterns when unsure how something is typically implemented.

Example prompts:
- "How does `.searchable` work in iOS 17? use context7"
- "Show me examples of `@Observable` with async tasks. use context7"
- "How do other apps implement background URLSession downloads in Swift? use gh_grep"

## UI/UX Skill

For any iOS view work, always load the `ios-ux` skill at the start of the task:

```
skill({ name: "ios-ux" })
```

This skill defines the full design system, animation rules, haptic feedback policy, accessibility checklist, performance guidelines, and offline handling requirements. It also governs how to handle screenshot-based reviews (analyze → suggest → confirm before applying).

## What to Avoid

- `NavigationView` — deprecated, use `NavigationStack`
- `ObservableObject` / `@Published` for new types — prefer `@Observable`
- `DispatchQueue.main.async` — prefer `@MainActor`
- Force unwrapping optionals
- Hardcoded color literals — use `Color+App.swift` extensions or semantic colors
- Adding new dependencies (SPM packages) without discussion

7 ios/LibNovelV2/LibNovelV2.xcodeproj/project.xcworkspace/contents.xcworkspacedata generated Normal file
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<Workspace
   version = "1.0">
   <FileRef
      location = "self:">
   </FileRef>
</Workspace>

129 justfile Normal file
@@ -0,0 +1,129 @@
# ── LibNovel v3 — justfile ────────────────────────────────────────────────────
# All commands that touch docker-compose are wrapped with `doppler run` so that
# secrets are injected into the environment at runtime — no .env files needed.
#
# Prerequisites:
#   brew install doppler just
#   doppler setup          (run once; selects project=libnovel config=prd)
#
# Usage:
#   just up        # start all services (detached)
#   just down      # stop all services
#   just logs      # tail all service logs
#   just ps        # show running containers
#   just build     # rebuild backend + ui images
#   just restart   # full stop + start cycle
#   just secrets   # print all injected secrets (debug)

set dotenv-load := false  # Doppler handles all env; never load a .env file

# ── Helpers ───────────────────────────────────────────────────────────────────

# Inject secrets from Doppler, then run the given command
doppler := "doppler run --"

# ── Core compose commands ─────────────────────────────────────────────────────

# Start all services in the background
up:
    {{doppler}} docker compose up -d

# Start and stream logs (foreground)
up-fg:
    {{doppler}} docker compose up

# Stop all running services
down:
    {{doppler}} docker compose down

# Stop and remove volumes (full reset — destructive!)
down-volumes:
    {{doppler}} docker compose down -v

# Show service status
ps:
    {{doppler}} docker compose ps

# ── Build & publish ───────────────────────────────────────────────────────────

# Build (or rebuild) all custom images locally
build:
    {{doppler}} docker compose build

# Build a specific service, e.g.: just build-svc backend
build-svc svc:
    {{doppler}} docker compose build {{svc}}

# Push all custom images to Docker Hub (requires docker login)
push:
    {{doppler}} docker compose push backend runner ui caddy

# Build then push all custom images
build-push: build push

# Pull all images from Docker Hub (uses GIT_TAG from Doppler)
pull-images:
    {{doppler}} docker compose pull backend runner ui caddy

# Pull all third-party base images (minio, pocketbase, etc.)
pull-infra:
    {{doppler}} docker compose pull minio pocketbase meilisearch valkey postgres crowdsec watchtower

# ── Logs ─────────────────────────────────────────────────────────────────────

# Tail all service logs (last 50 lines + follow)
logs:
    {{doppler}} docker compose logs -f --tail=50

# Tail a specific service, e.g.: just log backend
log svc:
    {{doppler}} docker compose logs -f --tail=50 {{svc}}

# ── Lifecycle ─────────────────────────────────────────────────────────────────

# Full restart: stop then start
restart: down up

# Restart a single service, e.g.: just restart-svc backend
restart-svc svc:
    {{doppler}} docker compose restart {{svc}}

# Pull → build → recreate (rolling update without clearing volumes)
update:
    {{doppler}} docker compose pull
    {{doppler}} docker compose build
    {{doppler}} docker compose up -d

# ── Initialisation ────────────────────────────────────────────────────────────

# Run one-shot init containers (minio-init, pb-init, postgres-init)
init:
    {{doppler}} docker compose run --rm minio-init
    {{doppler}} docker compose run --rm pb-init
    {{doppler}} docker compose run --rm postgres-init

# ── Shell access ──────────────────────────────────────────────────────────────

# Open a shell in a running service, e.g.: just shell backend
shell svc:
    {{doppler}} docker compose exec {{svc}} sh

# ── Secrets ───────────────────────────────────────────────────────────────────

# Print all secrets Doppler will inject (never redirected to a file)
secrets:
    doppler secrets --project libnovel --config prd

# Print secrets as a .env-formatted list (useful for debugging)
secrets-env:
    doppler secrets download --project libnovel --config prd --format env --no-file

# Open Doppler dashboard in browser
secrets-dashboard:
    doppler open dashboard

# ── Gitea CI ──────────────────────────────────────────────────────────────────

# Validate workflow files
ci-lint:
    actionlint .gitea/workflows/*.yaml

12 opencode.json Normal file
@@ -0,0 +1,12 @@
{
  "$schema": "https://opencode.ai/config.json",
  "mcp": {
    "gh_grep": {
      "type": "remote",
      "url": "https://mcp.grep.app",
      "enabled": true
    }
  },
  "instructions": [
  ]
}

@@ -1,44 +0,0 @@
# ── Build stage ────────────────────────────────────────────────────────────────
FROM golang:1.25-alpine AS builder

WORKDIR /build

# Cache dependency downloads separately from source compilation.
COPY go.mod go.sum ./
RUN go mod download

COPY . .

RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \
    go build -ldflags="-s -w" -o /scraper ./cmd/scraper

# ── Runtime stage ──────────────────────────────────────────────────────────────
FROM alpine:3.20

# ca-certificates is required for HTTPS requests to novelfire.net.
RUN apk add --no-cache ca-certificates tzdata

WORKDIR /app

COPY --from=builder /scraper /app/scraper

# Create the default static output directory.
RUN mkdir -p /app/static/books

# Non-root user.
RUN addgroup -S scraper && adduser -S scraper -G scraper
RUN chown -R scraper:scraper /app
USER scraper

# ── Configuration ─────────────────────────────────────────────────────────────
ENV BROWSERLESS_URL=http://browserless:3030
ENV BROWSERLESS_STRATEGY=content
ENV SCRAPER_WORKERS=0
ENV SCRAPER_STATIC_ROOT=/app/static/books
ENV SCRAPER_HTTP_ADDR=:8080

EXPOSE 8080

# Default: run as an HTTP server. Override CMD to use "run" for one-shot.
ENTRYPOINT ["/app/scraper"]
CMD ["serve"]

@@ -1,217 +0,0 @@
// Command scraper is the entrypoint for the libnovel scraper service.
//
// Usage (CLI one-shot):
//
//    scraper run [--url <book-url>]
//
// Usage (HTTP server):
//
//    scraper serve
//
// Environment variables:
//
//    BROWSERLESS_URL             Browserless base URL (default: http://localhost:3030)
//    BROWSERLESS_TOKEN           Browserless API token (default: "")
//    BROWSERLESS_STRATEGY        content | scrape | cdp | direct (default: direct)
//    BROWSERLESS_URL_STRATEGY    Strategy for URL retrieval (default: content)
//    BROWSERLESS_MAX_CONCURRENT  Max simultaneous browser sessions (default: 5)
//    BROWSERLESS_TIMEOUT         HTTP request timeout in seconds (default: 90)
//    SCRAPER_WORKERS             Chapter goroutine count (default: NumCPU)
//    SCRAPER_STATIC_ROOT         Output directory (default: ./static/books)
//    SCRAPER_HTTP_ADDR           HTTP listen address (default: :8080)
//    KOKORO_URL                  Kokoro-FastAPI base URL (default: "")
//    KOKORO_VOICE                Default TTS voice (default: af_bella)
//    LOG_LEVEL                   debug | info | warn | error (default: info)
package main

import (
    "context"
    "fmt"
    "log/slog"
    "os"
    "os/signal"
    "runtime"
    "strconv"
    "strings"
    "syscall"
    "time"

    "github.com/libnovel/scraper/internal/browser"
    "github.com/libnovel/scraper/internal/novelfire"
    "github.com/libnovel/scraper/internal/orchestrator"
    "github.com/libnovel/scraper/internal/server"
    "github.com/libnovel/scraper/internal/writer"
)

func main() {
    logLevel := slog.LevelInfo
    if v := os.Getenv("LOG_LEVEL"); v != "" {
        if err := logLevel.UnmarshalText([]byte(v)); err != nil {
            fmt.Fprintf(os.Stderr, "invalid LOG_LEVEL %q, using info\n", v)
        }
    }
    log := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
        Level: logLevel,
    }))

    if err := run(log); err != nil {
        log.Error("fatal", "err", err)
        os.Exit(1)
    }
}

func run(log *slog.Logger) error {
    args := os.Args[1:]
    if len(args) == 0 {
        printUsage()
        return nil
    }

    cmd := strings.ToLower(args[0])

    browserCfg := browser.Config{
        BaseURL: envOr("BROWSERLESS_URL", "http://localhost:3030"),
        Token:   envOr("BROWSERLESS_TOKEN", ""),
    }
    browserCfg.MaxConcurrent = 5
    if s := os.Getenv("BROWSERLESS_MAX_CONCURRENT"); s != "" {
        if n, err := strconv.Atoi(s); err == nil && n > 0 {
            browserCfg.MaxConcurrent = n
        }
    }
    if s := os.Getenv("BROWSERLESS_TIMEOUT"); s != "" {
        if n, err := strconv.Atoi(s); err == nil && n > 0 {
            browserCfg.Timeout = time.Duration(n) * time.Second
        }
    }

    strategy := browser.Strategy(strings.ToLower(envOr("BROWSERLESS_STRATEGY", string(browser.StrategyDirect))))
    urlStrategy := browser.Strategy(strings.ToLower(envOr("BROWSERLESS_URL_STRATEGY", string(browser.StrategyContent))))
    bc := newBrowserClient(strategy, browserCfg)
    urlClient := newBrowserClient(urlStrategy, browserCfg)

    staticRoot := envOr("SCRAPER_STATIC_ROOT", "./static/books")
    w := writer.New(staticRoot)
    nf := novelfire.New(bc, log, urlClient, w)

    workers := 0
    if s := os.Getenv("SCRAPER_WORKERS"); s != "" {
        n, err := strconv.Atoi(s)
        if err == nil && n > 0 {
            workers = n
        }
    }
    if workers == 0 {
        workers = runtime.NumCPU()
    }

    oCfg := orchestrator.Config{
        Workers:    workers,
        StaticRoot: staticRoot,
    }

    ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
    defer stop()

    switch cmd {
    case "run":
        // Optional --url flag.
        if len(args) >= 3 && args[1] == "--url" {
            oCfg.SingleBookURL = args[2]
        }
        log.Info("starting one-shot scrape",
            "strategy", strategy,
            "workers", workers,
            "max_concurrent", browserCfg.MaxConcurrent,
            "static_root", oCfg.StaticRoot,
            "single_book", oCfg.SingleBookURL,
        )
        o := orchestrator.New(oCfg, nf, log)
        return o.Run(ctx)

    case "refresh":
        // refresh <slug> - re-scrape a book from its saved source_url.
        if len(args) < 2 {
            return fmt.Errorf("refresh command requires a book slug argument")
        }
        slug := args[1]
        w := writer.New(oCfg.StaticRoot)
        meta, ok, err := w.ReadMetadata(slug)
        if err != nil {
            return fmt.Errorf("failed to read metadata for %s: %w", slug, err)
        }
        if !ok {
            return fmt.Errorf("book %q not found in %s", slug, oCfg.StaticRoot)
        }
        if meta.SourceURL == "" {
            return fmt.Errorf("book %q has no source_url in metadata", slug)
        }
        oCfg.SingleBookURL = meta.SourceURL
        log.Info("refreshing book from source_url",
            "slug", slug,
            "source_url", meta.SourceURL,
        )
        o := orchestrator.New(oCfg, nf, log)
        return o.Run(ctx)

    case "serve":
        addr := envOr("SCRAPER_HTTP_ADDR", ":8080")
        kokoroURL := envOr("KOKORO_URL", "")
        kokoroVoice := envOr("KOKORO_VOICE", "af_bella")
        log.Info("starting HTTP server",
            "addr", addr,
            "strategy", strategy,
            "workers", workers,
            "max_concurrent", browserCfg.MaxConcurrent,
            "kokoro_url", kokoroURL,
            "kokoro_voice", kokoroVoice,
        )
        srv := server.New(addr, oCfg, nf, log, kokoroURL, kokoroVoice)
        return srv.ListenAndServe(ctx)

    default:
        return fmt.Errorf("unknown command %q; use 'run', 'refresh', or 'serve'", cmd)
    }
}

func newBrowserClient(strategy browser.Strategy, cfg browser.Config) browser.BrowserClient {
    switch strategy {
    case browser.StrategyScrape:
        return browser.NewScrapeClient(cfg)
    case browser.StrategyCDP:
        return browser.NewCDPClient(cfg)
    case browser.StrategyDirect:
        return browser.NewDirectHTTPClient(cfg)
    default:
        return browser.NewContentClient(cfg)
    }
}

func envOr(key, fallback string) string {
    if v := os.Getenv(key); v != "" {
        return v
    }
    return fallback
}

func printUsage() {
    fmt.Fprintf(os.Stderr, `libnovel scraper

Commands:
  run [--url <book-url>]   One-shot: scrape full catalogue, or a single book
  refresh <slug>           Re-scrape a book from its saved source_url
  serve                    Start HTTP server (POST /scrape, POST /scrape/book)

Environment variables:
  BROWSERLESS_URL             Browserless base URL (default: http://localhost:3030)
  BROWSERLESS_TOKEN           API token (default: "")
  BROWSERLESS_STRATEGY        content|scrape|cdp|direct (default: direct)
  BROWSERLESS_URL_STRATEGY    Strategy for URL retrieval (default: content)
  BROWSERLESS_MAX_CONCURRENT  Max simultaneous sessions (default: 5)
  BROWSERLESS_TIMEOUT         HTTP request timeout sec (default: 90)
  SCRAPER_WORKERS             Chapter goroutines (default: NumCPU = %d)
  SCRAPER_STATIC_ROOT         Output directory (default: ./static/books)
  SCRAPER_HTTP_ADDR           HTTP listen address (default: :8080)
  KOKORO_URL                  Kokoro-FastAPI base URL (default: "", TTS disabled)
  KOKORO_VOICE                Default TTS voice (default: af_bella)
  LOG_LEVEL                   debug|info|warn|error (default: info)
`, runtime.NumCPU())
}

@@ -1,18 +0,0 @@
module github.com/libnovel/scraper

go 1.25.0

require (
    github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c // indirect
    github.com/gorilla/websocket v1.5.3 // indirect
    github.com/yuin/goldmark v1.7.16 // indirect
    golang.org/x/exp/typeparams v0.0.0-20231108232855-2478ac86f678 // indirect
    golang.org/x/mod v0.31.0 // indirect
    golang.org/x/net v0.51.0 // indirect
    golang.org/x/sync v0.19.0 // indirect
    golang.org/x/tools v0.40.1-0.20260108161641-ca281cf95054 // indirect
    gopkg.in/yaml.v3 v3.0.1 // indirect
    honnef.co/go/tools v0.7.0 // indirect
)

tool honnef.co/go/tools/cmd/staticcheck

@@ -1,22 +0,0 @@
github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c h1:pxW6RcqyfI9/kWtOwnv/G+AzdKuy2ZrqINhenH4HyNs=
github.com/BurntSushi/toml v1.4.1-0.20240526193622-a339e1f7089c/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/yuin/goldmark v1.7.16 h1:n+CJdUxaFMiDUNnWC3dMWCIQJSkxH4uz3ZwQBkAlVNE=
github.com/yuin/goldmark v1.7.16/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
golang.org/x/exp/typeparams v0.0.0-20231108232855-2478ac86f678 h1:1P7xPZEwZMoBoz0Yze5Nx2/4pxj6nw9ZqHWXqP0iRgQ=
golang.org/x/exp/typeparams v0.0.0-20231108232855-2478ac86f678/go.mod h1:AbB0pIl9nAr9wVwH+Z2ZpaocVmF5I4GyWCDIsVjR0bk=
golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI=
golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg=
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/tools v0.40.1-0.20260108161641-ca281cf95054 h1:CHVDrNHx9ZoOrNN9kKWYIbT5Rj+WF2rlwPkhbQQ5V4U=
golang.org/x/tools v0.40.1-0.20260108161641-ca281cf95054/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
honnef.co/go/tools v0.7.0 h1:w6WUp1VbkqPEgLz4rkBzH/CSU6HkoqNLp6GstyTx3lU=
honnef.co/go/tools v0.7.0/go.mod h1:pm29oPxeP3P82ISxZDgIYeOaf9ta6Pi0EWvCFoLG2vc=

@@ -1,137 +0,0 @@
package browser

import (
    "context"
    "encoding/json"
    "fmt"
    "net/http"
    "strings"
    "sync/atomic"
    "time"

    "github.com/gorilla/websocket"
)

// cdpClient implements BrowserClient using the CDP WebSocket endpoint.
type cdpClient struct {
    cfg Config
    sem chan struct{}
}

// NewCDPClient returns a BrowserClient that uses CDP WebSocket sessions.
func NewCDPClient(cfg Config) BrowserClient {
    if cfg.Timeout == 0 {
        cfg.Timeout = 60 * time.Second
    }
    return &cdpClient{cfg: cfg, sem: makeSem(cfg.MaxConcurrent)}
}

func (c *cdpClient) Strategy() Strategy { return StrategyCDP }

func (c *cdpClient) GetContent(_ context.Context, _ ContentRequest) (string, error) {
    return "", fmt.Errorf("CDP client does not support /content; use NewContentClient")
}

func (c *cdpClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeResponse, error) {
    return ScrapeResponse{}, fmt.Errorf("CDP client does not support /scrape; use NewScrapeClient")
}

// CDPSession opens a WebSocket to the Browserless /devtools/browser endpoint,
// navigates to pageURL, and invokes fn with a live CDPConn.
func (c *cdpClient) CDPSession(ctx context.Context, pageURL string, fn CDPSessionFunc) error {
    if err := acquire(ctx, c.sem); err != nil {
        return fmt.Errorf("cdp: semaphore: %w", err)
    }
    defer release(c.sem)

    // Build WebSocket URL: ws://host:port/devtools/browser?token=...&url=...
    wsURL := strings.Replace(c.cfg.BaseURL, "http://", "ws://", 1)
    wsURL = strings.Replace(wsURL, "https://", "wss://", 1)
    wsURL += "/devtools/browser"
    sep := "?"
    if c.cfg.Token != "" {
        wsURL += sep + "token=" + c.cfg.Token
        sep = "&"
    }
    wsURL += sep + "url=" + pageURL

    dialer := websocket.Dialer{
        HandshakeTimeout: 15 * time.Second,
        Proxy:            http.ProxyFromEnvironment,
    }

    conn, _, err := dialer.DialContext(ctx, wsURL, nil)
    if err != nil {
        return fmt.Errorf("cdp: dial %s: %w", wsURL, err)
    }

    cdp := &cdpConn{ws: conn}
    defer cdp.Close()

    return fn(ctx, cdp)
}

// ─── cdpConn ─────────────────────────────────────────────────────────────────

type cdpConn struct {
    ws      *websocket.Conn
    counter atomic.Int64
}

type cdpRequest struct {
    ID     int64          `json:"id"`
    Method string         `json:"method"`
    Params map[string]any `json:"params,omitempty"`
}

type cdpResponse struct {
    ID     int64          `json:"id"`
    Result map[string]any `json:"result,omitempty"`
    Error  *struct {
        Code    int    `json:"code"`
        Message string `json:"message"`
    } `json:"error,omitempty"`
}

func (c *cdpConn) Send(ctx context.Context, method string, params map[string]any) (map[string]any, error) {
    id := c.counter.Add(1)

    req := cdpRequest{ID: id, Method: method, Params: params}
    data, err := json.Marshal(req)
    if err != nil {
        return nil, fmt.Errorf("cdp send: marshal: %w", err)
    }

    if dl, ok := ctx.Deadline(); ok {
        _ = c.ws.SetWriteDeadline(dl)
    }
    if err := c.ws.WriteMessage(websocket.TextMessage, data); err != nil {
        return nil, fmt.Errorf("cdp send: write: %w", err)
    }

    // Read messages until we find the response matching our id.
    for {
        if dl, ok := ctx.Deadline(); ok {
            _ = c.ws.SetReadDeadline(dl)
        }
        _, msg, err := c.ws.ReadMessage()
        if err != nil {
            return nil, fmt.Errorf("cdp send: read: %w", err)
        }
        var resp cdpResponse
        if err := json.Unmarshal(msg, &resp); err != nil {
            continue // skip non-JSON frames (events etc.)
        }
        if resp.ID != id {
            continue // event or different command reply
        }
        if resp.Error != nil {
            return nil, fmt.Errorf("cdp error %d: %s", resp.Error.Code, resp.Error.Message)
        }
        return resp.Result, nil
    }
}

func (c *cdpConn) Close() error {
    return c.ws.Close()
}

@@ -1,195 +0,0 @@
package browser

import (
    "bytes"
    "context"
    "encoding/json"
    "fmt"
    "io"
    "net/http"
    "time"
)

// Config holds the connection parameters for a Browserless instance.
type Config struct {
    // BaseURL is the HTTP base URL, e.g. "http://localhost:3030".
    BaseURL string
    // Token is the optional API token (BROWSERLESS_TOKEN env var).
    Token string
    // Timeout is the per-request HTTP timeout; defaults to 60 s.
    Timeout time.Duration
    // MaxConcurrent caps the number of simultaneous in-flight requests sent to
    // Browserless. When all slots are occupied new calls block until one
    // completes (or ctx is cancelled). 0 means no limit.
    MaxConcurrent int
}

// makeSem returns a buffered channel used as a counting semaphore.
// If n <= 0 a nil channel is returned, which causes acquire/release to be no-ops.
func makeSem(n int) chan struct{} {
    if n <= 0 {
        return nil
    }
    return make(chan struct{}, n)
}

// acquire takes one slot from sem. It returns an error if ctx is cancelled
// before a slot becomes available. If sem is nil it returns immediately.
func acquire(ctx context.Context, sem chan struct{}) error {
    if sem == nil {
        return nil
    }
    select {
    case sem <- struct{}{}:
        return nil
    case <-ctx.Done():
        return ctx.Err()
    }
}

// release frees the slot previously obtained by acquire.
// If sem is nil it is a no-op.
func release(sem chan struct{}) {
    if sem != nil {
        <-sem
    }
}

// contentClient implements BrowserClient using the /content endpoint.
type contentClient struct {
    cfg  Config
    http *http.Client
    sem  chan struct{}
}

// NewContentClient returns a BrowserClient that uses POST /content.
func NewContentClient(cfg Config) BrowserClient {
    if cfg.Timeout == 0 {
        cfg.Timeout = 90 * time.Second
    }
    return &contentClient{
        cfg:  cfg,
        http: &http.Client{Timeout: cfg.Timeout},
        sem:  makeSem(cfg.MaxConcurrent),
    }
}

func (c *contentClient) Strategy() Strategy { return StrategyContent }

func (c *contentClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
    if err := acquire(ctx, c.sem); err != nil {
        return "", fmt.Errorf("content: semaphore: %w", err)
    }
    defer release(c.sem)

    body, err := json.Marshal(req)
    if err != nil {
        return "", fmt.Errorf("content: marshal request: %w", err)
    }

    url := c.cfg.BaseURL + "/content"
    if c.cfg.Token != "" {
        url += "?token=" + c.cfg.Token
    }

    httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
    if err != nil {
        return "", fmt.Errorf("content: build request: %w", err)
    }
    httpReq.Header.Set("Content-Type", "application/json")

    resp, err := c.http.Do(httpReq)
    if err != nil {
        return "", fmt.Errorf("content: do request: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        b, _ := io.ReadAll(resp.Body)
        return "", fmt.Errorf("content: unexpected status %d: %s", resp.StatusCode, b)
    }

    raw, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", fmt.Errorf("content: read body: %w", err)
    }
    return string(raw), nil
}

func (c *contentClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeResponse, error) {
    return ScrapeResponse{}, fmt.Errorf("content client does not support /scrape; use NewScrapeClient")
}

func (c *contentClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
    return fmt.Errorf("content client does not support CDP; use NewCDPClient")
}

// ─── /scrape client ───────────────────────────────────────────────────────────

type scrapeClient struct {
    cfg  Config
    http *http.Client
    sem  chan struct{}
}

// NewScrapeClient returns a BrowserClient that uses POST /scrape.
func NewScrapeClient(cfg Config) BrowserClient {
    if cfg.Timeout == 0 {
        cfg.Timeout = 90 * time.Second
    }
    return &scrapeClient{
        cfg:  cfg,
        http: &http.Client{Timeout: cfg.Timeout},
        sem:  makeSem(cfg.MaxConcurrent),
    }
}

func (c *scrapeClient) Strategy() Strategy { return StrategyScrape }

func (c *scrapeClient) GetContent(_ context.Context, _ ContentRequest) (string, error) {
    return "", fmt.Errorf("scrape client does not support /content; use NewContentClient")
}

func (c *scrapeClient) ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error) {
    if err := acquire(ctx, c.sem); err != nil {
        return ScrapeResponse{}, fmt.Errorf("scrape: semaphore: %w", err)
    }
    defer release(c.sem)

    body, err := json.Marshal(req)
    if err != nil {
        return ScrapeResponse{}, fmt.Errorf("scrape: marshal request: %w", err)
    }

    url := c.cfg.BaseURL + "/scrape"
    if c.cfg.Token != "" {
        url += "?token=" + c.cfg.Token
    }

    httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
    if err != nil {
        return ScrapeResponse{}, fmt.Errorf("scrape: build request: %w", err)
    }
    httpReq.Header.Set("Content-Type", "application/json")

    resp, err := c.http.Do(httpReq)
    if err != nil {
        return ScrapeResponse{}, fmt.Errorf("scrape: do request: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        b, _ := io.ReadAll(resp.Body)
        return ScrapeResponse{}, fmt.Errorf("scrape: unexpected status %d: %s", resp.StatusCode, b)
    }

    var result ScrapeResponse
    if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
        return ScrapeResponse{}, fmt.Errorf("scrape: decode response: %w", err)
    }
    return result, nil
}

func (c *scrapeClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
    return fmt.Errorf("scrape client does not support CDP; use NewCDPClient")
}

@@ -1,68 +0,0 @@
package browser

import (
    "context"
    "fmt"
    "io"
    "net/http"
    "time"
)

type httpClient struct {
    cfg  Config
    http *http.Client
    sem  chan struct{}
}

func NewDirectHTTPClient(cfg Config) BrowserClient {
    if cfg.Timeout == 0 {
        cfg.Timeout = 30 * time.Second
    }
    return &httpClient{
        cfg:  cfg,
        http: &http.Client{Timeout: cfg.Timeout},
        sem:  makeSem(cfg.MaxConcurrent),
    }
}

func (c *httpClient) Strategy() Strategy { return StrategyDirect }

func (c *httpClient) GetContent(ctx context.Context, req ContentRequest) (string, error) {
    if err := acquire(ctx, c.sem); err != nil {
        return "", fmt.Errorf("http: semaphore: %w", err)
    }
    defer release(c.sem)

    httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, req.URL, nil)
    if err != nil {
        return "", fmt.Errorf("http: build request: %w", err)
    }
    httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
    httpReq.Header.Set("Accept-Language", "en-US,en;q=0.5")

    resp, err := c.http.Do(httpReq)
    if err != nil {
        return "", fmt.Errorf("http: do request: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        b, _ := io.ReadAll(resp.Body)
        return "", fmt.Errorf("http: unexpected status %d: %s", resp.StatusCode, b)
    }

    raw, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", fmt.Errorf("http: read body: %w", err)
    }
    return string(raw), nil
}

func (c *httpClient) ScrapePage(_ context.Context, _ ScrapeRequest) (ScrapeResponse, error) {
    return ScrapeResponse{}, fmt.Errorf("http client does not support ScrapePage; use browserless")
}

func (c *httpClient) CDPSession(_ context.Context, _ string, _ CDPSessionFunc) error {
    return fmt.Errorf("http client does not support CDP; use browserless")
}

@@ -1,152 +0,0 @@
//go:build integration

// Integration tests for the Browserless /content API.
//
// These tests require a live Browserless instance and are gated behind the
// "integration" build tag so they never run in normal `go test ./...` passes.
//
// Run them with:
//
//    BROWSERLESS_URL=http://localhost:3030 \
//    BROWSERLESS_TOKEN=your-token \        # omit if auth is disabled
//    go test -v -tags integration -timeout 120s \
//        github.com/libnovel/scraper/internal/browser
package browser_test

import (
    "context"
    "os"
    "strings"
    "testing"
    "time"

    "github.com/libnovel/scraper/internal/browser"
)

// chapterURL is the novelfire chapter used in every integration sub-test.
const chapterURL = "https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1"

// newIntegrationClient reads BROWSERLESS_URL / BROWSERLESS_TOKEN from the
// environment and returns a configured contentClient.
// The test is skipped when BROWSERLESS_URL is not set.
func newIntegrationClient(t *testing.T) browser.BrowserClient {
    t.Helper()
    baseURL := os.Getenv("BROWSERLESS_URL")
    if baseURL == "" {
        t.Skip("BROWSERLESS_URL not set — skipping integration test")
    }
    return browser.NewContentClient(browser.Config{
        BaseURL: baseURL,
        Token:   os.Getenv("BROWSERLESS_TOKEN"),
        // Use a generous per-request HTTP timeout so the wait-for-selector
        // (75 s) doesn't get cut off by the transport layer.
        Timeout:       120 * time.Second,
        MaxConcurrent: 1,
    })
}

// TestIntegration_ChapterContent_ReturnsHTML verifies that a POST /content
// request with the production wait-for-selector settings succeeds and that the
// returned HTML contains the #content div expected on novelfire chapter pages.
func TestIntegration_ChapterContent_ReturnsHTML(t *testing.T) {
    client := newIntegrationClient(t)

    ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
    defer cancel()

    req := browser.ContentRequest{
        URL: chapterURL,
        WaitFor: &browser.WaitForSelector{
            Selector: "#content",
            Timeout:  5000,
        },
        RejectResourceTypes: productionRejectTypes(),
    }

    html, err := client.GetContent(ctx, req)
    if err != nil {
        t.Fatalf("GetContent failed: %v", err)
    }

    // The #content div must not be empty; presence of <p> tags inside it is a
    // reliable indicator that chapter paragraphs were rendered.
    contentIdx := strings.Index(html, `id="content"`)
    if contentIdx == -1 {
        t.Fatalf("id=\"content\" not found in response (%d bytes)", len(html))
    }

    // Look for <p> tags after the #content marker — the chapter text lives there.
    afterContent := html[contentIdx:]
    if !strings.Contains(afterContent, "<p") {
        t.Errorf("#content section contains no <p> tags; JS rendering may have failed.\nSection preview:\n%s",
            truncate(afterContent, 1000))
    }

    t.Logf("chapter content section starts at byte %d (total response: %d bytes)", contentIdx, len(html))
}

// TestIntegration_ChapterContent_TimeoutSurfacedCorrectly verifies that a
// deliberately too-short timeout returns an error containing "TimeoutError" (the
// Browserless error string seen in the failing log entry). This ensures our
// error-classification logic in retryGetContent matches real Browserless output.
func TestIntegration_ChapterContent_TimeoutSurfacedCorrectly(t *testing.T) {
    client := newIntegrationClient(t)

    ctx, cancel := context.WithTimeout(context.Background(), 40*time.Second)
    defer cancel()

    req := browser.ContentRequest{
        URL: chapterURL,
        WaitFor: &browser.WaitForSelector{
            Selector: "#content",
            Timeout:  500, // intentionally too short (500 ms) → Browserless will time out
        },
        RejectResourceTypes: productionRejectTypes(),
    }

    _, err := client.GetContent(ctx, req)
    if err == nil {
        t.Fatal("expected a timeout error from Browserless, but GetContent succeeded — " +
            "the page may now load very fast; adjust the timeout threshold")
    }

    t.Logf("got expected error: %v", err)

    // Browserless wraps navigation timeouts in a 500 response with
    // "TimeoutError: Navigation timeout" in the body — this is the exact
    // error that is triggering retries in production.
    if !strings.Contains(err.Error(), "500") {
        t.Errorf("expected HTTP 500 status in error, got: %v", err)
    }
}

// ── helpers ───────────────────────────────────────────────────────────────────

// productionRejectTypes returns the same resource-type block-list the
// novelfire scraper uses in production, so integration tests exercise the
// identical request shape.
func productionRejectTypes() []string {
    return []string{
        "cspviolationreport",
        "eventsource",
        "fedcm",
        "font",
        "image",
        "manifest",
        "media",
        "other",
        "ping",
        "signedexchange",
        "stylesheet",
        "texttrack",
        "websocket",
    }
}

// truncate returns the first n bytes of s as a string.
func truncate(s string, n int) string {
    if len(s) <= n {
        return s
    }
    return s[:n] + "…"
}

@@ -1,120 +0,0 @@
// Package browser defines the BrowserClient interface and helper types for
// communicating with a Browserless instance.
package browser

import "context"

// Strategy selects which Browserless API endpoint / protocol to use.
type Strategy string

const (
    // StrategyContent uses the POST /content endpoint, which returns the final
    // rendered HTML of the page. Fastest; suitable for most JS-rendered sites.
    StrategyContent Strategy = "content"

    // StrategyScrape uses the POST /scrape endpoint, which accepts a list of
    // CSS selectors and returns structured JSON. Good when you know exactly
    // which elements you need.
    StrategyScrape Strategy = "scrape"

    // StrategyCDP uses the WebSocket /devtools/browser endpoint (Chrome
    // DevTools Protocol). Most powerful; required for complex interactions
    // (clicking, scrolling, waiting for network idle, etc.).
    StrategyCDP Strategy = "cdp"

    // StrategyDirect uses a plain HTTP client to fetch HTML directly.
    // Suitable for sites that don't require JavaScript rendering.
    StrategyDirect Strategy = "direct"
)

// WaitForSelector describes the waitForSelector option sent to Browserless.
type WaitForSelector struct {
    Selector string `json:"selector"`
    Timeout  int    `json:"timeout,omitempty"` // ms
}

// GotoOptions controls page navigation behavior.
type GotoOptions struct {
    Timeout   int    `json:"timeout,omitempty"`   // ms
    WaitUntil string `json:"waitUntil,omitempty"` // e.g., "networkidle2", "load"
}

// ContentRequest is the body sent to POST /content.
type ContentRequest struct {
    URL                 string           `json:"url"`
    WaitFor             *WaitForSelector `json:"waitForSelector,omitempty"`
    WaitForTimeout      int              `json:"waitForTimeout,omitempty"` // ms
    RejectResourceTypes []string         `json:"rejectResourceTypes,omitempty"` // e.g. ["image","stylesheet"]
    GotoOptions         *GotoOptions     `json:"gotoOptions,omitempty"`
    BestAttempt         bool             `json:"bestAttempt,omitempty"` // return partial content on timeout/error
}

// ScrapeElement is one element descriptor inside a ScrapeRequest.
type ScrapeElement struct {
    Selector string `json:"selector"`
    Timeout  int    `json:"timeout,omitempty"` // ms
}

// ScrapeRequest is the body sent to POST /scrape.
type ScrapeRequest struct {
    URL         string           `json:"url"`
    Elements    []ScrapeElement  `json:"elements"`
    WaitFor     *WaitForSelector `json:"waitForSelector,omitempty"`
    GotoOptions *GotoOptions     `json:"gotoOptions,omitempty"`
}

// ScrapeResult is one entry in the response from POST /scrape.
type ScrapeResult struct {
    Selector string           `json:"selector"`
    Results  []ScrapedElement `json:"results"`
}

// ScrapeAttribute holds a single attribute value from a scraped element.
type ScrapeAttribute struct {
    Name  string `json:"name"`
    Value string `json:"value"`
}

// ScrapedElement is one item inside ScrapeResult.Results.
type ScrapedElement struct {
    Text       string            `json:"text"`
    Attributes []ScrapeAttribute `json:"attributes"`
}

// ScrapeResponse is the top-level response from POST /scrape.
type ScrapeResponse struct {
    Data []ScrapeResult `json:"data"`
}

// BrowserClient is an abstraction over the four fetch strategies.
// Callers choose the strategy best suited to the target site; the interface
// signature is identical regardless of strategy.
type BrowserClient interface {
    // Strategy returns the strategy this client uses.
    Strategy() Strategy

    // GetContent fetches the fully-rendered HTML of url using the /content
    // endpoint. Only meaningful when Strategy() == StrategyContent.
    GetContent(ctx context.Context, req ContentRequest) (string, error)

    // ScrapePage calls the /scrape endpoint and returns structured data.
    // Only meaningful when Strategy() == StrategyScrape.
    ScrapePage(ctx context.Context, req ScrapeRequest) (ScrapeResponse, error)

    // CDPSession opens a CDP WebSocket session and calls fn with the raw
    // WebSocket connection. Only meaningful when Strategy() == StrategyCDP.
    // The session is closed when fn returns.
    CDPSession(ctx context.Context, pageURL string, fn CDPSessionFunc) error
}

// CDPSessionFunc is the callback invoked inside a CDP session.
// conn is a live CDPConn backed by a *websocket.Conn connected to a
// Browserless CDP endpoint.
type CDPSessionFunc func(ctx context.Context, conn CDPConn) error

// CDPConn is the minimal interface the orchestrator needs over a CDP WebSocket.
type CDPConn interface {
    // Send sends a raw CDP command (JSON-encoded) and returns the response.
    Send(ctx context.Context, method string, params map[string]any) (map[string]any, error)
    // Close closes the underlying connection.
    Close() error
}

@@ -1,344 +0,0 @@
//go:build integration

// Integration tests for the novelfire.net Scraper against a live Browserless instance.
//
// These tests exercise the full scraping stack — Browserless → raw HTML →
// novelfire HTML parser — for the book:
//
//    https://novelfire.net/book/a-dragon-against-the-whole-world
//
// They are gated behind the "integration" build tag so they never run in a
// normal `go test ./...` pass.
//
// Run with:
//
//    BROWSERLESS_URL=http://localhost:3030 \
//    BROWSERLESS_TOKEN=your-token \        # omit if auth is disabled
//    go test -v -tags integration -timeout 600s \
//        github.com/libnovel/scraper/internal/novelfire
package novelfire

import (
    "context"
    "fmt"
    "os"
    "strings"
    "testing"
    "time"

    "github.com/libnovel/scraper/internal/browser"
    "github.com/libnovel/scraper/internal/scraper"
)

const (
    integrationBookURL   = "https://novelfire.net/book/a-dragon-against-the-whole-world"
    integrationBookSlug  = "a-dragon-against-the-whole-world"
    integrationBookTitle = "A Dragon against the Whole World"
)

// newIntegrationScraper reads BROWSERLESS_URL / BROWSERLESS_TOKEN from the
// environment, constructs a real contentClient, and returns a novelfire Scraper
// wired to it. The test is skipped when BROWSERLESS_URL is not set.
func newIntegrationScraper(t *testing.T) *Scraper {
    t.Helper()
    baseURL := os.Getenv("BROWSERLESS_URL")
    if baseURL == "" {
        t.Skip("BROWSERLESS_URL not set — skipping integration test")
    }
    client := browser.NewContentClient(browser.Config{
        BaseURL:       baseURL,
        Token:         os.Getenv("BROWSERLESS_TOKEN"),
        Timeout:       120 * time.Second,
        MaxConcurrent: 1,
    })
    return New(client, nil)
}

// ── Metadata ──────────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle verifies that
// ScrapeMetadata fetches the book page and correctly parses at minimum
// the slug, title, and source URL.
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsTitle(t *testing.T) {
    s := newIntegrationScraper(t)

    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()

    meta, err := s.ScrapeMetadata(ctx, integrationBookURL)
    if err != nil {
        t.Fatalf("ScrapeMetadata failed: %v", err)
    }

    t.Logf("slug: %s", meta.Slug)
    t.Logf("title: %s", meta.Title)
    t.Logf("author: %s", meta.Author)
    t.Logf("status: %s", meta.Status)
    t.Logf("genres: %v", meta.Genres)
    t.Logf("total_chapters: %d", meta.TotalChapters)
    t.Logf("source_url: %s", meta.SourceURL)

    if meta.Slug != integrationBookSlug {
        t.Errorf("slug = %q, want %q", meta.Slug, integrationBookSlug)
    }
    if meta.Title == "" {
        t.Error("title is empty")
    }
    if !strings.EqualFold(meta.Title, integrationBookTitle) {
        // Warn rather than hard-fail — the site may reword the title.
        t.Logf("WARN: title = %q, expected something like %q", meta.Title, integrationBookTitle)
    }
    if meta.SourceURL != integrationBookURL {
        t.Errorf("source_url = %q, want %q", meta.SourceURL, integrationBookURL)
    }
}

// TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields verifies that
// every optional field (author, status, genres, summary, total_chapters) is
// populated. Each empty field is reported via t.Errorf so a single run lists
// every selector that may have broken when the site changed its HTML structure.
func TestIntegration_Novelfire_ScrapeMetadata_ReturnsFullFields(t *testing.T) {
    s := newIntegrationScraper(t)

    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()

    meta, err := s.ScrapeMetadata(ctx, integrationBookURL)
    if err != nil {
        t.Fatalf("ScrapeMetadata failed: %v", err)
    }

    type check struct {
        field string
        empty bool
    }
    checks := []check{
        {"author", meta.Author == ""},
        {"status", meta.Status == ""},
        {"summary", meta.Summary == ""},
        {"genres", len(meta.Genres) == 0},
        {"total_chapters", meta.TotalChapters == 0},
    }
    for _, c := range checks {
        if c.empty {
            t.Errorf("field %q is empty — HTML selector may have broken", c.field)
        }
    }

    // total_chapters must be a positive integer.
    if meta.TotalChapters < 1 {
        t.Errorf("total_chapters = %d, want >= 1", meta.TotalChapters)
    }
}

// ── Chapter list ──────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs verifies that
// ScrapeChapterList returns a non-empty slice of chapter references with
// valid URLs and numbers parsed from those URLs (not list position).
func TestIntegration_Novelfire_ScrapeChapterList_ReturnsRefs(t *testing.T) {
    s := newIntegrationScraper(t)

    ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
    defer cancel()

    refs, err := s.ScrapeChapterList(ctx, integrationBookURL)
    if err != nil {
        t.Fatalf("ScrapeChapterList failed: %v", err)
    }

    t.Logf("total refs returned: %d", len(refs))

    if len(refs) == 0 {
        t.Fatal("ScrapeChapterList returned 0 refs")
    }

    // Every ref must have a non-empty URL pointing at the correct book.
    for i, ref := range refs {
        if ref.URL == "" {
            t.Errorf("refs[%d].URL is empty", i)
        }
        if !strings.Contains(ref.URL, integrationBookSlug) {
            t.Errorf("refs[%d].URL %q does not contain book slug", i, ref.URL)
        }
        if ref.Number <= 0 {
            t.Errorf("refs[%d].Number = %d, want > 0 (URL: %s)", i, ref.Number, ref.URL)
        }
        if ref.Title == "" {
            t.Errorf("refs[%d].Title is empty (URL: %s)", i, ref.URL)
        }
    }
}

// TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs verifies the
// fix for the newest-first ordering bug: each ref's Number must equal the
// chapter number embedded in its URL, not its position in the list.
func TestIntegration_Novelfire_ScrapeChapterList_NumbersMatchURLs(t *testing.T) {
    s := newIntegrationScraper(t)

    ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
    defer cancel()

    refs, err := s.ScrapeChapterList(ctx, integrationBookURL)
    if err != nil {
        t.Fatalf("ScrapeChapterList failed: %v", err)
    }
    if len(refs) == 0 {
        t.Fatal("ScrapeChapterList returned 0 refs")
    }

    mismatches := 0
    for i, ref := range refs {
        wantNum := chapterNumberFromURL(ref.URL)
        if wantNum <= 0 {
            // URL has no parseable number — skip this entry.
            continue
        }
        if ref.Number != wantNum {
            t.Errorf("refs[%d]: Number=%d but URL %q implies number=%d (position-based bug?)",
                i, ref.Number, ref.URL, wantNum)
            mismatches++
            if mismatches >= 5 {
                t.Log("… (further mismatches suppressed)")
                break
            }
        }
    }

    // Log the first few refs so failures are easy to diagnose.
    limit := 5
    if len(refs) < limit {
        limit = len(refs)
    }
    for i := 0; i < limit; i++ {
        t.Logf("refs[%d]: Number=%d Title=%q URL=%s", i, refs[i].Number, refs[i].Title, refs[i].URL)
    }
}

// ── Chapters ──────────────────────────────────────────────────────────────────

// TestIntegration_Novelfire_ScrapeFirst3Chapters scrapes chapters 1, 2, and 3
// via ScrapeChapterText and verifies each returns non-empty markdown text.
// Chapters are run as sub-tests so a single failure does not abort the others.
func TestIntegration_Novelfire_ScrapeFirst3Chapters(t *testing.T) {
    s := newIntegrationScraper(t)

    chapters := []scraper.ChapterRef{
        {
            Number: 1,
            Title:  "Chapter 1",
            URL:    integrationBookURL + "/chapter-1",
        },
        {
            Number: 2,
            Title:  "Chapter 2",
            URL:    integrationBookURL + "/chapter-2",
        },
        {
            Number: 3,
            Title:  "Chapter 3",
            URL:    integrationBookURL + "/chapter-3",
        },
    }

    for _, ref := range chapters {
        ref := ref // capture
        t.Run(fmt.Sprintf("chapter-%d", ref.Number), func(t *testing.T) {
            // Sequential: each chapter needs its own generous timeout.
            ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
            defer cancel()

            ch, err := s.ScrapeChapterText(ctx, ref)
            if err != nil {
                t.Fatalf("ScrapeChapterText failed: %v", err)
            }

            t.Logf("chapter %d: %d bytes of markdown", ref.Number, len(ch.Text))
            t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))

            // Ref fields must be echoed back unchanged.
            if ch.Ref.Number != ref.Number {
                t.Errorf("Ref.Number = %d, want %d", ch.Ref.Number, ref.Number)
            }
            if ch.Ref.URL != ref.URL {
                t.Errorf("Ref.URL = %q, want %q", ch.Ref.URL, ref.URL)
            }

            // Text must be non-trivially long.
            if len(ch.Text) < 100 {
                t.Errorf("Text too short (%d bytes) — likely empty or parsing failed:\n%s",
                    len(ch.Text), ch.Text)
            }

            // Text must not contain raw HTML tags — NodeToMarkdown should have
            // stripped them.
            for _, tag := range []string{"<div", "<span", "<script", "<style"} {
                if strings.Contains(ch.Text, tag) {
                    t.Errorf("Text contains raw HTML tag %q — markdown conversion may be broken", tag)
                }
            }
        })
    }
}

// TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList is the end-to-end
// variant: it first calls ScrapeChapterList to get the real refs (with
// URL-derived numbers), then scrapes chapters 1–3 using those refs.
// This catches any discrepancy between the list and the chapter URLs.
func TestIntegration_Novelfire_ScrapeFirst3Chapters_FromList(t *testing.T) {
    s := newIntegrationScraper(t)

    // Step 1: fetch the chapter list.
    listCtx, listCancel := context.WithTimeout(context.Background(), 60*time.Second)
    defer listCancel()

    refs, err := s.ScrapeChapterList(listCtx, integrationBookURL)
    if err != nil {
        t.Fatalf("ScrapeChapterList failed: %v", err)
    }
    if len(refs) == 0 {
        t.Fatal("ScrapeChapterList returned 0 refs")
    }

    // Build a map number→ref for fast lookup.
    byNumber := make(map[int]scraper.ChapterRef, len(refs))
    for _, r := range refs {
        byNumber[r.Number] = r
    }

    // Step 2: scrape chapters 1, 2, 3.
    for _, wantNum := range []int{1, 2, 3} {
        wantNum := wantNum
        ref, ok := byNumber[wantNum]
        if !ok {
            t.Errorf("chapter %d not found in chapter list (list has %d entries)", wantNum, len(refs))
|
||||
continue
|
||||
}
|
||||
|
||||
t.Run(fmt.Sprintf("chapter-%d", wantNum), func(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 110*time.Second)
|
||||
defer cancel()
|
||||
|
||||
ch, err := s.ScrapeChapterText(ctx, ref)
|
||||
if err != nil {
|
||||
t.Fatalf("ScrapeChapterText(chapter %d, %s) failed: %v", wantNum, ref.URL, err)
|
||||
}
|
||||
|
||||
t.Logf("chapter %d (%q): %d bytes", wantNum, ref.Title, len(ch.Text))
|
||||
t.Logf("first 300 chars:\n%s", truncateStr(ch.Text, 300))
|
||||
|
||||
if len(ch.Text) < 100 {
|
||||
t.Errorf("chapter %d text too short (%d bytes)", wantNum, len(ch.Text))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
// truncateStr returns s truncated to at most n bytes, with an ellipsis
// appended when truncation occurs. The cut is byte-based, so a multi-byte
// rune at the boundary may be split; that is acceptable for test log previews.
func truncateStr(s string, n int) string {
	if len(s) <= n {
		return s
	}
	return s[:n] + "…"
}
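
// truncateStrRunes is an illustrative rune-safe alternative, added here only
// as a sketch (no test above uses it): it cuts on rune boundaries, so
// multi-byte characters are never split mid-sequence.
func truncateStrRunes(s string, n int) string {
	r := []rune(s)
	if len(r) <= n {
		return s
	}
	return string(r[:n]) + "…"
}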
@@ -1,296 +0,0 @@
package novelfire

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"testing"

	"github.com/libnovel/scraper/internal/browser"
	"github.com/libnovel/scraper/internal/scraper"
	"github.com/libnovel/scraper/internal/writer"
)

// rankingPage1HTML is a realistic mock of the popular genre listing page
// (novelfire.net/genre-all/sort-popular/status-all/all-novel?page=1).
// It uses the real novelfire.net DOM: <li class="novel-item"> cards with
// <h4 class="novel-title"> and a rel="next" pagination link.
func rankingPage1HTML() string {
	return `<!DOCTYPE html>
<html><body>
<ul class="list-novel">
  <li class="novel-item">
    <a title="The Iron Throne" href="/book/the-iron-throne">
      <figure class="novel-cover"><img class="lazy" src="data:image/gif;base64,R0lG" data-src="/covers/iron-throne.jpg" alt="The Iron Throne"></figure>
      <h4 class="novel-title text2row">The Iron Throne</h4>
    </a>
    <div class="novel-stats"><i class="icon-book-open"></i> 500 Chapters</div>
  </li>
  <li class="novel-item">
    <a title="Shadow Mage" href="/book/shadow-mage">
      <figure class="novel-cover"><img class="lazy" src="data:image/gif;base64,R0lG" data-src="/covers/shadow-mage.jpg" alt="Shadow Mage"></figure>
      <h4 class="novel-title text2row">Shadow Mage</h4>
    </a>
    <div class="novel-stats"><i class="icon-book-open"></i> 200 Chapters</div>
  </li>
</ul>
<ul class="pagination">
  <li class="page-item active"><span class="page-link">1</span></li>
  <li class="page-item"><a class="page-link" href="/genre-all/sort-popular/status-all/all-novel?page=2" rel="next" aria-label="Next">›</a></li>
</ul>
</body></html>`
}

func rankingPage2HTML() string {
	return `<!DOCTYPE html>
<html><body>
<ul class="list-novel">
  <li class="novel-item">
    <a title="Void Hunter" href="/book/void-hunter">
      <figure class="novel-cover"><img class="lazy" src="data:image/gif;base64,R0lG" data-src="/covers/void-hunter.jpg" alt="Void Hunter"></figure>
      <h4 class="novel-title text2row">Void Hunter</h4>
    </a>
    <div class="novel-stats"><i class="icon-book-open"></i> 100 Chapters</div>
  </li>
</ul>
<!-- no rel="next" link → last page -->
<ul class="pagination">
  <li class="page-item"><a class="page-link" href="?page=1" rel="prev">‹</a></li>
  <li class="page-item active"><span class="page-link">2</span></li>
</ul>
</body></html>`
}

// drainRanking collects all entries from a ScrapeRanking call without
// deadlocking. It uses "for A != nil || B != nil" — nil channels are never
// selected, so setting one to nil effectively removes it from the select.
func drainRanking(t *testing.T, entryCh <-chan scraper.BookMeta, errCh <-chan error) []scraper.BookMeta {
	t.Helper()
	var entries []scraper.BookMeta
	for entryCh != nil || errCh != nil {
		select {
		case meta, ok := <-entryCh:
			if !ok {
				entryCh = nil
			} else {
				entries = append(entries, meta)
			}
		case err, ok := <-errCh:
			if !ok {
				errCh = nil
			} else if err != nil {
				t.Fatalf("unexpected scrape error: %v", err)
			}
		}
	}
	return entries
}
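
// drainTwoIntChannels is an illustrative sketch of the same nil-channel idiom
// in its minimal form (added for clarity; no test uses it): receiving from a
// nil channel blocks forever, so a channel set to nil can never be chosen by
// the select again, and the loop exits once both inputs are closed.
func drainTwoIntChannels(a, b <-chan int) []int {
	var got []int
	for a != nil || b != nil {
		select {
		case v, ok := <-a:
			if !ok {
				a = nil // closed: remove from the select
				continue
			}
			got = append(got, v)
		case v, ok := <-b:
			if !ok {
				b = nil // closed: remove from the select
				continue
			}
			got = append(got, v)
		}
	}
	return got
}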

// TestScrapeRanking_SinglePage verifies a single page is parsed into entries
// with sequential Ranking numbers using a stub client.
// ScrapeRanking uses s.client (the main client, not urlClient) because the
// ranking page is fully server-rendered.
func TestScrapeRanking_SinglePage(t *testing.T) {
	// newScraper passes the stub as s.client — exactly what ScrapeRanking uses.
	s := newScraper(rankingPage1HTML())
	entryCh, errCh := s.ScrapeRanking(context.Background(), 1)
	entries := drainRanking(t, entryCh, errCh)

	if len(entries) != 2 {
		t.Fatalf("expected 2 entries, got %d", len(entries))
	}
	if entries[0].Ranking != 1 || entries[0].Title != "The Iron Throne" {
		t.Errorf("entry[0]: got rank=%d title=%q, want rank=1 title=%q",
			entries[0].Ranking, entries[0].Title, "The Iron Throne")
	}
	if entries[1].Ranking != 2 || entries[1].Title != "Shadow Mage" {
		t.Errorf("entry[1]: got rank=%d title=%q, want rank=2 title=%q",
			entries[1].Ranking, entries[1].Title, "Shadow Mage")
	}
}

// TestScrapeRanking_MultiPage verifies pagination across two pages yields
// contiguous rank numbers (1, 2, 3).
func TestScrapeRanking_MultiPage(t *testing.T) {
	// Use pagedStubClient as s.client so each GetContent call returns the
	// next page; ScrapeRanking calls s.client directly.
	client := &pagedStubClient{pages: []string{rankingPage1HTML(), rankingPage2HTML()}}
	s := New(client, nil, nil, nil) // nil cache — no disk I/O in tests

	entryCh, errCh := s.ScrapeRanking(context.Background(), 0) // 0 = all pages
	entries := drainRanking(t, entryCh, errCh)

	if len(entries) != 3 {
		t.Fatalf("expected 3 entries across 2 pages, got %d", len(entries))
	}
	want := []struct {
		rank  int
		title string
	}{
		{1, "The Iron Throne"},
		{2, "Shadow Mage"},
		{3, "Void Hunter"},
	}
	for i, w := range want {
		if entries[i].Ranking != w.rank || entries[i].Title != w.title {
			t.Errorf("entry[%d]: got rank=%d title=%q, want rank=%d title=%q",
				i, entries[i].Ranking, entries[i].Title, w.rank, w.title)
		}
	}
}

// TestScrapeRanking_EmptyPage verifies that a page with no .novel-item
// cards produces zero entries and closes channels cleanly (no deadlock).
func TestScrapeRanking_EmptyPage(t *testing.T) {
	s := newScraper(`<!DOCTYPE html><html><body><div class="no-rankings"></div></body></html>`)
	entryCh, errCh := s.ScrapeRanking(context.Background(), 1)
	entries := drainRanking(t, entryCh, errCh)

	if len(entries) != 0 {
		t.Errorf("expected 0 entries for empty page, got %d", len(entries))
	}
}

// TestWriteRanking_RoundTrip verifies WriteRanking → ReadRankingItems
// faithfully reconstructs the original slice.
func TestWriteRanking_RoundTrip(t *testing.T) {
	dir := t.TempDir()
	w := writer.New(dir)

	items := []writer.RankingItem{
		{Rank: 1, Slug: "the-iron-throne", Title: "The Iron Throne", Status: "Ongoing",
			Genres: []string{"Fantasy", "Action"}, SourceURL: "https://novelfire.net/book/the-iron-throne"},
		{Rank: 2, Slug: "shadow-mage", Title: "Shadow Mage", Status: "Completed",
			Genres: []string{"Magic"}, SourceURL: "https://novelfire.net/book/shadow-mage"},
	}

	if err := w.WriteRanking(items); err != nil {
		t.Fatalf("WriteRanking failed: %v", err)
	}

	rankingFile := filepath.Join(dir, "ranking.json")
	if _, err := os.Stat(rankingFile); err != nil {
		t.Fatalf("ranking.json not created: %v", err)
	}

	got, err := w.ReadRankingItems()
	if err != nil {
		t.Fatalf("ReadRankingItems failed: %v", err)
	}
	if len(got) != len(items) {
		t.Fatalf("expected %d items, got %d", len(items), len(got))
	}
	for i, want := range items {
		if got[i].Rank != want.Rank {
			t.Errorf("item[%d].Rank = %d, want %d", i, got[i].Rank, want.Rank)
		}
		if got[i].Slug != want.Slug {
			t.Errorf("item[%d].Slug = %q, want %q", i, got[i].Slug, want.Slug)
		}
		if got[i].Title != want.Title {
			t.Errorf("item[%d].Title = %q, want %q", i, got[i].Title, want.Title)
		}
		if got[i].Status != want.Status {
			t.Errorf("item[%d].Status = %q, want %q", i, got[i].Status, want.Status)
		}
		if len(got[i].Genres) != len(want.Genres) {
			t.Errorf("item[%d].Genres len = %d, want %d", i, len(got[i].Genres), len(want.Genres))
		} else {
			for j, g := range want.Genres {
				if got[i].Genres[j] != g {
					t.Errorf("item[%d].Genres[%d] = %q, want %q", i, j, got[i].Genres[j], g)
				}
			}
		}
		if got[i].SourceURL != want.SourceURL {
			t.Errorf("item[%d].SourceURL = %q, want %q", i, got[i].SourceURL, want.SourceURL)
		}
	}
}

// ── in-memory page cacher ─────────────────────────────────────────────────────

// memPageCacher is a RankingPageCacher backed by an in-memory map.
// It records how many times each page was written and exposes the stored HTML.
type memPageCacher struct {
	pages  map[int]string
	writes map[int]int
}

func newMemPageCacher() *memPageCacher {
	return &memPageCacher{pages: make(map[int]string), writes: make(map[int]int)}
}

func (c *memPageCacher) WriteRankingPageCache(page int, html string) error {
	c.pages[page] = html
	c.writes[page]++
	return nil
}

func (c *memPageCacher) ReadRankingPageCache(page int) (string, error) {
	return c.pages[page], nil // returns "" on miss, satisfying the interface contract
}

var _ scraper.RankingPageCacher = (*memPageCacher)(nil) // compile-time check

// TestScrapeRanking_CacheHit verifies that when a page is already in the cache
// ScrapeRanking serves from cache and does NOT call the browser client.
func TestScrapeRanking_CacheHit(t *testing.T) {
	cache := newMemPageCacher()
	// Pre-populate the cache with page 1 HTML.
	if err := cache.WriteRankingPageCache(1, rankingPage1HTML()); err != nil {
		t.Fatalf("cache write: %v", err)
	}
	cache.writes[1] = 0 // reset write counter — we only care about fetch-triggered writes

	// The stub client panics on any GetContent call so we can prove it is not used.
	panicClient := &panicOnGetContent{}
	s := New(panicClient, nil, panicClient, cache)

	entryCh, errCh := s.ScrapeRanking(context.Background(), 1)
	entries := drainRanking(t, entryCh, errCh)

	if len(entries) != 2 {
		t.Fatalf("expected 2 entries from cache, got %d", len(entries))
	}
	// Cache should not have been written again (we served from cache).
	if cache.writes[1] != 0 {
		t.Errorf("expected 0 cache writes on a hit, got %d", cache.writes[1])
	}
}

// TestScrapeRanking_CacheMiss verifies that on a cache miss the page is fetched
// from the network and the result is written to the cache.
func TestScrapeRanking_CacheMiss(t *testing.T) {
	cache := newMemPageCacher() // empty cache
	s := New(&stubClient{html: rankingPage1HTML()}, nil, nil, cache)

	entryCh, errCh := s.ScrapeRanking(context.Background(), 1)
	entries := drainRanking(t, entryCh, errCh)

	if len(entries) != 2 {
		t.Fatalf("expected 2 entries, got %d", len(entries))
	}
	if cache.writes[1] != 1 {
		t.Errorf("expected 1 cache write on a miss, got %d", cache.writes[1])
	}
	if cache.pages[1] == "" {
		t.Error("expected page 1 to be stored in cache after miss")
	}
}

// panicOnGetContent is a BrowserClient whose GetContent panics, letting tests
// assert that it is never called (i.e. the cache was used instead).
type panicOnGetContent struct{}

func (p *panicOnGetContent) Strategy() browser.Strategy { return browser.StrategyContent }
func (p *panicOnGetContent) GetContent(_ context.Context, req browser.ContentRequest) (string, error) {
	panic(fmt.Sprintf("unexpected GetContent call for URL %s — should have been served from cache", req.URL))
}
func (p *panicOnGetContent) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) {
	return browser.ScrapeResponse{}, nil
}
func (p *panicOnGetContent) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error {
	return nil
}
@@ -1,712 +0,0 @@
// Package novelfire provides a NovelScraper implementation for novelfire.net.
//
// Site structure (as of 2025):
//
//	Catalogue : https://novelfire.net/genre-all/sort-new/status-all/all-novel?page=N
//	Book page : https://novelfire.net/book/{slug}
//	Chapters  : https://novelfire.net/book/{slug}/chapters?page=N
//	Chapter   : https://novelfire.net/book/{slug}/{chapter-slug}
package novelfire

import (
	"context"
	"fmt"
	"log/slog"
	"net/url"
	"path"
	"strconv"
	"strings"
	"time"

	"github.com/libnovel/scraper/internal/browser"
	"github.com/libnovel/scraper/internal/scraper"
	"github.com/libnovel/scraper/internal/scraper/htmlutil"
	"golang.org/x/net/html"
)

const (
	baseURL       = "https://novelfire.net"
	cataloguePath = "/genre-all/sort-new/status-all/all-novel"
	rankingPath   = "/genre-all/sort-popular/status-all/all-novel"
)

// rejectResourceTypes lists Browserless resource types to block on every request.
// We keep: document (the page), script (JS renders the DOM), fetch/xhr (JS data calls).
// Everything else is safe to drop for HTML-only scraping.
var rejectResourceTypes = []string{
	"cspviolationreport",
	"eventsource",
	"fedcm",
	"font",
	"image",
	"manifest",
	"media",
	"other",
	"ping",
	"signedexchange",
	"stylesheet",
	"texttrack",
	"websocket",
}

// Scraper is the novelfire.net implementation of scraper.NovelScraper.
// It uses the /content strategy by default (rendered HTML via Browserless).
type Scraper struct {
	client    browser.BrowserClient
	urlClient browser.BrowserClient // separate client for URL retrieval (uses browserless content strategy)
	pageCache scraper.RankingPageCacher
	log       *slog.Logger
}

// New returns a new novelfire Scraper.
// client is used for content fetching, urlClient is used for URL retrieval (chapter list).
// If urlClient is nil, client will be used for both.
// pageCache is optional; pass nil to disable ranking page caching.
func New(client browser.BrowserClient, log *slog.Logger, urlClient browser.BrowserClient, pageCache scraper.RankingPageCacher) *Scraper {
	if log == nil {
		log = slog.Default()
	}
	if urlClient == nil {
		urlClient = client
	}
	return &Scraper{client: client, urlClient: urlClient, pageCache: pageCache, log: log}
}
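
// Illustrative wiring (a sketch; the client constructor named here is
// hypothetical, not part of this package):
//
//	contentClient := newBrowserlessContentClient(...) // fetches rendered HTML
//	s := New(contentClient, slog.Default(), nil, diskCache)
//
// Passing nil for urlClient makes the Scraper reuse contentClient for the
// chapter-list fetches as well.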

// SourceName implements NovelScraper.
func (s *Scraper) SourceName() string { return "novelfire.net" }

// ─── CatalogueProvider ───────────────────────────────────────────────────────

// ScrapeCatalogue streams all CatalogueEntry values across all pages.
func (s *Scraper) ScrapeCatalogue(ctx context.Context) (<-chan scraper.CatalogueEntry, <-chan error) {
	entries := make(chan scraper.CatalogueEntry, 64)
	errs := make(chan error, 16)

	go func() {
		defer close(entries)
		defer close(errs)

		pageURL := baseURL + cataloguePath
		page := 1

		for pageURL != "" {
			select {
			case <-ctx.Done():
				return
			default:
			}

			s.log.Info("scraping catalogue page", "page", page, "url", pageURL)
			s.log.Debug("catalogue page fetch starting",
				"page", page,
				"payload_url", pageURL,
				"payload_wait_selector", ".novel-item",
				"payload_wait_selector_timeout_ms", 5000,
			)

			// Named raw rather than html so it does not shadow the imported
			// golang.org/x/net/html package.
			raw, err := s.client.GetContent(ctx, browser.ContentRequest{
				URL:                 pageURL,
				WaitFor:             &browser.WaitForSelector{Selector: ".novel-item", Timeout: 5000},
				RejectResourceTypes: rejectResourceTypes,
				GotoOptions:         &browser.GotoOptions{Timeout: 60000},
			})
			if err != nil {
				s.log.Debug("catalogue page fetch failed",
					"page", page,
					"url", pageURL,
					"err", err,
				)
				errs <- fmt.Errorf("catalogue page %d: %w", page, err)
				return
			}
			s.log.Debug("catalogue page fetch completed",
				"page", page,
				"url", pageURL,
				"response_bytes", len(raw),
			)

			root, err := htmlutil.ParseHTML(raw)
			if err != nil {
				errs <- fmt.Errorf("catalogue page %d parse: %w", page, err)
				return
			}

			// Extract novel cards: <div class="novel-item">
			cards := htmlutil.FindAll(root, scraper.Selector{Tag: "div", Class: "novel-item", Multiple: true})
			if len(cards) == 0 {
				s.log.Warn("no novel cards found, stopping pagination", "page", page)
				return
			}

			for _, card := range cards {
				// Title: <h3 class="novel-title"><a href="/book/slug">Title</a>
				titleNode := htmlutil.FindFirst(card, scraper.Selector{Tag: "h3", Class: "novel-title"})

				var title, href string
				if titleNode != nil {
					linkNode := htmlutil.FindFirst(titleNode, scraper.Selector{Tag: "a", Attr: "href"})
					if linkNode != nil {
						title = htmlutil.ExtractText(linkNode, scraper.Selector{})
						href = htmlutil.ExtractText(linkNode, scraper.Selector{Tag: "a", Attr: "href"})
					}
				}
				if href == "" || title == "" {
					continue
				}

				bookURL := resolveURL(baseURL, href)
				select {
				case <-ctx.Done():
					return
				case entries <- scraper.CatalogueEntry{Title: title, URL: bookURL}:
				}
			}

			// Find next page link: <a class="next" href="...">
			nextHref := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "a", Class: "next", Attr: "href"})
			if nextHref == "" {
				break
			}
			pageURL = resolveURL(baseURL, nextHref)
			page++
		}
	}()

	return entries, errs
}

// ─── MetadataProvider ────────────────────────────────────────────────────────

func (s *Scraper) ScrapeMetadata(ctx context.Context, bookURL string) (scraper.BookMeta, error) {
	s.log.Debug("metadata fetch starting",
		"payload_url", bookURL,
		"payload_wait_selector", ".novel-title",
		"payload_wait_selector_timeout_ms", 5000,
	)

	raw, err := s.client.GetContent(ctx, browser.ContentRequest{
		URL:                 bookURL,
		WaitFor:             &browser.WaitForSelector{Selector: ".novel-title", Timeout: 5000},
		RejectResourceTypes: rejectResourceTypes,
		GotoOptions:         &browser.GotoOptions{Timeout: 60000},
	})
	if err != nil {
		s.log.Debug("metadata fetch failed", "url", bookURL, "err", err)
		return scraper.BookMeta{}, fmt.Errorf("metadata fetch %s: %w", bookURL, err)
	}
	s.log.Debug("metadata fetch completed", "url", bookURL, "response_bytes", len(raw))

	root, err := htmlutil.ParseHTML(raw)
	if err != nil {
		return scraper.BookMeta{}, fmt.Errorf("metadata parse %s: %w", bookURL, err)
	}

	// <h1 class="novel-title">Title</h1>
	title := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "h1", Class: "novel-title"})
	// <span class="author"><a>Author Name</a></span>
	author := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "author"})
	// <figure class="cover"><img src="..."></figure>
	var cover string
	if figureCover := htmlutil.FindFirst(root, scraper.Selector{Tag: "figure", Class: "cover"}); figureCover != nil {
		cover = htmlutil.ExtractFirst(figureCover, scraper.Selector{Tag: "img", Attr: "src"})
	}
	// <span class="status">Ongoing</span>
	status := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "status"})

	// Genres: all <a> tags inside <div class="genres">
	genresNode := htmlutil.FindFirst(root, scraper.Selector{Tag: "div", Class: "genres"})
	var genres []string
	if genresNode != nil {
		genres = htmlutil.ExtractAll(genresNode, scraper.Selector{Tag: "a", Multiple: true})
	}

	// <div class="summary"><p>...</p></div>
	summary := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "div", Class: "summary"})
	// <span class="chapter-count">123 Chapters</span>
	totalStr := htmlutil.ExtractFirst(root, scraper.Selector{Tag: "span", Class: "chapter-count"})
	totalChapters := parseChapterCount(totalStr)

	slug := slugFromURL(bookURL)

	meta := scraper.BookMeta{
		Slug:          slug,
		Title:         title,
		Author:        author,
		Cover:         cover,
		Status:        status,
		Genres:        genres,
		Summary:       summary,
		TotalChapters: totalChapters,
		SourceURL:     bookURL,
	}
	s.log.Debug("metadata parsed",
		"url", bookURL,
		"slug", meta.Slug,
		"title", meta.Title,
		"author", meta.Author,
		"status", meta.Status,
		"genres", meta.Genres,
		"total_chapters", meta.TotalChapters,
	)
	return meta, nil
}

// ─── ChapterListProvider ─────────────────────────────────────────────────────

func (s *Scraper) ScrapeChapterList(ctx context.Context, bookURL string) ([]scraper.ChapterRef, error) {
	var refs []scraper.ChapterRef
	// Chapter list URL: {bookURL}/chapters?page=N
	baseChapterURL := strings.TrimRight(bookURL, "/") + "/chapters"
	page := 1

	for {
		select {
		case <-ctx.Done():
			return refs, ctx.Err()
		default:
		}

		pageURL := fmt.Sprintf("%s?page=%d", baseChapterURL, page)
		s.log.Info("scraping chapter list", "page", page, "url", pageURL)

		s.log.Debug("chapter list fetch starting",
			"page", page,
			"payload_url", pageURL,
			"payload_wait_selector", ".chapter-list",
			"payload_wait_selector_timeout_ms", 15000,
			"payload_wait_timeout_ms", 2000,
			"strategy", s.urlClient.Strategy(),
		)

		raw, err := s.urlClient.GetContent(ctx, browser.ContentRequest{
			URL: pageURL,
			// Wait up to 15 s for the chapter list container to appear in the DOM.
			WaitFor: &browser.WaitForSelector{Selector: ".chapter-list", Timeout: 15000},
			// After the selector is found, wait an additional 2 s for any
			// deferred JS rendering (lazy-loaded links, infinite-scroll hydration).
			WaitForTimeout:      2000,
			RejectResourceTypes: rejectResourceTypes,
			GotoOptions:         &browser.GotoOptions{Timeout: 60000},
			// Do NOT use BestAttempt — we want a complete page or a clear error,
			// not silently partial HTML that looks like "no more chapters".
			BestAttempt: false,
		})
		if err != nil {
			s.log.Debug("chapter list fetch failed",
				"page", page,
				"url", pageURL,
				"err", err,
			)
			return refs, fmt.Errorf("chapter list page %d: %w", page, err)
		}
		s.log.Debug("chapter list fetch completed",
			"page", page,
			"url", pageURL,
			"response_bytes", len(raw),
		)

		root, err := htmlutil.ParseHTML(raw)
		if err != nil {
			return refs, fmt.Errorf("chapter list page %d parse: %w", page, err)
		}

		chapterList := htmlutil.FindFirst(root, scraper.Selector{Class: "chapter-list"})
		if chapterList == nil {
			// No chapter list container on this page — we've gone past the last page.
			s.log.Debug("chapter list container not found, stopping pagination", "page", page)
			break
		}

		// Each chapter row: <li class="chapter-item"><a href="...">Title</a></li>
		items := htmlutil.FindAll(chapterList, scraper.Selector{Tag: "li"})

		s.log.Debug("chapter list page parsed",
			"page", page,
			"url", pageURL,
			"chapters_on_page", len(items),
			"total_refs_so_far", len(refs),
		)

		// Zero items on this page means we've gone past the last page.
		if len(items) == 0 {
			s.log.Debug("no chapters on page, stopping pagination", "page", page)
			break
		}

		for _, item := range items {
			linkNode := htmlutil.FindFirst(item, scraper.Selector{Tag: "a"})
			if linkNode == nil {
				continue
			}
			href := htmlutil.ExtractText(linkNode, scraper.Selector{Attr: "href"})
			chTitle := htmlutil.ExtractText(linkNode, scraper.Selector{})
			if href == "" {
				continue
			}
			chURL := resolveURL(baseURL, href)
			num := chapterNumberFromURL(chURL)
			if num <= 0 {
				// Fall back to position if the URL has no parseable number.
				num = len(refs) + 1
				s.log.Warn("chapter number not parseable from URL, falling back to position",
					"url", chURL,
					"position", num,
				)
			}
			refs = append(refs, scraper.ChapterRef{
				Number: num,
				Title:  strings.TrimSpace(chTitle),
				URL:    chURL,
			})
		}

		page++
	}

	return refs, nil
}

// ─── RankingProvider ─────────────────────────────────────────────────────────

// hasNextPageLink returns true if the HTML document contains a pagination link
// with rel="next". novelfire.net uses:
//
//	<a class="page-link" href="...?page=N" rel="next" ...>
func hasNextPageLink(root *html.Node) bool {
	links := htmlutil.FindAll(root, scraper.Selector{Tag: "a", Multiple: true})
	for _, a := range links {
		for _, attr := range a.Attr {
			if attr.Key == "rel" && attr.Val == "next" {
				return true
			}
		}
	}
	return false
}

// ScrapeRanking pages through up to maxPages pages of the popular-novels genre
// listing on novelfire.net (/genre-all/sort-popular/status-all/all-novel).
// Pages are fetched one at a time, strictly sequentially.
// maxPages <= 0 means "fetch all pages until no more are found".
func (s *Scraper) ScrapeRanking(ctx context.Context, maxPages int) (<-chan scraper.BookMeta, <-chan error) {
	entries := make(chan scraper.BookMeta, 32)
	errs := make(chan error, 16)

	go func() {
		defer close(entries)
		defer close(errs)

		rank := 1

		for page := 1; maxPages <= 0 || page <= maxPages; page++ {
			select {
			case <-ctx.Done():
				return
			default:
			}

			pageURL := fmt.Sprintf("%s%s?page=%d", baseURL, rankingPath, page)

			// Try to serve from disk cache before hitting the network.
			var raw string
			if s.pageCache != nil {
				if cached, err := s.pageCache.ReadRankingPageCache(page); err != nil {
					s.log.Warn("ranking page cache read error", "page", page, "err", err)
				} else if cached != "" {
					s.log.Info("serving ranking page from cache", "page", page)
					raw = cached
				}
			}

			if raw == "" {
				s.log.Info("scraping popular ranking page", "page", page, "url", pageURL)
				fetched, err := s.client.GetContent(ctx, browser.ContentRequest{
					URL:                 pageURL,
					WaitFor:             &browser.WaitForSelector{Selector: ".novel-item", Timeout: 5000},
					RejectResourceTypes: rejectResourceTypes,
					GotoOptions:         &browser.GotoOptions{Timeout: 60000},
				})
				if err != nil {
					s.log.Debug("ranking page fetch failed", "page", page, "url", pageURL, "err", err)
					errs <- fmt.Errorf("ranking page %d: %w", page, err)
					return
				}
				raw = fetched

				// Persist to cache for future runs.
				if s.pageCache != nil {
					if werr := s.pageCache.WriteRankingPageCache(page, raw); werr != nil {
						s.log.Warn("ranking page cache write error", "page", page, "err", werr)
					}
				}
			}

			root, err := htmlutil.ParseHTML(raw)
			if err != nil {
				errs <- fmt.Errorf("ranking page %d parse: %w", page, err)
				return
			}

			// Real novelfire.net popular listing structure:
			//
			//	<li class="novel-item">
			//	  <a href="/book/slug" title="Title">
			//	    <figure class="novel-cover"><img data-src="..."></figure>
			//	    <h4 class="novel-title text2row">Title</h4>
			//	  </a>
			//	</li>
			cards := htmlutil.FindAll(root, scraper.Selector{Tag: "li", Class: "novel-item", Multiple: true})
			if len(cards) == 0 {
				s.log.Debug("no novel cards found, stopping pagination", "page", page)
				break
			}

			for _, card := range cards {
				// The outer <a> carries the href and title attribute.
				linkNode := htmlutil.FindFirst(card, scraper.Selector{Tag: "a"})
				if linkNode == nil {
					continue
				}
				href := htmlutil.ExtractText(linkNode, scraper.Selector{Tag: "a", Attr: "href"})
				bookURL := resolveURL(baseURL, href)
				if bookURL == "" {
					continue
				}

				// Title: prefer <h4 class="novel-title"> text; fall back to <a title="...">
				title := strings.TrimSpace(htmlutil.ExtractFirst(card, scraper.Selector{Tag: "h4", Class: "novel-title"}))
				if title == "" {
					title = strings.TrimSpace(htmlutil.ExtractText(linkNode, scraper.Selector{Tag: "a", Attr: "title"}))
				}
				if title == "" {
					continue
				}

				// Cover: <figure class="novel-cover"><img data-src="...">
				var cover string
				if fig := htmlutil.FindFirst(card, scraper.Selector{Tag: "figure", Class: "novel-cover"}); fig != nil {
					cover = htmlutil.ExtractFirst(fig, scraper.Selector{Tag: "img", Attr: "data-src"})
					if cover == "" {
						cover = htmlutil.ExtractFirst(fig, scraper.Selector{Tag: "img", Attr: "src"})
					}
					// Filter out base64 placeholder images.
					if strings.HasPrefix(cover, "data:") {
						cover = ""
					}
					if cover != "" && !strings.HasPrefix(cover, "http") {
						cover = baseURL + cover
					}
				}

				slug := slugFromURL(bookURL)

				meta := scraper.BookMeta{
					Slug:      slug,
					Title:     title,
					Cover:     cover,
					SourceURL: bookURL,
					Ranking:   rank,
				}
				rank++

				select {
				case <-ctx.Done():
					return
				case entries <- meta:
				}
			}

			// Stop if no next-page link exists.
			// The real pagination uses <a rel="next" ...> inside .pagination.
			if !hasNextPageLink(root) {
				s.log.Debug("no next-page link found, stopping pagination", "page", page)
				break
			}
		}
	}()

	return entries, errs
}

// ─── ChapterTextProvider ─────────────────────────────────────────────────────

// retryGetContent calls client.GetContent up to maxAttempts times, backing off
// exponentially between retries. Context cancellation aborts immediately; any
// other error (e.g. a transient Browserless timeout or 5xx response) is
// retried until the attempt budget is exhausted.
func retryGetContent(
	ctx context.Context,
	log *slog.Logger,
	client browser.BrowserClient,
	req browser.ContentRequest,
	maxAttempts int,
	baseDelay time.Duration,
) (string, error) {
	var lastErr error
	delay := baseDelay
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		raw, err := client.GetContent(ctx, req)
		if err == nil {
			return raw, nil
		}
		lastErr = err

		// Stop immediately on context cancellation.
		if ctx.Err() != nil {
			return "", err
		}

		if attempt < maxAttempts {
			log.Warn("chapter fetch failed, retrying",
				"url", req.URL,
				"attempt", attempt,
				"max_attempts", maxAttempts,
				"retry_in", delay,
				"err", err,
			)
			select {
			case <-ctx.Done():
				return "", ctx.Err()
			case <-time.After(delay):
			}
			delay *= 2
		}
	}
	return "", lastErr
}
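
// For the parameters used by ScrapeChapterText below (maxAttempts=9,
// baseDelay=6s), the worst-case cumulative backoff before giving up is
// 6+12+24+48+96+192+384+768 seconds = 6s*(2^8-1) = 1530s, roughly 25.5
// minutes of sleeping, on top of the time each GetContent attempt itself takes.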

func (s *Scraper) ScrapeChapterText(ctx context.Context, ref scraper.ChapterRef) (scraper.Chapter, error) {
	s.log.Debug("chapter text fetch starting",
		"chapter", ref.Number,
		"title", ref.Title,
		"payload_url", ref.URL,
		"payload_wait_selector", "#content",
		"payload_wait_selector_timeout_ms", 5000,
	)

	raw, err := retryGetContent(ctx, s.log, s.client, browser.ContentRequest{
		URL:                 ref.URL,
		WaitFor:             &browser.WaitForSelector{Selector: "#content", Timeout: 5000},
		RejectResourceTypes: rejectResourceTypes,
		GotoOptions:         &browser.GotoOptions{Timeout: 60000},
		BestAttempt:         true,
	}, 9, 6*time.Second)
	if err != nil {
		s.log.Debug("chapter text fetch failed",
			"chapter", ref.Number,
			"url", ref.URL,
			"err", err,
		)
		return scraper.Chapter{}, fmt.Errorf("chapter %d fetch: %w", ref.Number, err)
	}
	if len(raw) > 0 {
		preview := raw
		if len(preview) > 500 {
			preview = preview[:500]
		}
		s.log.Debug("chapter text fetch partial content",
			"chapter", ref.Number,
			"url", ref.URL,
			"response_bytes", len(raw),
			"preview", preview,
		)
	}
	s.log.Debug("chapter text fetch completed",
		"chapter", ref.Number,
		"url", ref.URL,
		"response_bytes", len(raw),
	)

	root, err := htmlutil.ParseHTML(raw)
	if err != nil {
		return scraper.Chapter{}, fmt.Errorf("chapter %d parse: %w", ref.Number, err)
	}

	// <div id="content">…</div>
	container := htmlutil.FindFirst(root, scraper.Selector{ID: "content"})
	if container == nil {
		return scraper.Chapter{}, fmt.Errorf("chapter %d: #content container not found in %s", ref.Number, ref.URL)
	}

	text := htmlutil.NodeToMarkdown(container)

	s.log.Debug("chapter text parsed",
		"chapter", ref.Number,
		"url", ref.URL,
		"text_bytes", len(text),
	)

	return scraper.Chapter{
		Ref:  ref,
		Text: text,
	}, nil
}

// ─── helpers ─────────────────────────────────────────────────────────────────

func resolveURL(base, href string) string {
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		return href
	}
	b, err := url.Parse(base)
	if err != nil {
		return base + href
	}
	ref, err := url.Parse(href)
	if err != nil {
		return base + href
	}
	return b.ResolveReference(ref).String()
}
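
// Typical resolutions, following net/url.ResolveReference semantics:
//
//	resolveURL("https://novelfire.net", "/book/shadow-mage")
//	    => "https://novelfire.net/book/shadow-mage"
//	resolveURL("https://novelfire.net", "https://other.example/x")
//	    => "https://other.example/x" (absolute URLs pass through unchanged)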

func slugFromURL(bookURL string) string {
	u, err := url.Parse(bookURL)
	if err != nil {
		return bookURL
	}
	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
	if len(parts) >= 2 && parts[0] == "book" {
		return parts[1]
	}
	if len(parts) > 0 {
		return parts[len(parts)-1]
	}
	return ""
}

func parseChapterCount(s string) int {
	// Formats: "123 Chapters", "1,234 Chapters", "123"
	s = strings.ReplaceAll(s, ",", "")
	fields := strings.Fields(s)
	if len(fields) == 0 {
		return 0
	}
	n, _ := strconv.Atoi(fields[0])
	return n
}
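
// For reference: parseChapterCount("1,234 Chapters") == 1234 and
// parseChapterCount("123") == 123; unparseable input yields 0 because the
// strconv.Atoi error is deliberately ignored.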

// chapterNumberFromURL extracts the chapter number from a novelfire chapter URL.
//
// URL pattern: https://novelfire.net/book/{book-slug}/chapter-{N}
// The last path segment is expected to be "chapter-{N}" or "{N}".
// Returns 0 if no number can be parsed.
func chapterNumberFromURL(chapterURL string) int {
	u, err := url.Parse(chapterURL)
	if err != nil {
		return 0
	}
	seg := path.Base(u.Path) // e.g. "chapter-42" or "42"
	// Strip a "chapter-" prefix if present.
	seg = strings.TrimPrefix(seg, "chapter-")
	// Also handle "chap-", "ch-" variants used by some sites.
	seg = strings.TrimPrefix(seg, "chap-")
	seg = strings.TrimPrefix(seg, "ch-")
	// Take only the leading digits (handles slugs like "42-title-text").
	digits := strings.FieldsFunc(seg, func(r rune) bool {
		return r < '0' || r > '9'
	})
	if len(digits) == 0 {
		return 0
	}
	n, _ := strconv.Atoi(digits[0])
	return n
}
@@ -1,217 +0,0 @@
package novelfire

import (
	"context"
	"strings"
	"testing"

	"github.com/libnovel/scraper/internal/browser"
	"github.com/libnovel/scraper/internal/scraper"
)

// ── stub browser client ───────────────────────────────────────────────────────

// stubClient is a BrowserClient that returns a fixed HTML string for every
// GetContent call. ScrapePage and CDPSession are not used by these tests.
type stubClient struct {
	html string
}

func (s *stubClient) Strategy() browser.Strategy { return browser.StrategyContent }

func (s *stubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) {
	return s.html, nil
}

func (s *stubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) {
	return browser.ScrapeResponse{}, nil
}

func (s *stubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error {
	return nil
}

// pagedStubClient returns a different HTML response for each successive call.
// Once all pages are exhausted it returns an empty page (no chapter-list),
// simulating the paginated chapter-list endpoint terminating correctly.
type pagedStubClient struct {
	pages []string
	call  int
}

func (c *pagedStubClient) Strategy() browser.Strategy { return browser.StrategyContent }

func (c *pagedStubClient) GetContent(_ context.Context, _ browser.ContentRequest) (string, error) {
	if c.call < len(c.pages) {
		html := c.pages[c.call]
		c.call++
		return html, nil
	}
	// Past the last page — return a page with no chapter-list to stop pagination.
	return `<!DOCTYPE html><html><body><div class="no-content"></div></body></html>`, nil
}

func (c *pagedStubClient) ScrapePage(_ context.Context, _ browser.ScrapeRequest) (browser.ScrapeResponse, error) {
	return browser.ScrapeResponse{}, nil
}

func (c *pagedStubClient) CDPSession(_ context.Context, _ string, _ browser.CDPSessionFunc) error {
	return nil
}

// ── helpers ───────────────────────────────────────────────────────────────────

func newScraper(html string) *Scraper {
	return New(&stubClient{html: html}, nil, &stubClient{html: html}, nil)
}

func newPagedScraper(pages ...string) *Scraper {
	urlClient := &pagedStubClient{pages: pages}
	return New(&stubClient{}, nil, urlClient, nil)
}

// ── ScrapeChapterText ─────────────────────────────────────────────────────────

func TestScrapeChapterText_ExtractsInnerText(t *testing.T) {
	html := `<!DOCTYPE html><html><body>
<div id="content">
  <p>It was a dark and stormy night.</p>
  <p>The hero stepped forward.</p>
</div>
</body></html>`

	s := newScraper(html)
	ref := scraper.ChapterRef{Number: 1, Title: "Chapter 1", URL: "https://novelfire.net/book/test-novel/chapter-1"}

	ch, err := s.ScrapeChapterText(context.Background(), ref)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if ch.Ref.Number != 1 {
		t.Errorf("expected chapter number 1, got %d", ch.Ref.Number)
	}
	if !strings.Contains(ch.Text, "dark and stormy") {
		t.Errorf("expected chapter text to contain 'dark and stormy', got: %q", ch.Text)
	}
	if !strings.Contains(ch.Text, "hero stepped forward") {
		t.Errorf("expected chapter text to contain 'hero stepped forward', got: %q", ch.Text)
	}
}

func TestScrapeChapterText_MissingContainer(t *testing.T) {
	html := `<!DOCTYPE html><html><body><div class="other">nothing here</div></body></html>`

	s := newScraper(html)
	ref := scraper.ChapterRef{Number: 2, Title: "Chapter 2", URL: "https://novelfire.net/book/test-novel/chapter-2"}

	_, err := s.ScrapeChapterText(context.Background(), ref)
	if err == nil {
		t.Fatal("expected an error when #content container is missing, got nil")
	}
}

// ── chapterNumberFromURL ──────────────────────────────────────────────────────

func TestChapterNumberFromURL(t *testing.T) {
	cases := []struct {
		url  string
		want int
	}{
		// Standard novelfire pattern.
		{"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-1", 1},
		{"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-26", 26},
		{"https://novelfire.net/book/a-dragon-against-the-whole-world/chapter-58", 58},
		// Large chapter numbers.
		{"https://novelfire.net/book/some-novel/chapter-1000", 1000},
		// Path segment with trailing slash.
		{"https://novelfire.net/book/some-novel/chapter-5/", 5},
		// Slug with title appended after the number (hypothetical future format).
		{"https://novelfire.net/book/some-novel/chapter-42-the-battle", 42},
		// Unparseable — should return 0 so the caller can fall back.
		{"https://novelfire.net/book/some-novel/prologue", 0},
		{"https://novelfire.net/book/some-novel/", 0},
		{"not-a-url", 0},
	}

	for _, tc := range cases {
		got := chapterNumberFromURL(tc.url)
		if got != tc.want {
			t.Errorf("chapterNumberFromURL(%q) = %d, want %d", tc.url, got, tc.want)
		}
	}
}

// ── ScrapeChapterList (position vs URL numbering) ─────────────────────────────

// TestScrapeChapterList_NumbersFromURL verifies that when the chapter list HTML
// is served newest-first (as novelfire.net does), chapter numbers are still
// assigned from the URL — not from list position — so that a re-run correctly
// identifies which chapters are already on disk.
func TestScrapeChapterList_NumbersFromURL(t *testing.T) {
	// Simulate a newest-first chapter list with 5 chapters on a single page.
	// Positions 1..5 correspond to chapters 5,4,3,2,1 in the site HTML.
	page1 := `<!DOCTYPE html><html><body>
<ul class="chapter-list">
  <li class="chapter-item"><a href="/book/test/chapter-5">Chapter 5</a></li>
  <li class="chapter-item"><a href="/book/test/chapter-4">Chapter 4</a></li>
  <li class="chapter-item"><a href="/book/test/chapter-3">Chapter 3</a></li>
  <li class="chapter-item"><a href="/book/test/chapter-2">Chapter 2</a></li>
  <li class="chapter-item"><a href="/book/test/chapter-1">Chapter 1</a></li>
</ul>
</body></html>`

	s := newPagedScraper(page1)
	refs, err := s.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(refs) != 5 {
		t.Fatalf("expected 5 refs, got %d", len(refs))
	}

	// With position-based numbering (the old bug), refs[0].Number would be 1
	// even though its URL is /chapter-5. With URL-based numbering it must be 5.
	wantNumbers := []int{5, 4, 3, 2, 1}
	for i, ref := range refs {
		if ref.Number != wantNumbers[i] {
			t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, wantNumbers[i], ref.URL)
		}
	}
}

// TestScrapeChapterList_Pagination verifies that the scraper correctly follows
// ?page=N pagination and stops when a page returns no chapter items.
func TestScrapeChapterList_Pagination(t *testing.T) {
	page1 := `<!DOCTYPE html><html><body>
<ul class="chapter-list">
  <li class="chapter-item"><a href="/book/test/chapter-3">Chapter 3</a></li>
  <li class="chapter-item"><a href="/book/test/chapter-2">Chapter 2</a></li>
  <li class="chapter-item"><a href="/book/test/chapter-1">Chapter 1</a></li>
</ul>
</body></html>`

	page2 := `<!DOCTYPE html><html><body>
<ul class="chapter-list">
  <li class="chapter-item"><a href="/book/test/chapter-6">Chapter 6</a></li>
  <li class="chapter-item"><a href="/book/test/chapter-5">Chapter 5</a></li>
  <li class="chapter-item"><a href="/book/test/chapter-4">Chapter 4</a></li>
</ul>
</body></html>`

	// page3 is omitted — pagedStubClient will return an empty page to stop pagination.
	s := newPagedScraper(page1, page2)
	refs, err := s.ScrapeChapterList(context.Background(), "https://novelfire.net/book/test")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(refs) != 6 {
		t.Fatalf("expected 6 refs (3 per page × 2 pages), got %d", len(refs))
	}

	wantNumbers := []int{3, 2, 1, 6, 5, 4}
	for i, ref := range refs {
		if ref.Number != wantNumbers[i] {
			t.Errorf("refs[%d].Number = %d, want %d (URL: %s)", i, ref.Number, wantNumbers[i], ref.URL)
		}
	}
}
@@ -1,213 +0,0 @@
// Package orchestrator coordinates the catalogue walk, metadata extraction,
// chapter-list fetching, and parallel chapter scraping.
//
// Concurrency model
//   - One goroutine runs ScrapeCatalogue and feeds book URLs into a channel.
//   - For each book, a dedicated goroutine calls ScrapeMetadata (metadata goroutine).
//   - ScrapeChapterList is called in the metadata goroutine once metadata is done.
//   - N worker goroutines (default: runtime.NumCPU()) each pull ChapterRef values
//     from a shared work queue and call ScrapeChapterText.
//   - A sync.WaitGroup ensures all chapter workers finish before the orchestrator
//     signals completion.
package orchestrator

import (
	"context"
	"fmt"
	"log/slog"
	"runtime"
	"sync"

	"github.com/libnovel/scraper/internal/scraper"
	"github.com/libnovel/scraper/internal/writer"
)

// Config holds tunable parameters for the orchestrator.
type Config struct {
	// Workers is the number of goroutines used to scrape chapters in parallel.
	// Defaults to runtime.NumCPU() when 0.
	Workers int

	// StaticRoot is the path to the static/books output directory.
	StaticRoot string

	// SingleBookURL when non-empty causes the orchestrator to scrape only
	// that one book instead of walking the full catalogue.
	SingleBookURL string
}
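
// A typical configuration might look like this (values are illustrative):
//
//	cfg := Config{
//		Workers:    8,
//		StaticRoot: "static/books",
//	}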
|
||||
// Orchestrator coordinates the full scrape pipeline.
|
||||
type Orchestrator struct {
|
||||
cfg Config
|
||||
novel scraper.NovelScraper
|
||||
writer *writer.Writer
|
||||
log *slog.Logger
|
||||
workers int
|
||||
}
|
||||
|
||||
// New returns a new Orchestrator.
|
||||
func New(cfg Config, novel scraper.NovelScraper, log *slog.Logger) *Orchestrator {
|
||||
workers := cfg.Workers
|
||||
if workers <= 0 {
|
||||
workers = runtime.NumCPU()
|
||||
}
|
||||
return &Orchestrator{
|
||||
cfg: cfg,
|
||||
novel: novel,
|
||||
writer: writer.New(cfg.StaticRoot),
|
||||
log: log,
|
||||
workers: workers,
|
||||
}
|
||||
}
|
||||
|
||||
// Run executes the full scrape pipeline and blocks until it is complete or ctx
|
||||
// is cancelled.
|
||||
func (o *Orchestrator) Run(ctx context.Context) error {
|
||||
o.log.Info("orchestrator starting",
|
||||
"source", o.novel.SourceName(),
|
||||
"workers", o.workers,
|
||||
"static_root", o.cfg.StaticRoot,
|
||||
)
|
||||
|
||||
// chapterWork is the shared queue consumed by chapter worker goroutines.
|
||||
type chapterJob struct {
|
||||
slug string
|
||||
ref scraper.ChapterRef
|
||||
}
|
||||
chapterWork := make(chan chapterJob, o.workers*4)
|
||||
|
||||
// Start chapter worker pool.
|
||||
var chapterWG sync.WaitGroup
|
||||
for i := 0; i < o.workers; i++ {
|
||||
chapterWG.Add(1)
|
||||
go func(workerID int) {
|
||||
defer chapterWG.Done()
|
||||
for job := range chapterWork {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
// Skip if already on disk.
|
||||
if o.writer.ChapterExists(job.slug, job.ref) {
|
||||
o.log.Debug("chapter already exists, skipping",
|
||||
"book", job.slug, "chapter", job.ref.Number)
|
||||
continue
|
||||
}
|
||||
|
||||
chapter, err := o.novel.ScrapeChapterText(ctx, job.ref)
|
||||
if err != nil {
|
||||
o.log.Error("chapter scrape failed",
|
||||
"book", job.slug,
|
||||
"chapter", job.ref.Number,
|
||||
"url", job.ref.URL,
|
||||
"err", err,
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
if err := o.writer.WriteChapter(job.slug, chapter); err != nil {
|
||||
o.log.Error("chapter write failed",
|
||||
"book", job.slug,
|
||||
"chapter", job.ref.Number,
|
||||
"err", err,
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
o.log.Info("chapter saved",
|
||||
"book", job.slug,
|
||||
"chapter", job.ref.Number,
|
||||
"worker", workerID,
|
||||
)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
|
||||
// processBook scrapes metadata + chapter list for one book, then enqueues
|
||||
// chapter jobs. It is called inside a goroutine per book.
|
||||
processBook := func(bookURL string) {
|
||||
// Metadata goroutine.
|
||||
meta, err := o.novel.ScrapeMetadata(ctx, bookURL)
|
||||
if err != nil {
|
||||
o.log.Error("metadata scrape failed", "url", bookURL, "err", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Persist / update metadata.yaml.
|
||||
if err := o.writer.WriteMetadata(meta); err != nil {
|
||||
o.log.Error("metadata write failed", "slug", meta.Slug, "err", err)
|
||||
// Continue — chapters can still be scraped.
|
||||
}
|
||||
|
||||
o.log.Info("metadata saved", "slug", meta.Slug, "title", meta.Title)
|
||||
|
||||
// Fetch chapter list.
|
||||
refs, err := o.novel.ScrapeChapterList(ctx, bookURL)
|
||||
if err != nil {
|
||||
o.log.Error("chapter list scrape failed", "slug", meta.Slug, "err", err)
|
||||
return
|
||||
}
|
||||
|
||||
o.log.Info("chapter list fetched", "slug", meta.Slug, "chapters", len(refs))
|
||||
|
||||
// Enqueue chapter jobs.
|
||||
for _, ref := range refs {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case chapterWork <- chapterJob{slug: meta.Slug, ref: ref}:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if o.cfg.SingleBookURL != "" {
|
||||
// Single-book mode: skip catalogue entirely.
|
||||
o.log.Info("single-book mode", "url", o.cfg.SingleBookURL)
|
||||
processBook(o.cfg.SingleBookURL)
|
||||
} else {
|
||||
// Catalogue mode: stream every book.
|
||||
entries, catErrs := o.novel.ScrapeCatalogue(ctx)
|
||||
|
||||
// Drain catalogue errors in a separate goroutine.
|
||||
go func() {
|
||||
for err := range catErrs {
|
||||
o.log.Error("catalogue error", "err", err)
|
||||
}
|
||||
}()
|
||||
|
||||
var bookWG sync.WaitGroup
|
||||
for entry := range entries {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
break
|
||||
default:
|
||||
}
|
||||
|
||||
bookWG.Add(1)
|
||||
bookURL := entry.URL
|
||||
go func() {
|
||||
defer bookWG.Done()
|
||||
processBook(bookURL)
|
||||
}()
|
||||
}
|
||||
|
||||
// Wait for all book goroutines to enqueue their chapters before
|
||||
// closing the chapter work queue.
|
||||
bookWG.Wait()
|
||||
}
|
||||
|
||||
// Signal chapter workers there is no more work.
|
||||
close(chapterWork)
|
||||
|
||||
// Wait for all in-flight chapter scrapes to finish.
|
||||
chapterWG.Wait()
|
||||
|
||||
if ctx.Err() != nil {
|
||||
return fmt.Errorf("orchestrator: context cancelled: %w", ctx.Err())
|
||||
}
|
||||
|
||||
o.log.Info("orchestrator finished")
|
||||
return nil
|
||||
}
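
// The two-stage shutdown above (close the queue, then wait on the pool) is a
// standard Go pattern. A minimal, self-contained sketch of the same shape,
// using only the standard library; the job type and handler are illustrative,
// not part of this package:
//
//	func runPool(ctx context.Context, workers int, jobs <-chan int, handle func(int)) {
//		var wg sync.WaitGroup
//		for i := 0; i < workers; i++ {
//			wg.Add(1)
//			go func() {
//				defer wg.Done()
//				for j := range jobs {
//					select {
//					case <-ctx.Done():
//						return // stop early on cancellation
//					default:
//					}
//					handle(j)
//				}
//			}()
//		}
//		wg.Wait() // the producer must close(jobs) once enqueueing is done
//	}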
@@ -1,144 +0,0 @@
// Package scraper defines the core interfaces and domain types for the libnovel
// scraping system. Each novel source implements these interfaces; the orchestrator
// wires them together without knowing anything about the concrete provider.
package scraper

import "context"

// ─── Domain types ────────────────────────────────────────────────────────────

// BookMeta carries all bibliographic information about a novel.
type BookMeta struct {
	// Slug is a URL-safe identifier derived from the book title, e.g. "a-dragon-against-the-whole-world".
	Slug string `yaml:"slug"`
	// Title is the human-readable novel title.
	Title string `yaml:"title"`
	// Author of the novel.
	Author string `yaml:"author"`
	// Cover is an absolute URL to the cover image.
	Cover string `yaml:"cover,omitempty"`
	// Status is e.g. "Ongoing", "Completed".
	Status string `yaml:"status,omitempty"`
	// Genres is a list of genre tags.
	Genres []string `yaml:"genres,omitempty"`
	// Summary is the full description/synopsis text.
	Summary string `yaml:"summary,omitempty"`
	// TotalChapters is the total number of chapters known at scrape time.
	TotalChapters int `yaml:"total_chapters,omitempty"`
	// SourceURL is the canonical URL of the book's landing page.
	SourceURL string `yaml:"source_url"`
	// Ranking is the rank number from ranking pages.
	Ranking int `yaml:"ranking,omitempty"`
}

// CatalogueEntry is a lightweight reference returned by CatalogueProvider.
type CatalogueEntry struct {
	// Title is the novel title as shown in the catalogue listing.
	Title string
	// URL is the canonical landing-page URL of the novel.
	URL string
}

// ChapterRef is a reference to a single chapter returned by ChapterListProvider.
type ChapterRef struct {
	// Number is the 1-based chapter index within the book.
	Number int
	// Title is the chapter display title.
	Title string
	// URL is the full URL of the chapter page.
	URL string
	// Volume is an optional volume number (0 means no volume grouping).
	Volume int
}

// Chapter contains the fully-extracted text of a single chapter.
type Chapter struct {
	Ref ChapterRef
	// Text is the plain / lightly-formatted chapter body (Markdown).
	Text string
}

// ─── Scraping selector descriptors ───────────────────────────────────────────

// Selector describes how to locate an element in an HTML document.
// At least one of Tag, Class, or ID should be non-empty; when multiple are set
// they are combined (AND semantics).
type Selector struct {
	// Tag is the HTML element name, e.g. "div", "p", "h1".
	Tag string
	// Class is one CSS class name (without the leading dot).
	Class string
	// ID is the element id attribute (without the leading #).
	ID string
	// Attr is an optional attribute name whose value should be extracted
	// instead of the text content (e.g. "href", "src").
	Attr string
	// Multiple indicates that all matching elements should be collected,
	// not just the first one.
	Multiple bool
}
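
// As an illustration, a Selector maps naturally onto a CSS selector string.
// The helper below is a hypothetical sketch of that translation (not part of
// this package), combining whichever fields are set with AND semantics:
//
//	func cssOf(s Selector) string {
//		out := s.Tag // may be empty, which CSS treats as "any element"
//		if s.Class != "" {
//			out += "." + s.Class
//		}
//		if s.ID != "" {
//			out += "#" + s.ID
//		}
//		return out // e.g. Selector{Tag: "div", Class: "chapter"} → "div.chapter"
//	}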

// ─── Provider interfaces ──────────────────────────────────────────────────────

// CatalogueProvider can enumerate every novel available on a source site.
// It handles pagination transparently and streams CatalogueEntry values.
type CatalogueProvider interface {
	// ScrapeCatalogue pages through the entire catalogue, sending
	// CatalogueEntry values to the returned channel. The channel is closed
	// when all pages have been scraped or ctx is cancelled.
	// Errors are surfaced via the error channel; a non-nil error does not
	// necessarily terminate scraping.
	ScrapeCatalogue(ctx context.Context) (<-chan CatalogueEntry, <-chan error)
}
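
// A typical consumer of this two-channel contract drains errors concurrently
// so that a stalled error channel can never block the entry stream. A minimal
// sketch, assuming only this interface (the provider variable and logging are
// illustrative):
//
//	entries, errs := provider.ScrapeCatalogue(ctx)
//	go func() {
//		for err := range errs {
//			log.Println("catalogue:", err) // non-fatal by contract
//		}
//	}()
//	for e := range entries {
//		fmt.Println(e.Title, e.URL)
//	}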

// MetadataProvider can extract structured book metadata from a novel's landing page.
type MetadataProvider interface {
	// ScrapeMetadata fetches and parses the metadata for the book at bookURL.
	ScrapeMetadata(ctx context.Context, bookURL string) (BookMeta, error)
}

// ChapterListProvider can enumerate all chapters of a book from the chapter-list page.
type ChapterListProvider interface {
	// ScrapeChapterList returns all chapter references for a book, ordered
	// by chapter number ascending.
	ScrapeChapterList(ctx context.Context, bookURL string) ([]ChapterRef, error)
}

// ChapterTextProvider can extract the readable text from a single chapter page.
type ChapterTextProvider interface {
	// ScrapeChapterText fetches ref.URL and returns the chapter text as Markdown.
	ScrapeChapterText(ctx context.Context, ref ChapterRef) (Chapter, error)
}

// RankingProvider can enumerate novels from a ranking page.
type RankingProvider interface {
	// ScrapeRanking pages through up to maxPages ranking pages, sending BookMeta
	// values (with basic info like title, cover, genres, status, sourceURL) to
	// the returned channel. Pages are fetched sequentially and lazily: the next
	// page is only requested once all entries from the current page have been
	// sent. maxPages <= 0 means "all pages".
	ScrapeRanking(ctx context.Context, maxPages int) (<-chan BookMeta, <-chan error)
}

// RankingPageCacher persists and retrieves raw HTML for individual ranking pages.
// Implementations (e.g. writer.Writer) store files on disk so that a
// subsequent ScrapeRanking call can serve cached HTML without a network round-trip.
type RankingPageCacher interface {
	// WriteRankingPageCache stores the raw HTML string for the given page number.
	WriteRankingPageCache(page int, html string) error
	// ReadRankingPageCache returns the cached HTML for page, or ("", nil) on a miss.
	ReadRankingPageCache(page int) (string, error)
}
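
// The ("", nil) miss convention supports a simple cache-aside loop in callers.
// A minimal sketch, assuming a hypothetical fetchPage helper that does the
// network round-trip:
//
//	func pageHTML(c RankingPageCacher, page int) (string, error) {
//		if html, err := c.ReadRankingPageCache(page); err != nil || html != "" {
//			return html, err // a hit, or a real read error
//		}
//		html, err := fetchPage(page) // hypothetical network fetch
//		if err != nil {
//			return "", err
//		}
//		_ = c.WriteRankingPageCache(page, html) // best-effort: a miss next time is fine
//		return html, nil
//	}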

// NovelScraper is the full interface that a concrete novel source must implement.
// It composes all five provider interfaces.
type NovelScraper interface {
	CatalogueProvider
	MetadataProvider
	ChapterListProvider
	ChapterTextProvider
	RankingProvider

	// SourceName returns the human-readable name of this scraper, e.g. "novelfire.net".
	SourceName() string
}
@@ -1,505 +0,0 @@
// Package server exposes the scraper as an HTTP service.
//
// Endpoints:
//
//	POST /scrape      — enqueue a full catalogue scrape
//	POST /scrape/book — enqueue a single-book scrape (JSON body: {"url":"..."})
//	GET  /health      — liveness probe
package server

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/libnovel/scraper/internal/orchestrator"
	"github.com/libnovel/scraper/internal/scraper"
	"github.com/libnovel/scraper/internal/writer"
)

// Server wraps an HTTP mux with the scraping endpoints.
type Server struct {
	addr   string
	oCfg   orchestrator.Config
	novel  scraper.NovelScraper
	log    *slog.Logger
	writer *writer.Writer

	mu             sync.Mutex
	running        bool
	rankingRunning bool

	kokoroURL   string // Kokoro-FastAPI base URL, e.g. http://kokoro:8880
	kokoroVoice string // default voice, e.g. af_bella

	// voiceMu guards cachedVoices.
	voiceMu      sync.RWMutex
	cachedVoices []string // populated on first request from Kokoro /v1/audio/voices

	// audioMu guards audioCache and audioInFlight.
	// audioCache maps a cache key to the Kokoro download filename returned by
	// POST /v1/audio/speech with return_download_link=true.
	// audioInFlight deduplicates concurrent generation requests for the same key.
	audioMu       sync.Mutex
	audioCache    map[string]string        // cacheKey → kokoro download filename
	audioInFlight map[string]chan struct{} // cacheKey → closed when done
}

// New creates a new Server.
func New(addr string, oCfg orchestrator.Config, novel scraper.NovelScraper, log *slog.Logger, kokoroURL, kokoroVoice string) *Server {
	return &Server{
		addr:          addr,
		oCfg:          oCfg,
		novel:         novel,
		log:           log,
		writer:        writer.New(oCfg.StaticRoot),
		kokoroURL:     kokoroURL,
		kokoroVoice:   kokoroVoice,
		audioCache:    make(map[string]string),
		audioInFlight: make(map[string]chan struct{}),
	}
}

// voices returns the list of available Kokoro voices. On the first call it
// fetches GET /v1/audio/voices from the Kokoro service and caches the result.
// If the fetch fails (Kokoro not up yet, network error, etc.) it falls back to
// the hardcoded kokoroVoices list so the UI is never empty.
func (s *Server) voices() []string {
	s.voiceMu.RLock()
	cached := s.cachedVoices
	s.voiceMu.RUnlock()
	if len(cached) > 0 {
		return cached
	}

	if s.kokoroURL != "" {
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, s.kokoroURL+"/v1/audio/voices", nil)
		if err == nil {
			req.Header.Set("Accept", "application/json")
			resp, err := http.DefaultClient.Do(req)
			if err == nil {
				defer resp.Body.Close()
				var payload struct {
					Voices []string `json:"voices"`
				}
				if resp.StatusCode == http.StatusOK && json.NewDecoder(resp.Body).Decode(&payload) == nil && len(payload.Voices) > 0 {
					s.voiceMu.Lock()
					s.cachedVoices = payload.Voices
					s.voiceMu.Unlock()
					s.log.Info("fetched kokoro voices", "count", len(payload.Voices))
					return payload.Voices
				}
			}
		}
		s.log.Warn("could not fetch kokoro voices, using built-in list")
	}

	return kokoroVoices
}
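
// Design note: sync.Once would be a poor fit here, because a failed fetch must
// stay retryable on the next call; Once would latch the failure forever. The
// double-checked RWMutex read above keeps the hot path cheap while still
// allowing retries until Kokoro comes up.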

// ListenAndServe starts the HTTP server and blocks until the provided context
// is cancelled.
func (s *Server) ListenAndServe(ctx context.Context) error {
	mux := http.NewServeMux()
	mux.HandleFunc("GET /health", s.handleHealth)
	mux.HandleFunc("POST /scrape", s.handleScrapeCatalogue)
	mux.HandleFunc("POST /scrape/book", s.handleScrapeBook)
	// UI routes
	mux.HandleFunc("GET /", s.handleHome)
	mux.HandleFunc("GET /scrape", s.handleScrape)
	mux.HandleFunc("GET /ranking", s.handleRanking)
	mux.HandleFunc("POST /ranking/refresh", s.handleRankingRefresh)
	mux.HandleFunc("GET /ranking/view", s.handleRankingView)
	mux.HandleFunc("GET /books/{slug}", s.handleBook)
	mux.HandleFunc("GET /books/{slug}/chapters/{n}", s.handleChapter)
	mux.HandleFunc("GET /books/{slug}/chapters-page", s.handleBookChaptersPage)
	mux.HandleFunc("POST /ui/scrape/book", s.handleUIScrapeBook)
	mux.HandleFunc("GET /ui/scrape/status", s.handleUIScrapeStatus)
	mux.HandleFunc("GET /ui/ranking/status", s.handleRankingStatus)
	// Plain-text chapter content for browser-side TTS
	mux.HandleFunc("GET /ui/chapter-text/{slug}/{n}", s.handleChapterText)
	// Server-side audio generation via Kokoro /v1/audio/speech.
	// Generation can take several minutes, so wrap it in its own timeout handler.
	audioGenHandler := http.TimeoutHandler(
		http.HandlerFunc(s.handleAudioGenerate),
		10*time.Minute,
		`{"error":"audio generation timed out"}`,
	)
	mux.Handle("POST /ui/audio/{slug}/{n}", audioGenHandler)
	// Status route: returns the proxy URL if audio was already generated, 404 otherwise.
	mux.HandleFunc("GET /ui/audio/{slug}/{n}", s.handleAudioStatus)
	// Proxy route: fetches the generated file from Kokoro /v1/download/{filename}.
	mux.HandleFunc("GET /ui/audio-proxy/{slug}/{n}", s.handleAudioProxy)

	srv := &http.Server{
		Addr:         s.addr,
		Handler:      mux,
		ReadTimeout:  15 * time.Second,
		WriteTimeout: 60 * time.Second,
		IdleTimeout:  60 * time.Second,
	}

	errCh := make(chan error, 1)
	go func() { errCh <- srv.ListenAndServe() }()

	s.log.Info("HTTP server listening", "addr", s.addr)

	select {
	case <-ctx.Done():
		shutCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()
		return srv.Shutdown(shutCtx)
	case err := <-errCh:
		return err
	}
}
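
// Callers typically drive this with a signal-scoped context. A minimal usage
// sketch (the main function, cfg, novelSrc, and kokoroURL below are
// illustrative placeholders, not part of this package):
//
//	func main() {
//		ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
//		defer stop()
//		srv := server.New(":8080", cfg, novelSrc, slog.Default(), kokoroURL, "af_bella")
//		if err := srv.ListenAndServe(ctx); err != nil {
//			slog.Error("server exited", "err", err)
//		}
//	}
//
// Note that the 60s WriteTimeout above also bounds handlers wrapped by the
// 10-minute audio TimeoutHandler, since the write deadline starts when the
// request is read; long audio generations may need that timeout raised.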

func (s *Server) handleHealth(w http.ResponseWriter, _ *http.Request) {
	w.Header().Set("Content-Type", "application/json")
	_ = json.NewEncoder(w).Encode(map[string]string{"status": "ok"})
}

// handleChapterText returns the plain text of a chapter (markdown stripped)
// for browser-side TTS. The browser POSTs this text directly to Kokoro-FastAPI.
func (s *Server) handleChapterText(w http.ResponseWriter, r *http.Request) {
	slug := r.PathValue("slug")
	n, err := strconv.Atoi(r.PathValue("n"))
	if err != nil || n < 1 {
		http.NotFound(w, r)
		return
	}
	raw, err := s.writer.ReadChapter(slug, n)
	if err != nil {
		http.NotFound(w, r)
		return
	}
	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
	w.Header().Set("Cache-Control", "no-store")
	fmt.Fprint(w, stripMarkdown(raw))
}

// ─── Audio generation via Kokoro /v1/audio/speech ────────────────────────────
//
// handleAudioGenerate handles POST /ui/audio/{slug}/{n}.
//
// It calls Kokoro's POST /v1/audio/speech with return_download_link=true.
// Kokoro generates the audio, saves it to its own temp storage, and returns
// the download filename in the X-Download-Path response header.
// We cache that filename (in memory, keyed by slug/chapter/voice/speed) and
// return a proxy URL that the browser sets as audio.src.
//
// On a cache hit the proxy URL is returned immediately without re-generating.
// Concurrent requests for the same key are deduplicated.
func (s *Server) handleAudioGenerate(w http.ResponseWriter, r *http.Request) {
	slug := r.PathValue("slug")
	n, err := strconv.Atoi(r.PathValue("n"))
	if err != nil || n < 1 {
		http.Error(w, `{"error":"invalid chapter"}`, http.StatusBadRequest)
		return
	}

	// Parse optional voice/speed from the JSON body.
	voice := s.kokoroVoice
	speed := 1.0
	var body struct {
		Voice string  `json:"voice"`
		Speed float64 `json:"speed"`
	}
	if r.Body != nil {
		_ = json.NewDecoder(r.Body).Decode(&body)
	}
	if body.Voice != "" {
		voice = body.Voice
	}
	if body.Speed > 0 {
		speed = body.Speed
	}

	cacheKey := fmt.Sprintf("%s/%d/%s/%.2f", slug, n, voice, speed)

	// Fast path: already generated this session.
	s.audioMu.Lock()
	if filename, ok := s.audioCache[cacheKey]; ok {
		s.audioMu.Unlock()
		s.writeAudioResponse(w, slug, n, voice, speed, filename)
		return
	}

	// Deduplicate concurrent generation for the same key.
	if ch, ok := s.audioInFlight[cacheKey]; ok {
		s.audioMu.Unlock()
		select {
		case <-ch:
		case <-r.Context().Done():
			http.Error(w, `{"error":"request cancelled"}`, http.StatusServiceUnavailable)
			return
		}
		s.audioMu.Lock()
		filename, ok := s.audioCache[cacheKey]
		s.audioMu.Unlock()
		if ok {
			s.writeAudioResponse(w, slug, n, voice, speed, filename)
		} else {
			http.Error(w, `{"error":"audio generation failed"}`, http.StatusInternalServerError)
		}
		return
	}
	ch := make(chan struct{})
	s.audioInFlight[cacheKey] = ch
	s.audioMu.Unlock()

	defer func() {
		s.audioMu.Lock()
		delete(s.audioInFlight, cacheKey)
		s.audioMu.Unlock()
		close(ch)
	}()

	// Load and validate chapter text.
	raw, err := s.writer.ReadChapter(slug, n)
	if err != nil {
		http.Error(w, `{"error":"chapter not found"}`, http.StatusNotFound)
		return
	}
	text := stripMarkdown(raw)
	if text == "" {
		http.Error(w, `{"error":"chapter text is empty"}`, http.StatusUnprocessableEntity)
		return
	}
	if s.kokoroURL == "" {
		http.Error(w, `{"error":"kokoro not configured"}`, http.StatusServiceUnavailable)
		return
	}

	// Call Kokoro POST /v1/audio/speech with return_download_link=true.
	// Kokoro saves the generated audio to its own temp storage and returns the
	// download path in the X-Download-Path response header.
	filename, err := s.generateSpeech(r.Context(), text, voice, speed)
	if err != nil {
		s.log.Error("kokoro speech generation failed", "slug", slug, "chapter", n, "err", err)
		http.Error(w, `{"error":"speech generation failed"}`, http.StatusBadGateway)
		return
	}

	s.audioMu.Lock()
	s.audioCache[cacheKey] = filename
	s.audioMu.Unlock()

	s.log.Info("audio generated", "slug", slug, "chapter", n, "filename", filename)
	s.writeAudioResponse(w, slug, n, voice, speed, filename)
}
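
// The in-flight map above is a hand-rolled version of what
// golang.org/x/sync/singleflight provides. An equivalent sketch using that
// package (shown for comparison; the server code does not depend on it):
//
//	var group singleflight.Group
//
//	v, err, _ := group.Do(cacheKey, func() (interface{}, error) {
//		return s.generateSpeech(r.Context(), text, voice, speed)
//	})
//	if err == nil {
//		filename := v.(string)
//		_ = filename // cache and respond as above
//	}
//
// One behavioural difference: singleflight ties all waiters to the first
// caller's context, while the channel approach lets each waiter honour its
// own request context.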

// generateSpeech calls POST /v1/audio/speech on Kokoro with return_download_link=true
// and returns the filename from the X-Download-Path response header.
func (s *Server) generateSpeech(ctx context.Context, text, voice string, speed float64) (string, error) {
	reqBody, _ := json.Marshal(map[string]interface{}{
		"model":                "kokoro",
		"input":                text,
		"voice":                voice,
		"response_format":      "mp3",
		"speed":                speed,
		"stream":               false,
		"return_download_link": true,
	})

	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		s.kokoroURL+"/v1/audio/speech", bytes.NewReader(reqBody))
	if err != nil {
		return "", fmt.Errorf("build request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("kokoro request: %w", err)
	}
	defer resp.Body.Close()
	// Drain the body so the connection can be reused.
	_, _ = io.Copy(io.Discard, resp.Body)

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("kokoro status %d", resp.StatusCode)
	}

	// X-Download-Path is e.g. "/download/speech_abc123.mp3".
	dlPath := resp.Header.Get("X-Download-Path")
	if dlPath == "" {
		return "", fmt.Errorf("kokoro did not return X-Download-Path header")
	}

	// Extract just the filename from the path.
	filename := dlPath
	if idx := strings.LastIndex(dlPath, "/"); idx >= 0 {
		filename = dlPath[idx+1:]
	}
	if filename == "" {
		return "", fmt.Errorf("empty filename in X-Download-Path: %q", dlPath)
	}
	return filename, nil
}

// writeAudioResponse writes the JSON response for a generated audio chapter.
// The URL points to our proxy handler which fetches from Kokoro on demand.
func (s *Server) writeAudioResponse(w http.ResponseWriter, slug string, n int, voice string, speed float64, filename string) {
	// Format speed with the same precision as the cache key (%.2f); otherwise
	// e.g. speed 1.25 would round-trip through the proxy URL as 1.2 and miss
	// the cache in handleAudioProxy.
	proxyURL := fmt.Sprintf("/ui/audio-proxy/%s/%d?voice=%s&speed=%.2f", slug, n, voice, speed)
	w.Header().Set("Content-Type", "application/json")
	_ = json.NewEncoder(w).Encode(map[string]interface{}{
		"url":      proxyURL,
		"filename": filename,
	})
}

// handleAudioStatus handles GET /ui/audio/{slug}/{n}.
// Returns the proxy URL if audio was already generated this session, 404 otherwise.
func (s *Server) handleAudioStatus(w http.ResponseWriter, r *http.Request) {
	slug := r.PathValue("slug")
	n, err := strconv.Atoi(r.PathValue("n"))
	if err != nil || n < 1 {
		http.Error(w, `{"error":"invalid chapter"}`, http.StatusBadRequest)
		return
	}
	voice := r.URL.Query().Get("voice")
	if voice == "" {
		voice = s.kokoroVoice
	}
	speedStr := r.URL.Query().Get("speed")
	speed := 1.0
	if speedStr != "" {
		if v, err := strconv.ParseFloat(speedStr, 64); err == nil && v > 0 {
			speed = v
		}
	}

	cacheKey := fmt.Sprintf("%s/%d/%s/%.2f", slug, n, voice, speed)
	s.audioMu.Lock()
	filename, ok := s.audioCache[cacheKey]
	s.audioMu.Unlock()

	if !ok {
		http.Error(w, `{"error":"not generated"}`, http.StatusNotFound)
		return
	}
	s.writeAudioResponse(w, slug, n, voice, speed, filename)
}

// handleAudioProxy handles GET /ui/audio-proxy/{slug}/{n}.
// It looks up the Kokoro download filename for this chapter (voice/speed) and
// proxies GET /v1/download/{filename} from the Kokoro server back to the browser.
func (s *Server) handleAudioProxy(w http.ResponseWriter, r *http.Request) {
	slug := r.PathValue("slug")
	n, err := strconv.Atoi(r.PathValue("n"))
	if err != nil || n < 1 {
		http.NotFound(w, r)
		return
	}
	voice := r.URL.Query().Get("voice")
	if voice == "" {
		voice = s.kokoroVoice
	}
	speedStr := r.URL.Query().Get("speed")
	speed := 1.0
	if speedStr != "" {
		if v, err := strconv.ParseFloat(speedStr, 64); err == nil && v > 0 {
			speed = v
		}
	}

	cacheKey := fmt.Sprintf("%s/%d/%s/%.2f", slug, n, voice, speed)
	s.audioMu.Lock()
	filename, ok := s.audioCache[cacheKey]
	s.audioMu.Unlock()

	if !ok {
		http.Error(w, "audio not generated yet", http.StatusNotFound)
		return
	}

	kokoroURL := s.kokoroURL + "/v1/download/" + filename
	req, err := http.NewRequestWithContext(r.Context(), http.MethodGet, kokoroURL, nil)
	if err != nil {
		http.Error(w, "failed to build proxy request", http.StatusInternalServerError)
		return
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		http.Error(w, "kokoro download failed", http.StatusBadGateway)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		http.Error(w, fmt.Sprintf("kokoro returned %d", resp.StatusCode), http.StatusBadGateway)
		return
	}

	w.Header().Set("Content-Type", "audio/mpeg")
	w.Header().Set("Cache-Control", "public, max-age=3600")
	if cl := resp.Header.Get("Content-Length"); cl != "" {
		w.Header().Set("Content-Length", cl)
	}
	_, _ = io.Copy(w, resp.Body)
}

func (s *Server) handleScrapeCatalogue(w http.ResponseWriter, _ *http.Request) {
	cfg := s.oCfg
	cfg.SingleBookURL = "" // full catalogue

	s.runAsync(w, cfg)
}

func (s *Server) handleScrapeBook(w http.ResponseWriter, r *http.Request) {
	var body struct {
		URL string `json:"url"`
	}
	if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.URL == "" {
		http.Error(w, `{"error":"request body must be JSON with \"url\" field"}`, http.StatusBadRequest)
		return
	}

	cfg := s.oCfg
	cfg.SingleBookURL = body.URL

	s.runAsync(w, cfg)
}

// runAsync launches an orchestrator in the background and returns 202 Accepted.
// Only one scrape job runs at a time; concurrent requests receive 409 Conflict.
func (s *Server) runAsync(w http.ResponseWriter, cfg orchestrator.Config) {
	s.mu.Lock()
	if s.running {
		s.mu.Unlock()
		http.Error(w, `{"error":"a scrape job is already running"}`, http.StatusConflict)
		return
	}
	s.running = true
	s.mu.Unlock()

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusAccepted)
	_ = json.NewEncoder(w).Encode(map[string]string{"status": "accepted"})

	go func() {
		defer func() {
			s.mu.Lock()
			s.running = false
			s.mu.Unlock()
		}()

		ctx, cancel := context.WithTimeout(context.Background(), 24*time.Hour)
		defer cancel()

		o := orchestrator.New(cfg, s.novel, s.log)
		if err := o.Run(ctx); err != nil {
			s.log.Error("scrape job failed", "err", err)
		}
	}()
}
File diff suppressed because it is too large
@@ -1,476 +0,0 @@
// Package writer handles persistence of scraped chapters and metadata.
//
// Directory layout:
//
//	static/books/
//	├── {book-slug}/
//	│   ├── metadata.yaml
//	│   ├── vol-0/              (no volume grouping)
//	│   │   ├── 1-50/
//	│   │   │   ├── chapter-1.md
//	│   │   │   └── …
//	│   │   └── 51-100/
//	│   │       └── …
//	│   └── vol-1/
//	│       └── …
package writer

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"github.com/libnovel/scraper/internal/scraper"
	"gopkg.in/yaml.v3"
)

const chaptersPerFolder = 50

// Writer persists scraped content under a configurable root directory.
type Writer struct {
	root string // e.g. "./static/books"
}

// New creates a Writer that stores files under root.
func New(root string) *Writer {
	return &Writer{root: root}
}

// ─── Metadata ─────────────────────────────────────────────────────────────────

// WriteMetadata serialises meta to static/books/{slug}/metadata.yaml.
// It creates the directory if it does not exist and overwrites any existing file.
func (w *Writer) WriteMetadata(meta scraper.BookMeta) error {
	dir := w.bookDir(meta.Slug)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("writer: mkdir %s: %w", dir, err)
	}

	path := filepath.Join(dir, "metadata.yaml")
	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("writer: create metadata %s: %w", path, err)
	}
	defer f.Close()

	enc := yaml.NewEncoder(f)
	enc.SetIndent(2)
	if err := enc.Encode(meta); err != nil {
		return fmt.Errorf("writer: encode metadata: %w", err)
	}
	return enc.Close()
}
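
// WriteMetadata truncates in place, so a crash mid-write can leave a corrupt
// metadata.yaml. A crash-safe variant would write to a temp file and rename,
// since rename is atomic on POSIX filesystems. A minimal sketch, where
// "encoded" stands in for the marshalled YAML bytes:
//
//	tmp := path + ".tmp"
//	if err := os.WriteFile(tmp, encoded, 0o644); err != nil {
//		return err
//	}
//	return os.Rename(tmp, path) // atomically replaces metadata.yaml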

// ReadMetadata reads the metadata.yaml for slug if it exists.
// Returns (zero-value, false, nil) when the file does not exist.
func (w *Writer) ReadMetadata(slug string) (scraper.BookMeta, bool, error) {
	path := filepath.Join(w.bookDir(slug), "metadata.yaml")
	data, err := os.ReadFile(path)
	if err != nil {
		if os.IsNotExist(err) {
			return scraper.BookMeta{}, false, nil
		}
		return scraper.BookMeta{}, false, fmt.Errorf("writer: read metadata %s: %w", path, err)
	}

	var meta scraper.BookMeta
	if err := yaml.Unmarshal(data, &meta); err != nil {
		return scraper.BookMeta{}, true, fmt.Errorf("writer: unmarshal metadata %s: %w", path, err)
	}
	return meta, true, nil
}

// MetadataMtime returns the modification time (Unix seconds) of the
// metadata.yaml file for slug, or 0 if the file cannot be stat'd.
func (w *Writer) MetadataMtime(slug string) int64 {
	path := filepath.Join(w.bookDir(slug), "metadata.yaml")
	fi, err := os.Stat(path)
	if err != nil {
		return 0
	}
	return fi.ModTime().Unix()
}

// ─── Chapters ─────────────────────────────────────────────────────────────────

// ChapterExists returns true if the markdown file for ref already exists on disk.
func (w *Writer) ChapterExists(slug string, ref scraper.ChapterRef) bool {
	_, err := os.Stat(w.chapterPath(slug, ref))
	return err == nil
}

// WriteChapter writes chapter.Text to the appropriate markdown file.
// The parent directories are created on demand.
func (w *Writer) WriteChapter(slug string, chapter scraper.Chapter) error {
	path := w.chapterPath(slug, chapter.Ref)
	dir := filepath.Dir(path)

	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("writer: mkdir %s: %w", dir, err)
	}

	// Build the markdown document.
	var sb strings.Builder
	sb.WriteString("# ")
	sb.WriteString(chapter.Ref.Title)
	sb.WriteString("\n\n")
	sb.WriteString(chapter.Text)
	sb.WriteString("\n")

	if err := os.WriteFile(path, []byte(sb.String()), 0o644); err != nil {
		return fmt.Errorf("writer: write chapter %s: %w", path, err)
	}
	return nil
}

// ─── Catalogue helpers ────────────────────────────────────────────────────────

// ListBooks returns metadata for every book that has a metadata.yaml under root.
// Books with unreadable metadata files are silently skipped.
func (w *Writer) ListBooks() ([]scraper.BookMeta, error) {
	entries, err := os.ReadDir(w.root)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, nil
		}
		return nil, fmt.Errorf("writer: list books: %w", err)
	}
	var books []scraper.BookMeta
	for _, e := range entries {
		if !e.IsDir() {
			continue
		}
		meta, ok, _ := w.ReadMetadata(e.Name())
		if !ok {
			continue
		}
		books = append(books, meta)
	}
	sort.Slice(books, func(i, j int) bool {
		return books[i].Title < books[j].Title
	})
	return books, nil
}

// LocalSlugs returns the set of book slugs that have a metadata.yaml on disk.
// It is cheaper than ListBooks because it only checks for file existence rather
// than fully parsing every YAML file.
func (w *Writer) LocalSlugs() map[string]bool {
	entries, err := os.ReadDir(w.root)
	if err != nil {
		return map[string]bool{}
	}
	slugs := make(map[string]bool, len(entries))
	for _, e := range entries {
		if !e.IsDir() {
			continue
		}
		metaPath := filepath.Join(w.root, e.Name(), "metadata.yaml")
		if _, err := os.Stat(metaPath); err == nil {
			slugs[e.Name()] = true
		}
	}
	return slugs
}

// ChapterInfo is a lightweight chapter descriptor derived from on-disk files.
type ChapterInfo struct {
	Number int
	Title  string // chapter name, cleaned of number prefix and trailing date
	Date   string // relative date scraped alongside the title, e.g. "1 year ago"
}

// ListChapters returns all chapters on disk for slug, sorted by number.
func (w *Writer) ListChapters(slug string) ([]ChapterInfo, error) {
	bookDir := w.bookDir(slug)
	var chapters []ChapterInfo

	// Walk vol-*/range-*/ directories.
	volDirs, err := filepath.Glob(filepath.Join(bookDir, "vol-*"))
	if err != nil {
		return nil, fmt.Errorf("writer: list chapters glob: %w", err)
	}
	for _, vd := range volDirs {
		rangeDirs, _ := filepath.Glob(filepath.Join(vd, "*-*"))
		for _, rd := range rangeDirs {
			files, _ := filepath.Glob(filepath.Join(rd, "chapter-*.md"))
			for _, f := range files {
				base := filepath.Base(f) // chapter-N.md
				numStr := strings.TrimSuffix(strings.TrimPrefix(base, "chapter-"), ".md")
				n, err := strconv.Atoi(numStr)
				if err != nil {
					continue
				}
				title, date := chapterTitle(f, n)
				chapters = append(chapters, ChapterInfo{Number: n, Title: title, Date: date})
			}
		}
	}
	sort.Slice(chapters, func(i, j int) bool {
		return chapters[i].Number < chapters[j].Number
	})
	return chapters, nil
}

// CountChapters returns the number of chapter markdown files on disk for slug.
// It is cheaper than ListChapters because it does not read file contents.
func (w *Writer) CountChapters(slug string) int {
	bookDir := w.bookDir(slug)
	volDirs, err := filepath.Glob(filepath.Join(bookDir, "vol-*"))
	if err != nil {
		return 0
	}
	count := 0
	for _, vd := range volDirs {
		rangeDirs, _ := filepath.Glob(filepath.Join(vd, "*-*"))
		for _, rd := range rangeDirs {
			files, _ := filepath.Glob(filepath.Join(rd, "chapter-*.md"))
			count += len(files)
		}
	}
	return count
}

// chapterTitle reads the first non-empty line of a markdown file and strips
// the leading "# " heading marker. Falls back to ("Chapter N", "").
func chapterTitle(path string, n int) (title, date string) {
	data, err := os.ReadFile(path)
	if err != nil {
		return fmt.Sprintf("Chapter %d", n), ""
	}
	for _, line := range strings.SplitN(string(data), "\n", 10) {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		line = strings.TrimPrefix(line, "# ")
		return SplitChapterTitle(line)
	}
	return fmt.Sprintf("Chapter %d", n), ""
}

// SplitChapterTitle separates the human-readable chapter name from the
// trailing relative-date string that novelfire.net appends to the heading.
// Examples of raw heading text (after stripping "# "):
//
//	"1 Chapter 1 - 1: The Academy's Weakest1 year ago"
//	"2 Chapter 2 - Enter the Storm3 months ago"
//
// The pattern is: optional leading number+whitespace, then the real title,
// then a date that matches /\d+\s+(second|minute|hour|day|week|month|year)s?\s+ago$/.
func SplitChapterTitle(raw string) (title, date string) {
	// Strip a leading chapter-number index that novelfire sometimes prepends.
	// It looks like "1 " or "12 " at the very start.
	raw = strings.TrimSpace(raw)
	if idx := strings.IndexFunc(raw, func(r rune) bool { return r == ' ' || r == '\t' }); idx > 0 {
		prefix := raw[:idx]
		allDigit := true
		for _, c := range prefix {
			if c < '0' || c > '9' {
				allDigit = false
				break
			}
		}
		if allDigit {
			raw = strings.TrimSpace(raw[idx:])
		}
	}

	// Strip the "Chapter N - N: " prefix (novelfire double-number format).
	// Also handles "Chapter N: " (single number); note the pattern requires a
	// colon, so "Chapter N - Title" without one is left untouched.
	chNumRe := regexp.MustCompile(`(?i)^chapter\s+\d+(?:\s*-\s*\d+)?\s*:\s*`)
	raw = strings.TrimSpace(chNumRe.ReplaceAllString(raw, ""))

	// Match a trailing relative date: "<n> <unit>[s] ago".
	dateRe := regexp.MustCompile(`\s*(\d+\s+(?:second|minute|hour|day|week|month|year)s?\s+ago)\s*$`)
	if m := dateRe.FindStringSubmatchIndex(raw); m != nil {
		return strings.TrimSpace(raw[:m[0]]), strings.TrimSpace(raw[m[2]:m[3]])
	}
	return raw, ""
}
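
// Worked example, tracing the first doc-comment input through both passes:
//
//	title, date := SplitChapterTitle("1 Chapter 1 - 1: The Academy's Weakest1 year ago")
//	// leading index "1 " stripped  → "Chapter 1 - 1: The Academy's Weakest1 year ago"
//	// "Chapter 1 - 1: " stripped   → "The Academy's Weakest1 year ago"
//	// trailing date matched        → title = "The Academy's Weakest", date = "1 year ago"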

// ReadChapter returns the raw markdown content for chapter number n of slug.
// Note that it assumes volume 0: chapters stored under vol-1 and above cannot
// be located by number alone.
func (w *Writer) ReadChapter(slug string, n int) (string, error) {
	// Reconstruct the path using the same bucketing formula as chapterPath.
	ref := scraper.ChapterRef{Number: n, Volume: 0}
	path := w.chapterPath(slug, ref)
	data, err := os.ReadFile(path)
	if err != nil {
		return "", fmt.Errorf("writer: read chapter %d: %w", n, err)
	}
	return string(data), nil
}

// ─── Ranking ─────────────────────────────────────────────────────────────────

// RankingItem represents a single entry in the ranking.
type RankingItem struct {
	Rank      int      `yaml:"rank" json:"rank"`
	Slug      string   `yaml:"slug" json:"slug"`
	Title     string   `yaml:"title" json:"title"`
	Author    string   `yaml:"author,omitempty" json:"author,omitempty"`
	Cover     string   `yaml:"cover,omitempty" json:"cover,omitempty"`
	Status    string   `yaml:"status,omitempty" json:"status,omitempty"`
	Genres    []string `yaml:"genres,omitempty" json:"genres,omitempty"`
	SourceURL string   `yaml:"source_url,omitempty" json:"source_url,omitempty"`
}

// WriteRanking saves the ranking items as JSON to static/books/ranking.json.
// This replaces the old markdown table format with a structured format that
// is faster to read back (no custom parsing) and safe for titles containing "|".
func (w *Writer) WriteRanking(items []RankingItem) error {
	path := filepath.Clean(w.rankingPath())
	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("writer: mkdir %s: %w", dir, err)
	}

	data, err := json.MarshalIndent(items, "", " ")
	if err != nil {
		return fmt.Errorf("writer: marshal ranking: %w", err)
	}
	if err := os.WriteFile(path, data, 0o644); err != nil {
		return fmt.Errorf("writer: write ranking %s: %w", path, err)
	}
	return nil
}

// ReadRankingItems parses ranking.json into a slice of RankingItem.
// Returns a nil slice (not an error) when the file does not exist yet.
func (w *Writer) ReadRankingItems() ([]RankingItem, error) {
	data, err := os.ReadFile(w.rankingPath())
	if err != nil {
		if os.IsNotExist(err) {
			return nil, nil
		}
		return nil, fmt.Errorf("writer: read ranking: %w", err)
	}
	var items []RankingItem
	if err := json.Unmarshal(data, &items); err != nil {
		return nil, fmt.Errorf("writer: parse ranking json: %w", err)
	}
	return items, nil
}

// RankingFileInfo returns os.FileInfo for the ranking.json file, if it exists.
func (w *Writer) RankingFileInfo() (os.FileInfo, error) {
	return os.Stat(w.rankingPath())
}

func (w *Writer) rankingPath() string {
	return filepath.Join(w.root, "ranking.json")
}

// ─── Ranking page HTML cache ──────────────────────────────────────────────────

// rankingCacheDir returns the directory that stores per-page HTML caches.
func (w *Writer) rankingCacheDir() string {
	return filepath.Join(w.root, "_ranking_cache")
}

// rankingPageCachePath returns the path for a cached ranking page HTML file.
func (w *Writer) rankingPageCachePath(page int) string {
	return filepath.Join(w.rankingCacheDir(), fmt.Sprintf("page-%d.html", page))
}

// WriteRankingPageCache persists raw HTML for the given ranking page number.
func (w *Writer) WriteRankingPageCache(page int, html string) error {
	dir := w.rankingCacheDir()
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("writer: mkdir ranking cache %s: %w", dir, err)
	}
	path := w.rankingPageCachePath(page)
	if err := os.WriteFile(path, []byte(html), 0o644); err != nil {
		return fmt.Errorf("writer: write ranking page cache %s: %w", path, err)
	}
	return nil
}

// ReadRankingPageCache reads the cached HTML for the given ranking page.
// Returns ("", nil) when no cache file exists yet.
func (w *Writer) ReadRankingPageCache(page int) (string, error) {
	data, err := os.ReadFile(w.rankingPageCachePath(page))
	if err != nil {
		if os.IsNotExist(err) {
			return "", nil
		}
		return "", fmt.Errorf("writer: read ranking page cache page %d: %w", page, err)
	}
	return string(data), nil
}

// RankingPageCacheInfo returns os.FileInfo for a cached ranking page file.
// Returns (nil, nil) when the file does not exist.
func (w *Writer) RankingPageCacheInfo(page int) (os.FileInfo, error) {
	info, err := os.Stat(w.rankingPageCachePath(page))
	if err != nil {
		if os.IsNotExist(err) {
			return nil, nil
		}
		return nil, err
	}
	return info, nil
}

// bookDir returns the root directory for a book slug.
func (w *Writer) bookDir(slug string) string {
	return filepath.Join(w.root, slug)
}

// AudioDir returns the directory used to cache generated MP3 files for a book.
func (w *Writer) AudioDir(slug string) string {
	return filepath.Join(w.bookDir(slug), "audio")
}

// AudioPath returns the full path for a cached chapter audio file.
// The filename is keyed by chapter number, voice, and speed so that different
// settings never collide. Speed is formatted to one decimal place (e.g. "1.0").
func (w *Writer) AudioPath(slug string, n int, voice string, speed float64) string {
	safeVoice := sanitiseVoice(voice)
	filename := fmt.Sprintf("ch%d-%s-%.1f.mp3", n, safeVoice, speed)
	return filepath.Join(w.AudioDir(slug), filename)
}

// AudioPartPath returns the path for an individual audio chunk generated during
// chunked TTS. Part files are named ch{n}-{voice}-{speed}.part{p}.mp3 and are
// deleted after they have been merged into the final AudioPath file.
func (w *Writer) AudioPartPath(slug string, n int, voice string, speed float64, part int) string {
	safeVoice := sanitiseVoice(voice)
	filename := fmt.Sprintf("ch%d-%s-%.1f.part%d.mp3", n, safeVoice, speed, part)
	return filepath.Join(w.AudioDir(slug), filename)
}

// sanitiseVoice converts a voice name into a string that is safe to embed in a
// filename (only a-z, A-Z, 0-9, '_', '-' are kept; everything else becomes '_').
func sanitiseVoice(voice string) string {
	return strings.Map(func(r rune) rune {
		if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '_' || r == '-' {
			return r
		}
		return '_'
	}, voice)
}

// chapterPath computes the full file path for a chapter:
//
//	vol-{volume}/{folderRange}/chapter-{number}.md
//
// Example: vol-0/1-50/chapter-1.md, vol-0/51-100/chapter-51.md
func (w *Writer) chapterPath(slug string, ref scraper.ChapterRef) string {
	vol := ref.Volume // 0 == no volume grouping
	volDir := fmt.Sprintf("vol-%d", vol)

	// Folder group: chapters 1-50 → "1-50", 51-100 → "51-100", …
	lo := ((ref.Number-1)/chaptersPerFolder)*chaptersPerFolder + 1
	hi := lo + chaptersPerFolder - 1
	rangeDir := fmt.Sprintf("%d-%d", lo, hi)

	filename := fmt.Sprintf("chapter-%d.md", ref.Number)

	return filepath.Join(w.bookDir(slug), volDir, rangeDir, filename)
}
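
// Worked example of the bucketing formula, with chaptersPerFolder = 50
// (integer division):
//
//	n = 1   → lo = ((1-1)/50)*50 + 1   = 1,   hi = 50  → "1-50"
//	n = 50  → lo = ((50-1)/50)*50 + 1  = 1,   hi = 50  → "1-50"
//	n = 51  → lo = ((51-1)/50)*50 + 1  = 51,  hi = 100 → "51-100"
//	n = 137 → lo = ((137-1)/50)*50 + 1 = 101, hi = 150 → "101-150"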
BIN scraper/scraper (binary file not shown)
423 scripts/e2e-test.mjs (new file)
@@ -0,0 +1,423 @@
#!/usr/bin/env node
/**
 * e2e-test.mjs — End-to-end tests for the LibNovel v3 stack.
 *
 * Hits live services via https://localhost (self-signed cert, TLS verify skipped).
 * Requires Node 18+, whose built-in fetch (undici) respects
 * NODE_TLS_REJECT_UNAUTHORIZED.
 *
 * Services tested:
 *   - Caddy / UI    https://localhost
 *   - Go backend    via UI proxy routes
 *   - PocketBase    via UI server-side (indirect)
 *
 * Usage:
 *   node scripts/e2e-test.mjs
 *   node scripts/e2e-test.mjs --verbose
 */

import { createServer, request as httpRequest } from 'node:https';
import { URL } from 'node:url';

const BASE = 'https://localhost';
const VERBOSE = process.argv.includes('--verbose');

// ─── Helpers ──────────────────────────────────────────────────────────────────

let passed = 0;
let failed = 0;
const failures = [];

function log(...args) {
  if (VERBOSE) console.log(...args);
}

function pass(name) {
  passed++;
  console.log(`  ✓ ${name}`);
}

function fail(name, reason) {
  failed++;
  const msg = `  ✗ ${name}: ${reason}`;
  console.log(msg);
  failures.push({ name, reason });
}

/**
 * GET helper. TLS certificate errors are ignored because
 * NODE_TLS_REJECT_UNAUTHORIZED=0 is set below (self-signed cert on localhost).
 */
async function get(path, { headers = {}, followRedirects = false } = {}) {
  const url = path.startsWith('http') ? path : `${BASE}${path}`;
  const res = await fetch(url, {
    redirect: followRedirects ? 'follow' : 'manual',
    headers,
    // Node 18+ uses undici, which respects NODE_TLS_REJECT_UNAUTHORIZED
  });
  return res;
}

async function post(path, body, { headers = {}, cookie = '' } = {}) {
  const url = path.startsWith('http') ? path : `${BASE}${path}`;
  const res = await fetch(url, {
    method: 'POST',
    redirect: 'manual',
    headers: {
      'Content-Type': 'application/json',
      ...(cookie ? { Cookie: cookie } : {}),
      ...headers,
    },
    body: JSON.stringify(body),
  });
  return res;
}

async function del(path, { cookie = '' } = {}) {
  const url = path.startsWith('http') ? path : `${BASE}${path}`;
  const res = await fetch(url, {
    method: 'DELETE',
    redirect: 'manual',
    headers: cookie ? { Cookie: cookie } : {},
  });
  return res;
}

/** Extract Set-Cookie header value(s) as a single cookie string. */
function extractCookies(res) {
  const raw = res.headers.getSetCookie?.() ?? [];
  return raw.map((c) => c.split(';')[0]).join('; ');
}

async function assert(name, fn) {
  try {
    await fn();
    pass(name);
  } catch (e) {
    fail(name, e.message);
  }
}

function expect(val, label) {
  return {
    toBe(expected) {
      if (val !== expected) throw new Error(`${label}: expected ${expected}, got ${val}`);
    },
    toBeOneOf(...options) {
      if (!options.includes(val)) throw new Error(`${label}: expected one of [${options.join(', ')}], got ${val}`);
    },
    toBeOk() {
      if (!val) throw new Error(`${label} was falsy`);
    },
    toContainKey(key) {
      if (!(key in val)) throw new Error(`${label}: missing key "${key}"`);
    },
    toBeArray() {
      if (!Array.isArray(val)) throw new Error(`${label}: expected array, got ${typeof val}`);
    },
    toBeAbove(n) {
      if (!(val > n)) throw new Error(`${label}: expected > ${n}, got ${val}`);
    },
  };
}

// ─── Test suite ───────────────────────────────────────────────────────────────

process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';

// Pick a known book slug from the database (first available).
let TEST_SLUG = null;

console.log('\nLibNovel v3 — End-to-End Tests');
console.log('================================\n');

// ── 1. Health checks ──────────────────────────────────────────────────────────
console.log('1. Health checks');

await assert('UI health endpoint returns 200', async () => {
  const res = await get('/health');
  expect(res.status, 'status').toBe(200);
});

await assert('Home page returns 200', async () => {
  const res = await get('/', { followRedirects: true });
  expect(res.status, 'status').toBe(200);
  const html = await res.text();
  expect(html.includes('<html') || html.includes('<!DOCTYPE'), 'has html').toBe(true);
});

await assert('Browse page returns 200', async () => {
  const res = await get('/browse', { followRedirects: true });
  expect(res.status, 'status').toBe(200);
});

// ── 2. Home API ───────────────────────────────────────────────────────────────
console.log('\n2. Home API');

let homeData = null;

await assert('GET /api/home returns continue_reading and recently_updated', async () => {
  const res = await get('/api/home', { followRedirects: true });
  expect(res.status, 'status').toBe(200);
  homeData = await res.json();
  expect(homeData, 'data').toContainKey('continue_reading');
  expect(homeData, 'data').toContainKey('recently_updated');
  expect(homeData.continue_reading, 'continue_reading').toBeArray();
  expect(homeData.recently_updated, 'recently_updated').toBeArray();
});

await assert('GET /api/home stats has totalBooks and totalChapters', async () => {
  if (!homeData) {
    const res = await get('/api/home', { followRedirects: true });
    homeData = await res.json();
  }
  expect(homeData, 'data').toContainKey('stats');
  expect(typeof homeData.stats.totalBooks, 'totalBooks type').toBe('number');
  expect(typeof homeData.stats.totalChapters, 'totalChapters type').toBe('number');
});

// ── 3. Browse / ranking / search ──────────────────────────────────────────────
console.log('\n3. Browse / ranking / search');

await assert('GET /api/browse-page returns novels array', async () => {
  const res = await get('/api/browse-page?page=1', { followRedirects: true });
  expect(res.status, 'status').toBeOneOf(200, 503);
  if (res.status === 200) {
    const data = await res.json();
    // Accepts { novels: [...] } or { error: ... } (when the MinIO cache is empty)
    expect(typeof data, 'response type').toBe('object');
  }
});

await assert('GET /api/ranking returns array or 502 (no data yet)', async () => {
  const res = await get('/api/ranking', { followRedirects: true });
  // 200 = ranking data exists; 502 = no ranking data scraped yet — both are expected
  expect(res.status, 'status').toBeOneOf(200, 502);
  if (res.status === 200) {
    const data = await res.json();
    expect(data, 'data').toBeArray();
  }
});

await assert('GET /api/search?q=shadow returns results object', async () => {
  const res = await get('/api/search?q=shadow', { followRedirects: true });
  expect(res.status, 'status').toBe(200);
  const data = await res.json();
  expect(typeof data, 'response type').toBe('object');
});

// ── 4. Books ──────────────────────────────────────────────────────────────────
console.log('\n4. Books');

// Find a real slug from /api/home.
await assert('GET /api/home books are accessible', async () => {
  if (!homeData) {
    const res = await get('/api/home', { followRedirects: true });
    homeData = await res.json();
  }
  const books = homeData?.recently_updated ?? [];
  if (books.length > 0) {
    TEST_SLUG = books[0].slug;
    log(`  Using test slug: ${TEST_SLUG}`);
  }
  // Pass regardless — we just want to find a slug.
});

if (TEST_SLUG) {
  await assert(`GET /api/book/${TEST_SLUG} returns book metadata`, async () => {
    const res = await get(`/api/book/${TEST_SLUG}`, { followRedirects: true });
    expect(res.status, 'status').toBe(200);
    const data = await res.json();
    // Returns { book: { slug, ... }, chapters: [...] }
    expect(data, 'data').toContainKey('book');
    expect(data.book, 'book').toContainKey('slug');
    expect(data.book.slug, 'slug').toBe(TEST_SLUG);
  });

  await assert(`Book detail page /${TEST_SLUG} returns 200`, async () => {
    const res = await get(`/${TEST_SLUG}`, { followRedirects: true });
    expect(res.status, 'status').toBe(200);
  });
} else {
  console.log('  ⚠ No books in database — skipping book-specific tests');
}

// ── 5. Voices ─────────────────────────────────────────────────────────────────
console.log('\n5. Voices');

await assert('GET /api/voices returns voices array', async () => {
  const res = await get('/api/voices', { followRedirects: true });
  expect(res.status, 'status').toBe(200);
  const data = await res.json();
  // Returns { voices: [...] }
  expect(data, 'data').toContainKey('voices');
  expect(data.voices, 'voices').toBeArray();
  expect(data.voices.length, 'voice count').toBeAbove(0);
});

// ── 6. Auth flow ──────────────────────────────────────────────────────────────
console.log('\n6. Auth flow');

const TEST_USER = `e2e_test_${Date.now()}`;
const TEST_PASS = 'E2eTestPassword1!';
let authCookie = '';

await assert('POST /api/auth/register creates new user', async () => {
  const res = await post('/api/auth/register', { username: TEST_USER, password: TEST_PASS });
  expect(res.status, 'status').toBeOneOf(200, 201);
  const data = await res.json();
  // Returns { token: "...", user: { id, username, role } }
  expect(data, 'response').toContainKey('user');
  expect(data.user, 'user').toContainKey('username');
  expect(data.user.username, 'username').toBe(TEST_USER);
  // Build the cookie from the token.
  if (data.token) {
    authCookie = `libnovel_auth=${data.token}`;
  } else {
    authCookie = extractCookies(res);
  }
  log(`  Auth cookie: ${authCookie.slice(0, 40)}...`);
});

await assert('GET /api/auth/me returns current user when logged in', async () => {
  const res = await get('/api/auth/me', { followRedirects: true, headers: { Cookie: authCookie } });
  expect(res.status, 'status').toBe(200);
  const data = await res.json();
  expect(data, 'data').toContainKey('username');
  expect(data.username, 'username').toBe(TEST_USER);
});

await assert('POST /api/auth/logout clears session', async () => {
  const res = await post('/api/auth/logout', {}, { cookie: authCookie });
  expect(res.status, 'status').toBeOneOf(200, 204);
});

await assert('POST /api/auth/login works after register', async () => {
  const res = await post('/api/auth/login', { username: TEST_USER, password: TEST_PASS });
  expect(res.status, 'status').toBe(200);
  const data = await res.json();
  // Returns { token: "...", user: { id, username, role } }
  expect(data, 'response').toContainKey('user');
  expect(data.user.username, 'username').toBe(TEST_USER);
  if (data.token) {
    authCookie = `libnovel_auth=${data.token}`;
  } else {
    authCookie = extractCookies(res);
  }
});

await assert('GET /api/auth/me unauthenticated returns 401', async () => {
  const res = await get('/api/auth/me', { followRedirects: true });
  expect(res.status, 'status').toBe(401);
});

// ── 7. Progress ───────────────────────────────────────────────────────────────
console.log('\n7. Progress');

// /api/progress (root) is POST-only. Per-slug is GET/POST/DELETE via /api/progress/[slug].

if (TEST_SLUG) {
  await assert(`POST /api/progress/${TEST_SLUG} sets progress`, async () => {
    const res = await post(`/api/progress/${TEST_SLUG}`, { chapter: 1 });
    expect(res.status, 'status').toBeOneOf(200, 201);
    const data = await res.json();
    expect(data, 'data').toContainKey('ok');
  });

  await assert(`DELETE /api/progress/${TEST_SLUG} removes progress`, async () => {
    const res = await del(`/api/progress/${TEST_SLUG}`);
    expect(res.status, 'status').toBeOneOf(200, 204);
  });
} else {
  console.log('  ⚠ No books — skipping progress tests');
}

// ── 8. Library ────────────────────────────────────────────────────────────────
console.log('\n8. Library');

await assert('GET /api/library returns object', async () => {
  const res = await get('/api/library', { followRedirects: true });
  expect(res.status, 'status').toBe(200);
  const data = await res.json();
  expect(typeof data, 'data type').toBe('object');
});

if (TEST_SLUG) {
  await assert(`POST /api/library/${TEST_SLUG} saves book`, async () => {
    const res = await post(`/api/library/${TEST_SLUG}`, {});
    expect(res.status, 'status').toBeOneOf(200, 201);
  });

  await assert(`DELETE /api/library/${TEST_SLUG} removes book`, async () => {
    const res = await del(`/api/library/${TEST_SLUG}`);
    expect(res.status, 'status').toBeOneOf(200, 204);
  });
}

// ── 9. Settings ───────────────────────────────────────────────────────────────
console.log('\n9. Settings');

await assert('GET /api/settings returns settings object', async () => {
  const res = await get('/api/settings', { followRedirects: true });
  expect(res.status, 'status').toBe(200);
  const data = await res.json();
  expect(typeof data, 'data type').toBe('object');
});

// ── 10. Sessions ──────────────────────────────────────────────────────────────
console.log('\n10. Sessions');

await assert('GET /api/sessions (authenticated) returns sessions array', async () => {
  const res = await get('/api/sessions', { followRedirects: true, headers: { Cookie: authCookie } });
  expect(res.status, 'status').toBe(200);
  const data = await res.json();
  // Returns { sessions: [...] }
  expect(data, 'data').toContainKey('sessions');
  expect(data.sessions, 'sessions').toBeArray();
});

// ── 11. Comments ──────────────────────────────────────────────────────────────
console.log('\n11. Comments');

if (TEST_SLUG) {
  await assert(`GET /api/comments/${TEST_SLUG} returns comments`, async () => {
    const res = await get(`/api/comments/${TEST_SLUG}`, { followRedirects: true });
    expect(res.status, 'status').toBe(200);
    const data = await res.json();
    // Returns { comments: [...], myVotes: {}, avatarUrls: {} }
|
||||
expect(data, 'data').toContainKey('comments');
|
||||
expect(data.comments, 'comments').toBeArray();
|
||||
});
|
||||
}
|
||||
|
||||
// ── 12. Chapter endpoints ─────────────────────────────────────────────────────
|
||||
console.log('\n12. Chapter endpoints');
|
||||
|
||||
if (TEST_SLUG) {
|
||||
await assert(`GET /api/chapter/${TEST_SLUG}/1 returns 200 or 404`, async () => {
|
||||
const res = await get(`/api/chapter/${TEST_SLUG}/1`, { followRedirects: true });
|
||||
// 200 if chapter exists in MinIO, 404 if not
|
||||
expect(res.status, 'status').toBeOneOf(200, 404);
|
||||
});
|
||||
|
||||
await assert(`GET /api/chapter-text-preview/${TEST_SLUG}/1 returns 200 or error`, async () => {
|
||||
const res = await get(`/api/chapter-text-preview/${TEST_SLUG}/1`, { followRedirects: true });
|
||||
expect(res.status, 'status').toBeOneOf(200, 404, 500, 503);
|
||||
});
|
||||
}
|
||||
|
||||
// ── Summary ───────────────────────────────────────────────────────────────────
|
||||
console.log('\n─────────────────────────────────────');
|
||||
console.log(`Results: ${passed} passed, ${failed} failed`);
|
||||
|
||||
if (failures.length > 0) {
|
||||
console.log('\nFailures:');
|
||||
for (const f of failures) {
|
||||
console.log(` ✗ ${f.name}`);
|
||||
console.log(` ${f.reason}`);
|
||||
}
|
||||
}
|
||||
|
||||
console.log('─────────────────────────────────────\n');
|
||||
process.exit(failed > 0 ? 1 : 0);
|
||||
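For reference, the `expect()` matchers used throughout this script (`toBe`, `toBeOneOf`, `toContainKey`, `toBeArray`, `toBeAbove`) come from a small hand-rolled harness defined earlier in the file rather than a test framework. A minimal sketch consistent with the call sites above (names inferred from usage; the real implementation may differ):

```ts
// Minimal expect() sketch matching the matchers used above.
// Assumption: the real helpers live earlier in this script; this is illustrative only.
function expect(actual: unknown, label: string) {
  const fail = (msg: string): never => {
    throw new Error(`${label}: ${msg} (got ${JSON.stringify(actual)})`);
  };
  return {
    toBe(v: unknown) { if (actual !== v) fail(`expected ${JSON.stringify(v)}`); },
    toBeOneOf(...vs: unknown[]) { if (!vs.includes(actual)) fail(`expected one of ${vs.join(', ')}`); },
    toContainKey(k: string) {
      if (typeof actual !== 'object' || actual === null || !(k in actual)) fail(`expected key "${k}"`);
    },
    toBeArray() { if (!Array.isArray(actual)) fail('expected an array'); },
    toBeAbove(n: number) { if (typeof actual !== 'number' || actual <= n) fail(`expected > ${n}`); },
  };
}
```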
250
scripts/pb-init-v3.sh
Executable file
@@ -0,0 +1,250 @@
#!/bin/sh
# pb-init-v3.sh — idempotent PocketBase bootstrap for the v3 stack.
#
# Safe to re-run: existing collections and fields are silently skipped.
#
# Env vars (defaults match docker-compose.yml):
#   POCKETBASE_URL             http://pocketbase:8090
#   POCKETBASE_ADMIN_EMAIL     admin@libnovel.local
#   POCKETBASE_ADMIN_PASSWORD  changeme123

set -e

PB="${POCKETBASE_URL:-http://pocketbase:8090}"
EMAIL="${POCKETBASE_ADMIN_EMAIL:-admin@libnovel.local}"
PASS="${POCKETBASE_ADMIN_PASSWORD:-changeme123}"

log() { printf '[pb-init] %s\n' "$*"; }

# ── 0. Ensure dependencies ────────────────────────────────────────────────────
command -v curl > /dev/null 2>&1 || apk add --no-cache curl > /dev/null 2>&1
command -v python3 > /dev/null 2>&1 || apk add --no-cache python3 > /dev/null 2>&1

# ── 1. Wait for PocketBase ────────────────────────────────────────────────────
log "waiting for PocketBase..."
until curl -sf "$PB/api/health" > /dev/null 2>&1; do sleep 2; done
log "PocketBase ready"

# ── 2. Bootstrap superuser (first-run only) ───────────────────────────────────
LOCATION=$(curl -sf -o /dev/null -w "%{redirect_url}" "$PB/_/" 2>/dev/null || true)
if echo "$LOCATION" | grep -q "pbinstal/"; then
  TOKEN=$(echo "$LOCATION" | sed 's|.*pbinstal/||' | tr -d ' \r\n')
  curl -sf -X POST "$PB/api/collections/_superusers/records" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $TOKEN" \
    -d "{\"email\":\"$EMAIL\",\"password\":\"$PASS\",\"passwordConfirm\":\"$PASS\"}" \
    > /dev/null 2>&1 || true
  log "superuser created"
fi

# ── 3. Authenticate ───────────────────────────────────────────────────────────
AUTH=$(curl -sf -X POST "$PB/api/collections/_superusers/auth-with-password" \
  -H "Content-Type: application/json" \
  -d "{\"identity\":\"$EMAIL\",\"password\":\"$PASS\"}")
TOK=$(echo "$AUTH" | sed 's/.*"token":"\([^"]*\)".*/\1/')
# If sed found no token the substitution leaves the input unchanged.
if [ -z "$TOK" ] || [ "$TOK" = "$AUTH" ]; then log "ERROR: auth failed"; exit 1; fi
log "authenticated"

# ── Helpers ───────────────────────────────────────────────────────────────────

# create NAME BODY — POST collection; 400/422 = already exists, treated as ok.
create() {
  NAME="$1"; BODY="$2"
  STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
    -X POST "$PB/api/collections" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $TOK" \
    -d "$BODY")
  case "$STATUS" in
    200|201) log "created: $NAME" ;;
    400|422) log "exists (skip): $NAME" ;;
    *)       log "WARNING: $NAME returned $STATUS" ;;
  esac
}

# add_field COLLECTION FIELD_NAME FIELD_TYPE
# Fetches current schema, appends field if absent, PATCHes collection.
# Requires python3 for safe JSON manipulation.
add_field() {
  COLL="$1"; FIELD="$2"; TYPE="$3"
  SCHEMA=$(curl -sf -H "Authorization: Bearer $TOK" "$PB/api/collections/$COLL" 2>/dev/null)
  # Check existence and extract collection id + fields via python3
  PARSED=$(echo "$SCHEMA" | python3 -c "
import sys, json
d = json.load(sys.stdin)
fields = d.get('fields', [])
exists = any(f.get('name') == '$FIELD' for f in fields)
print('exists=' + str(exists))
print('id=' + d.get('id', ''))
if not exists:
    fields.append({'name': '$FIELD', 'type': '$TYPE'})
    print('fields=' + json.dumps(fields))
" 2>/dev/null)
  if echo "$PARSED" | grep -q "^exists=True"; then
    log "field exists (skip): $COLL.$FIELD"; return
  fi
  COLL_ID=$(echo "$PARSED" | grep "^id=" | sed 's/^id=//')
  [ -z "$COLL_ID" ] && { log "WARNING: cannot resolve id for $COLL"; return; }
  NEW_FIELDS=$(echo "$PARSED" | grep "^fields=" | sed 's/^fields=//')
  STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
    -X PATCH "$PB/api/collections/$COLL_ID" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $TOK" \
    -d "{\"fields\":${NEW_FIELDS}}")
  case "$STATUS" in
    200|201) log "added field: $COLL.$FIELD ($TYPE)" ;;
    *)       log "WARNING: add_field $COLL.$FIELD returned $STATUS" ;;
  esac
}

# ── 4. Collections ────────────────────────────────────────────────────────────

create "books" '{
  "name":"books","type":"base","fields":[
    {"name":"slug",           "type":"text",   "required":true},
    {"name":"title",          "type":"text",   "required":true},
    {"name":"author",         "type":"text"},
    {"name":"cover",          "type":"text"},
    {"name":"status",         "type":"text"},
    {"name":"genres",         "type":"json"},
    {"name":"summary",        "type":"text"},
    {"name":"total_chapters", "type":"number"},
    {"name":"source_url",     "type":"text"},
    {"name":"ranking",        "type":"number"},
    {"name":"meta_updated",   "type":"text"}
  ]}'

create "chapters_idx" '{
  "name":"chapters_idx","type":"base","fields":[
    {"name":"slug",   "type":"text",   "required":true},
    {"name":"number", "type":"number", "required":true},
    {"name":"title",  "type":"text"}
  ]}'

create "ranking" '{
  "name":"ranking","type":"base","fields":[
    {"name":"rank",       "type":"number","required":true},
    {"name":"slug",       "type":"text",  "required":true},
    {"name":"title",      "type":"text"},
    {"name":"author",     "type":"text"},
    {"name":"cover",      "type":"text"},
    {"name":"status",     "type":"text"},
    {"name":"genres",     "type":"json"},
    {"name":"source_url", "type":"text"}
  ]}'

create "progress" '{
  "name":"progress","type":"base","fields":[
    {"name":"session_id", "type":"text",  "required":true},
    {"name":"slug",       "type":"text",  "required":true},
    {"name":"chapter",    "type":"number"},
    {"name":"user_id",    "type":"text"},
    {"name":"audio_time", "type":"number"},
    {"name":"updated",    "type":"text"}
  ]}'

create "scraping_tasks" '{
  "name":"scraping_tasks","type":"base","fields":[
    {"name":"kind",             "type":"text"},
    {"name":"target_url",       "type":"text"},
    {"name":"from_chapter",     "type":"number"},
    {"name":"to_chapter",       "type":"number"},
    {"name":"worker_id",        "type":"text"},
    {"name":"status",           "type":"text","required":true},
    {"name":"books_found",      "type":"number"},
    {"name":"chapters_scraped", "type":"number"},
    {"name":"chapters_skipped", "type":"number"},
    {"name":"errors",           "type":"number"},
    {"name":"error_message",    "type":"text"},
    {"name":"started",          "type":"date"},
    {"name":"finished",         "type":"date"},
    {"name":"heartbeat_at",     "type":"date"}
  ]}'

create "audio_jobs" '{
  "name":"audio_jobs","type":"base","fields":[
    {"name":"cache_key",     "type":"text",  "required":true},
    {"name":"slug",          "type":"text",  "required":true},
    {"name":"chapter",       "type":"number","required":true},
    {"name":"voice",         "type":"text"},
    {"name":"worker_id",     "type":"text"},
    {"name":"status",        "type":"text",  "required":true},
    {"name":"error_message", "type":"text"},
    {"name":"started",       "type":"date"},
    {"name":"finished",      "type":"date"},
    {"name":"heartbeat_at",  "type":"date"}
  ]}'

create "app_users" '{
  "name":"app_users","type":"base","fields":[
    {"name":"username",      "type":"text","required":true},
    {"name":"password_hash", "type":"text"},
    {"name":"role",          "type":"text"},
    {"name":"avatar_url",    "type":"text"},
    {"name":"created",       "type":"text"}
  ]}'

create "user_sessions" '{
  "name":"user_sessions","type":"base","fields":[
    {"name":"user_id",    "type":"text","required":true},
    {"name":"session_id", "type":"text","required":true},
    {"name":"user_agent", "type":"text"},
    {"name":"ip",         "type":"text"},
    {"name":"created_at", "type":"text"},
    {"name":"last_seen",  "type":"text"}
  ]}'

create "user_library" '{
  "name":"user_library","type":"base","fields":[
    {"name":"session_id", "type":"text","required":true},
    {"name":"user_id",    "type":"text"},
    {"name":"slug",       "type":"text","required":true},
    {"name":"saved_at",   "type":"text"}
  ]}'

create "user_settings" '{
  "name":"user_settings","type":"base","fields":[
    {"name":"session_id", "type":"text","required":true},
    {"name":"user_id",    "type":"text"},
    {"name":"auto_next",  "type":"bool"},
    {"name":"voice",      "type":"text"},
    {"name":"speed",      "type":"number"},
    {"name":"updated",    "type":"text"}
  ]}'

create "user_subscriptions" '{
  "name":"user_subscriptions","type":"base","fields":[
    {"name":"follower_id", "type":"text","required":true},
    {"name":"followee_id", "type":"text","required":true},
    {"name":"created",     "type":"text"}
  ]}'

create "book_comments" '{
  "name":"book_comments","type":"base","fields":[
    {"name":"slug",      "type":"text","required":true},
    {"name":"user_id",   "type":"text"},
    {"name":"username",  "type":"text"},
    {"name":"body",      "type":"text"},
    {"name":"upvotes",   "type":"number"},
    {"name":"downvotes", "type":"number"},
    {"name":"parent_id", "type":"text"},
    {"name":"created",   "type":"text"}
  ]}'

create "comment_votes" '{
  "name":"comment_votes","type":"base","fields":[
    {"name":"comment_id", "type":"text","required":true},
    {"name":"user_id",    "type":"text"},
    {"name":"session_id", "type":"text"},
    {"name":"vote",       "type":"text"}
  ]}'

# ── 5. Field migrations (idempotent — adds fields missing from older installs) ─
add_field "scraping_tasks" "heartbeat_at" "date"
add_field "audio_jobs"     "heartbeat_at" "date"
add_field "progress"       "user_id"      "text"
add_field "progress"       "audio_time"   "number"
add_field "progress"       "updated"      "text"
add_field "books"          "meta_updated" "text"

log "done"
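Once the collections exist, server code can read them through PocketBase's standard records API. A short TypeScript sketch under that assumption; the token is a superuser token obtained the same way the script obtains one, and the function name is illustrative:

```ts
// Illustrative: list records from the `books` collection created above.
const PB = process.env.POCKETBASE_URL ?? 'http://pocketbase:8090';

async function listBooks(token: string) {
  const res = await fetch(`${PB}/api/collections/books/records?perPage=50`, {
    headers: { Authorization: `Bearer ${token}` },
  });
  if (!res.ok) throw new Error(`PocketBase returned ${res.status}`);
  const data = await res.json();
  // PocketBase wraps list results as { page, perPage, totalItems, totalPages, items }
  return data.items;
}
```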
5
ui/.dockerignore
Normal file
@@ -0,0 +1,5 @@
node_modules
build
.svelte-kit
.env
.env.*
20
ui/.env.example
Normal file
@@ -0,0 +1,20 @@
# libnovel UI — environment variables
# Copy to .env and adjust; do NOT commit with real secrets.

# Internal URL of the backend API (used by SvelteKit server-side load functions)
# In docker-compose this is the internal service name
BACKEND_API_URL=http://localhost:8080

# Public URL of PocketBase (used by SvelteKit server-side load functions)
POCKETBASE_URL=http://localhost:8090

# PocketBase admin credentials (server-side only, never exposed to browser)
POCKETBASE_ADMIN_EMAIL=admin@libnovel.local
POCKETBASE_ADMIN_PASSWORD=changeme123

# Public-facing MinIO URL (used to rewrite presigned URLs for the browser)
# In dev this is localhost; in prod set to your MinIO public domain
PUBLIC_MINIO_PUBLIC_URL=http://localhost:9000

# Secret used to sign auth tokens stored in cookies (generate with: openssl rand -hex 32)
AUTH_SECRET=change_this_to_a_long_random_secret
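For context, a token signed with `AUTH_SECRET` could be minted and checked along these lines. This is a hedged sketch using Node's built-in `crypto`; the app's actual signing scheme is not shown in this diff, and both helper names are hypothetical:

```ts
import { createHmac, timingSafeEqual } from 'node:crypto';

// Illustrative only — the real signing code lives in the app's auth module.
function sign(payload: string, secret: string): string {
  const mac = createHmac('sha256', secret).update(payload).digest('base64url');
  return `${Buffer.from(payload).toString('base64url')}.${mac}`;
}

function verify(token: string, secret: string): string | null {
  const [body, mac] = token.split('.');
  if (!body || !mac) return null;
  const payload = Buffer.from(body, 'base64url').toString();
  const expected = createHmac('sha256', secret).update(payload).digest('base64url');
  const a = Buffer.from(mac);
  const b = Buffer.from(expected);
  // Constant-time compare; lengths must match or timingSafeEqual throws.
  return a.length === b.length && timingSafeEqual(a, b) ? payload : null;
}
```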
23
ui/.gitignore
vendored
Normal file
@@ -0,0 +1,23 @@
node_modules

# Output
.output
.vercel
.netlify
.wrangler
/.svelte-kit
/build

# OS
.DS_Store
Thumbs.db

# Env
.env
.env.*
!.env.example
!.env.test

# Vite
vite.config.js.timestamp-*
vite.config.ts.timestamp-*
71
ui/AGENTS.md
Normal file
@@ -0,0 +1,71 @@
# LibNovel UI — Agent Context

SvelteKit 2 + Svelte 5 frontend. Node adapter for production; served behind Caddy.

## Design System

**ACTIVE_STYLE: branded**

Custom amber + zinc dark palette. No shadcn CLI defaults — primitives are hand-authored to match.

| Token | Value |
|-------|-------|
| Accent | `#f59e0b` (amber-500) |
| Surface-1 | zinc-900 |
| Surface-2 | zinc-800 |
| Surface-3 | zinc-700 |
| Text-primary | zinc-100 |
| Text-secondary | zinc-400 |
| Destructive | red-400 |

## Tailwind

**Version: 4** — configured entirely via `@theme {}` in `src/app.css` and the `@tailwindcss/vite` plugin.

**There is no `tailwind.config.ts`** — do not create one.
**Do not run `npx shadcn-svelte add ...`** — primitives are hand-authored in `$lib/components/ui/`.

## shadcn-svelte Primitives

All in `src/lib/components/ui/`:

| Component | Variants / Notes |
|-----------|-----------------|
| `button` | default / secondary / outline / ghost / destructive / link; sizes: default / sm / lg / icon |
| `badge` | default / secondary / outline / destructive |
| `card` | Card, CardHeader, CardTitle, CardDescription, CardContent, CardFooter |
| `textarea` | bindable `value` prop |
| `dialog` | Dialog, DialogContent, DialogHeader, DialogTitle, DialogFooter |
| `separator` | horizontal / vertical |

Always use `cn()` from `$lib/utils` — never template-literal class conditionals.
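For illustration, the intended pattern looks like this (hypothetical snippet, not code from the repo):

```ts
import { cn } from '$lib/utils';

// Hypothetical: compose conditional classes with cn(),
// not with template literals like `btn ${active ? 'on' : ''}`.
const active = true;
const className = cn(
  'rounded-md px-3 py-2 text-zinc-100',
  active && 'bg-amber-500 text-zinc-900',
  !active && 'bg-zinc-800 hover:bg-zinc-700',
);
```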

## Svelte 5 Conventions

- Runes (`$state`, `$derived`, `$effect`, `$props()`) for all new code; see the sketch after this list.
- Do not add `ObservableObject` / `@Published` — they don't exist in Svelte; don't introduce legacy `writable` stores for new code.
- Navigation: `goto()` from `$app/navigation`; `page` from `$app/state`.
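A minimal sketch of the runes style referred to above (illustrative; not a file in this repo):

```ts
// counter.svelte.ts — hypothetical runes-based store class.
export class Counter {
  count = $state(0);
  doubled = $derived(this.count * 2);

  increment() {
    this.count += 1;
  }
}
```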
## iOS/UX Skill

For any view work, load the `ios-ux` skill at task start:
```
skill({ name: "ios-ux" })
```

## Key Files

| File | Role |
|------|------|
| `src/app.css` | Tailwind v4 `@theme` tokens — source of truth for all design tokens |
| `src/lib/utils.ts` | `cn()` helper (clsx + tailwind-merge) |
| `src/lib/types.ts` | Shared client-safe domain types |
| `src/routes/+layout.svelte` | Root layout: sticky nav, persistent `<audio>` element, mini-player bar |
| `src/lib/audio.svelte.ts` | Global audio store (Svelte 5 runes class) |
| `src/lib/components/ui/` | shadcn-style UI primitives |

## Persistent Audio Element

The `<audio>` element in `+layout.svelte` **must never be conditionally rendered** (no `{#if}`).
Conditional rendering destroys and recreates the element, triggering `onpause` mid-playback.
Keep it always in the DOM; control it via `audioStore`.
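A sketch of how such a store could look (the shape is assumed; the real implementation is `src/lib/audio.svelte.ts`):

```ts
// Hypothetical shape of the global audio store. The <audio> element
// in +layout.svelte stays mounted and binds its reference here.
class AudioStore {
  element: HTMLAudioElement | null = $state(null);
  src = $state('');
  playing = $state(false);

  play(src: string) {
    this.src = src;
    // The element is always in the DOM, so this is non-null after layout mount.
    this.element?.play();
    this.playing = true;
  }

  pause() {
    this.element?.pause();
    this.playing = false;
  }
}

export const audioStore = new AudioStore();
```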
44
ui/Dockerfile
Normal file
@@ -0,0 +1,44 @@
# syntax=docker/dockerfile:1
FROM node:22-alpine AS builder
WORKDIR /app

# Install dependencies in a separate layer so it is cached as long as
# package-lock.json does not change. The npm cache mount persists the
# ~/.npm cache across builds so packages are not re-downloaded.
COPY package.json package-lock.json ./
RUN --mount=type=cache,target=/root/.npm \
    npm ci

COPY . .

# Build-time version info — injected by docker-compose or CI via --build-arg.
ARG BUILD_VERSION=dev
ARG BUILD_COMMIT=unknown

# Expose as PUBLIC_ env vars so SvelteKit's $env/dynamic/public can read them.
ENV PUBLIC_BUILD_VERSION=$BUILD_VERSION
ENV PUBLIC_BUILD_COMMIT=$BUILD_COMMIT

RUN npm run build

# ── Runtime image ──────────────────────────────────────────────────────────────
# adapter-node bundles most server-side code, but packages with dynamic
# requires or native bindings (e.g. ioredis) are not inlined by Rollup and
# must be present in node_modules at runtime. We install only production
# deps (no devDependencies) to keep the image small.
FROM node:22-alpine
WORKDIR /app

COPY --from=builder /app/build ./build
COPY --from=builder /app/package.json ./package.json
COPY --from=builder /app/package-lock.json ./package-lock.json

RUN --mount=type=cache,target=/root/.npm \
    npm ci --omit=dev

ENV NODE_ENV=production
ENV PORT=3000
ENV HOST=0.0.0.0

EXPOSE $PORT
CMD ["node", "build"]
42
ui/README.md
Normal file
@@ -0,0 +1,42 @@
# sv

Everything you need to build a Svelte project, powered by [`sv`](https://github.com/sveltejs/cli).

## Creating a project

If you're seeing this, you've probably already done this step. Congrats!

```sh
# create a new project
npx sv create my-app
```

To recreate this project with the same configuration:

```sh
# recreate this project
npx sv@0.12.4 create --template minimal --types ts --install npm ui
```

## Developing

Once you've created a project and installed dependencies with `npm install` (or `pnpm install` or `yarn`), start a development server:

```sh
npm run dev

# or start the server and open the app in a new browser tab
npm run dev -- --open
```

## Building

To create a production version of your app:

```sh
npm run build
```

You can preview the production build with `npm run preview`.

> To deploy your app, you may need to install an [adapter](https://svelte.dev/docs/kit/adapters) for your target environment.
17
ui/components.json
Normal file
@@ -0,0 +1,17 @@
{
  "$schema": "https://shadcn-svelte.com/schema.json",
  "style": "default",
  "tailwind": {
    "config": "",
    "css": "src/app.css",
    "baseColor": "zinc",
    "cssVariables": false
  },
  "aliases": {
    "components": "$lib/components",
    "utils": "$lib/utils",
    "ui": "$lib/components/ui",
    "hooks": "$lib/hooks"
  },
  "typescript": true
}
6301
ui/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Some files were not shown because too many files have changed in this diff.