Introduce a Redis-backed Asynq task queue so the runner consumes TTS
jobs pushed by the backend instead of polling PocketBase.
- backend/internal/asynqqueue: Producer and Consumer wrappers
- backend/internal/runner: AsynqRunner mux, per-instance Prometheus
registry (fixes duplicate-collector panic in tests), redisConnOpt
- backend/internal/config: REDIS_ADDR / REDIS_PASSWORD env vars
- backend/cmd/{backend,runner}/main.go: wire Redis when the env vars are
  set; fall back to legacy poll mode when unset (sketched below)
- Caddyfile: caddy-l4 TCP proxy for redis.libnovel.cc:6380 → homelab
- caddy/Dockerfile: add --with github.com/mholt/caddy-l4
- docker-compose.yml: Caddy exposes 6380, backend/runner get Redis env
- homelab/runner/docker-compose.yml: Redis sidecar, runner depends_on
- homelab/otel/grafana: Grafana dashboards (backend, catalogue, runner)
and alerting rules / contact-points provisioning
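A minimal sketch of the new enqueue/consume path, assuming the hibiken/asynq API. The task type "tts:synthesize" and the payload shape are illustrative stand-ins, not the repo's actual definitions in backend/internal/asynqqueue:

// Sketch only: task type name and payload fields are hypothetical.
package main

import (
	"context"
	"encoding/json"
	"log"
	"os"

	"github.com/hibiken/asynq"
)

const typeTTS = "tts:synthesize" // hypothetical task type name

type ttsPayload struct {
	ChapterID string `json:"chapter_id"` // hypothetical payload field
}

// enqueueTTS is roughly what the backend-side Producer wrapper does:
// serialize the job and push it onto Redis.
func enqueueTTS(client *asynq.Client, chapterID string) error {
	p, err := json.Marshal(ttsPayload{ChapterID: chapterID})
	if err != nil {
		return err
	}
	_, err = client.Enqueue(asynq.NewTask(typeTTS, p), asynq.MaxRetry(3))
	return err
}

// handleTTS is the runner-side handler registered on the AsynqRunner mux.
func handleTTS(ctx context.Context, t *asynq.Task) error {
	var p ttsPayload
	if err := json.Unmarshal(t.Payload(), &p); err != nil {
		return err
	}
	// ... synthesize audio for p.ChapterID ...
	return nil
}

func main() {
	addr := os.Getenv("REDIS_ADDR")
	if addr == "" {
		log.Println("REDIS_ADDR unset; falling back to legacy PocketBase polling")
		return // the real runner would start the poll loop here
	}
	opt := asynq.RedisClientOpt{Addr: addr, Password: os.Getenv("REDIS_PASSWORD")}

	srv := asynq.NewServer(opt, asynq.Config{Concurrency: 1})
	mux := asynq.NewServeMux()
	mux.HandleFunc(typeTTS, handleTTS)
	if err := srv.Run(mux); err != nil {
		log.Fatal(err)
	}
}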
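The per-instance Prometheus registry mentioned in the runner bullet comes down to the pattern below; this is a sketch against prometheus/client_golang, where only the metric names are taken from the alert rules and the wrapper itself is assumed:

// Sketch only: the actual wrapper in backend/internal/runner may differ.
package runner

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Metrics owns one registry per runner instance instead of using
// prometheus.DefaultRegisterer, so constructing several runners in one
// test binary no longer panics with "duplicate metrics collector
// registration attempted".
type Metrics struct {
	Registry       *prometheus.Registry
	TasksCompleted prometheus.Counter
	TasksFailed    prometheus.Counter
	TasksRunning   prometheus.Gauge
}

func NewMetrics() *Metrics {
	reg := prometheus.NewRegistry()
	return &Metrics{
		Registry: reg,
		TasksCompleted: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "libnovel_runner_tasks_completed_total",
			Help: "Tasks the runner finished successfully.",
		}),
		TasksFailed: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "libnovel_runner_tasks_failed_total",
			Help: "Tasks that ended in an error.",
		}),
		TasksRunning: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "libnovel_runner_tasks_running",
			Help: "Tasks currently in flight.",
		}),
	}
}

// Handler exposes only this instance's metrics (scraped on runner:9091).
func (m *Metrics) Handler() http.Handler {
	return promhttp.HandlerFor(m.Registry, promhttp.HandlerOpts{})
}

These are the same libnovel_runner_tasks_* series the alerting provisioning file below queries, so the names line up with the rules.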
# Grafana alerting provisioning — alert rules
# Covers: runner down, high task failure rate, stalled tasks, backend error
# rate and p95 latency, OTel collector down.
apiVersion: 1

groups:
  - orgId: 1
    name: LibNovel Runner
    folder: LibNovel
    interval: 1m
    rules:
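      # "up" is 1 while the scrape succeeds; the classic condition fires when
      # the most recent sample drops below 1.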
      - uid: runner-down
        title: Runner Down
        condition: C
        for: 2m
        annotations:
          summary: "LibNovel runner is not reachable"
          description: "The Prometheus scrape of runner:9091 has been failing for >2 minutes. Tasks are not being processed."
        labels:
          severity: critical
          service: runner
        data:
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange: { from: 300, to: 0 }
            model:
              expr: "up{job=\"libnovel-runner\"}"
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange: { from: 300, to: 0 }
            model:
              type: classic_conditions
              conditions:
                - evaluator: { params: [1], type: lt }
                  operator: { type: and }
                  query: { params: [A] }
                  reducer: { params: [], type: last }

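      # Failure ratio = failed / (failed + completed); clamp_min pins the
      # denominator at 0.001 so the division stays defined when no tasks ran.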
      - uid: runner-high-failure-rate
        title: Runner High Task Failure Rate
        condition: C
        for: 5m
        annotations:
          summary: "Runner task failure rate is above 20%"
          description: "More than 20% of runner tasks have been failing for the last 5 minutes. Check runner logs."
        labels:
          severity: warning
          service: runner
        data:
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange: { from: 600, to: 0 }
            model:
              expr: "rate(libnovel_runner_tasks_failed_total[5m]) / clamp_min(rate(libnovel_runner_tasks_completed_total[5m]) + rate(libnovel_runner_tasks_failed_total[5m]), 0.001)"
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange: { from: 600, to: 0 }
            model:
              type: classic_conditions
              conditions:
                - evaluator: { params: [0.2], type: gt }
                  operator: { type: and }
                  query: { params: [A] }
                  reducer: { params: [], type: last }

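      # Two ANDed conditions: tasks in flight (Running > 0) while the
      # completion rate is effectively zero (Rate < 0.001).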
      - uid: runner-tasks-stalled
        title: Runner Tasks Stalled
        condition: C
        for: 10m
        annotations:
          summary: "Runner has tasks running for >10 minutes with no completions"
          description: "tasks_running > 0 but rate(tasks_completed) is 0. Tasks may be stuck or the runner is in a crash loop."
        labels:
          severity: warning
          service: runner
        data:
          - refId: Running
            datasourceUid: prometheus
            relativeTimeRange: { from: 900, to: 0 }
            model:
              expr: "libnovel_runner_tasks_running"
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
          - refId: Rate
            datasourceUid: prometheus
            relativeTimeRange: { from: 900, to: 0 }
            model:
              expr: "rate(libnovel_runner_tasks_completed_total[10m])"
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange: { from: 900, to: 0 }
            model:
              type: classic_conditions
              conditions:
                - evaluator: { params: [0], type: gt }
                  operator: { type: and }
                  query: { params: [Running] }
                  reducer: { params: [], type: last }
                - evaluator: { params: [0.001], type: lt }
                  operator: { type: and }
                  query: { params: [Rate] }
                  reducer: { params: [], type: last }

  - orgId: 1
    name: LibNovel Backend
    folder: LibNovel
    interval: 1m
    rules:
      - uid: backend-high-error-rate
        title: Backend High Error Rate
        condition: C
        for: 5m
        annotations:
          summary: "Backend API error rate above 5%"
          description: "More than 5% of backend HTTP requests are returning 5xx status codes (as seen from UI OTel instrumentation)."
        labels:
          severity: warning
          service: backend
        data:
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange: { from: 600, to: 0 }
            model:
              expr: "sum(rate(http_client_request_duration_seconds_count{job=\"ui\", server_address=\"backend\", http_response_status_code=~\"5..\"}[5m])) / clamp_min(sum(rate(http_client_request_duration_seconds_count{job=\"ui\", server_address=\"backend\"}[5m])), 0.001)"
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange: { from: 600, to: 0 }
            model:
              type: classic_conditions
              conditions:
                - evaluator: { params: [0.05], type: gt }
                  operator: { type: and }
                  query: { params: [A] }
                  reducer: { params: [], type: last }

      - uid: backend-high-p95-latency
        title: Backend High p95 Latency
        condition: C
        for: 5m
        annotations:
          summary: "Backend p95 latency above 2s"
          description: "95th percentile latency of backend spans has exceeded 2 seconds for >5 minutes."
        labels:
          severity: warning
          service: backend
        data:
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange: { from: 600, to: 0 }
            model:
              expr: "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket{service=\"backend\"}[5m])) by (le))"
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange: { from: 600, to: 0 }
            model:
              type: classic_conditions
              conditions:
                - evaluator: { params: [2], type: gt }
                  operator: { type: and }
                  query: { params: [A] }
                  reducer: { params: [], type: last }

  - orgId: 1
    name: LibNovel OTel Pipeline
    folder: LibNovel
    interval: 2m
    rules:
      - uid: otel-collector-down
        title: OTel Collector Down
        condition: C
        for: 3m
        annotations:
          summary: "OTel collector is not reachable"
          description: "Prometheus cannot scrape otel-collector:8888. Traces and logs may be dropping."
        labels:
          severity: warning
          service: otel-collector
        data:
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange: { from: 600, to: 0 }
            model:
              expr: "up{job=\"otel-collector\"}"
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange: { from: 600, to: 0 }
            model:
              type: classic_conditions
              conditions:
                - evaluator: { params: [1], type: lt }
                  operator: { type: and }
                  query: { params: [A] }
                  reducer: { params: [], type: last }