Files
libnovel/homelab/otel/grafana/provisioning/alerting/rules.yaml
Admin fe1a933fd0
Some checks failed
CI / Docker / caddy (pull_request) Failing after 50s
CI / Check ui (pull_request) Successful in 1m5s
Release / Check ui (push) Successful in 45s
Release / Test backend (push) Successful in 1m29s
CI / Docker / ui (pull_request) Successful in 1m28s
Release / Docker / backend (push) Successful in 3m14s
CI / Test backend (pull_request) Failing after 42s
CI / Docker / backend (pull_request) Has been skipped
CI / Docker / runner (pull_request) Has been skipped
Release / Docker / caddy (push) Successful in 6m48s
Release / Docker / ui (push) Successful in 2m8s
Release / Docker / runner (push) Successful in 2m51s
Release / Upload source maps (push) Failing after 53s
Release / Gitea Release (push) Has been skipped
feat(queue): replace PocketBase polling with Asynq + Redis
Introduce a Redis-backed Asynq task queue so the runner consumes TTS
jobs pushed by the backend instead of polling PocketBase.

- backend/internal/asynqqueue: Producer and Consumer wrappers
- backend/internal/runner: AsynqRunner mux, per-instance Prometheus
  registry (fixes duplicate-collector panic in tests), redisConnOpt
- backend/internal/config: REDIS_ADDR / REDIS_PASSWORD env vars
- backend/cmd/{backend,runner}/main.go: wire Redis when env set; fall
  back to legacy poll mode when unset
- Caddyfile: caddy-l4 TCP proxy for redis.libnovel.cc:6380 → homelab
- caddy/Dockerfile: add --with github.com/mholt/caddy-l4
- docker-compose.yml: Caddy exposes 6380, backend/runner get Redis env
- homelab/runner/docker-compose.yml: Redis sidecar, runner depends_on
- homelab/otel/grafana: Grafana dashboards (backend, catalogue, runner)
  and alerting rules / contact-points provisioning
2026-03-28 14:32:40 +05:00

215 lines
7.5 KiB
YAML

# Grafana alerting provisioning — alert rules
# Covers: runner down, high task failure rate, audio error spike, backend error spike.
apiVersion: 1
groups:
- orgId: 1
name: LibNovel Runner
folder: LibNovel
interval: 1m
rules:
- uid: runner-down
title: Runner Down
condition: C
for: 2m
annotations:
summary: "LibNovel runner is not reachable"
description: "The Prometheus scrape of runner:9091 has been failing for >2 minutes. Tasks are not being processed."
labels:
severity: critical
service: runner
data:
- refId: A
datasourceUid: prometheus
relativeTimeRange: { from: 300, to: 0 }
model:
expr: "up{job=\"libnovel-runner\"}"
instant: true
intervalMs: 1000
maxDataPoints: 43200
- refId: C
datasourceUid: __expr__
relativeTimeRange: { from: 300, to: 0 }
model:
type: classic_conditions
conditions:
- evaluator: { params: [1], type: lt }
operator: { type: and }
query: { params: [A] }
reducer: { params: [], type: last }
- uid: runner-high-failure-rate
title: Runner High Task Failure Rate
condition: C
for: 5m
annotations:
summary: "Runner task failure rate is above 20%"
description: "More than 20% of runner tasks have been failing for the last 5 minutes. Check runner logs."
labels:
severity: warning
service: runner
data:
- refId: A
datasourceUid: prometheus
relativeTimeRange: { from: 600, to: 0 }
model:
expr: "rate(libnovel_runner_tasks_failed_total[5m]) / clamp_min(rate(libnovel_runner_tasks_completed_total[5m]) + rate(libnovel_runner_tasks_failed_total[5m]), 0.001)"
instant: true
intervalMs: 1000
maxDataPoints: 43200
- refId: C
datasourceUid: __expr__
relativeTimeRange: { from: 600, to: 0 }
model:
type: classic_conditions
conditions:
- evaluator: { params: [0.2], type: gt }
operator: { type: and }
query: { params: [A] }
reducer: { params: [], type: last }
- uid: runner-tasks-stalled
title: Runner Tasks Stalled
condition: C
for: 10m
annotations:
summary: "Runner has tasks running for >10 minutes with no completions"
description: "tasks_running > 0 but rate(tasks_completed) is 0. Tasks may be stuck or the runner is in a crash loop."
labels:
severity: warning
service: runner
data:
- refId: Running
datasourceUid: prometheus
relativeTimeRange: { from: 900, to: 0 }
model:
expr: "libnovel_runner_tasks_running"
instant: true
intervalMs: 1000
maxDataPoints: 43200
- refId: Rate
datasourceUid: prometheus
relativeTimeRange: { from: 900, to: 0 }
model:
expr: "rate(libnovel_runner_tasks_completed_total[10m])"
instant: true
intervalMs: 1000
maxDataPoints: 43200
- refId: C
datasourceUid: __expr__
relativeTimeRange: { from: 900, to: 0 }
model:
type: classic_conditions
conditions:
- evaluator: { params: [0], type: gt }
operator: { type: and }
query: { params: [Running] }
reducer: { params: [], type: last }
- evaluator: { params: [0.001], type: lt }
operator: { type: and }
query: { params: [Rate] }
reducer: { params: [], type: last }
- orgId: 1
name: LibNovel Backend
folder: LibNovel
interval: 1m
rules:
- uid: backend-high-error-rate
title: Backend High Error Rate
condition: C
for: 5m
annotations:
summary: "Backend API error rate above 5%"
description: "More than 5% of backend HTTP requests are returning 5xx status codes (as seen from UI OTel instrumentation)."
labels:
severity: warning
service: backend
data:
- refId: A
datasourceUid: prometheus
relativeTimeRange: { from: 600, to: 0 }
model:
expr: "sum(rate(http_client_request_duration_seconds_count{job=\"ui\", server_address=\"backend\", http_response_status_code=~\"5..\"}[5m])) / clamp_min(sum(rate(http_client_request_duration_seconds_count{job=\"ui\", server_address=\"backend\"}[5m])), 0.001)"
instant: true
intervalMs: 1000
maxDataPoints: 43200
- refId: C
datasourceUid: __expr__
relativeTimeRange: { from: 600, to: 0 }
model:
type: classic_conditions
conditions:
- evaluator: { params: [0.05], type: gt }
operator: { type: and }
query: { params: [A] }
reducer: { params: [], type: last }
- uid: backend-high-p95-latency
title: Backend High p95 Latency
condition: C
for: 5m
annotations:
summary: "Backend p95 latency above 2s"
description: "95th percentile latency of backend spans has exceeded 2 seconds for >5 minutes."
labels:
severity: warning
service: backend
data:
- refId: A
datasourceUid: prometheus
relativeTimeRange: { from: 600, to: 0 }
model:
expr: "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket{service=\"backend\"}[5m])) by (le))"
instant: true
intervalMs: 1000
maxDataPoints: 43200
- refId: C
datasourceUid: __expr__
relativeTimeRange: { from: 600, to: 0 }
model:
type: classic_conditions
conditions:
- evaluator: { params: [2], type: gt }
operator: { type: and }
query: { params: [A] }
reducer: { params: [], type: last }
- orgId: 1
name: LibNovel OTel Pipeline
folder: LibNovel
interval: 2m
rules:
- uid: otel-collector-down
title: OTel Collector Down
condition: C
for: 3m
annotations:
summary: "OTel collector is not reachable"
description: "Prometheus cannot scrape otel-collector:8888. Traces and logs may be dropping."
labels:
severity: warning
service: otel-collector
data:
- refId: A
datasourceUid: prometheus
relativeTimeRange: { from: 600, to: 0 }
model:
expr: "up{job=\"otel-collector\"}"
instant: true
intervalMs: 1000
maxDataPoints: 43200
- refId: C
datasourceUid: __expr__
relativeTimeRange: { from: 600, to: 0 }
model:
type: classic_conditions
conditions:
- evaluator: { params: [1], type: lt }
operator: { type: and }
query: { params: [A] }
reducer: { params: [], type: last }