libnovel/homelab/otel/grafana/provisioning/dashboards/runner.json

{
  "uid": "libnovel-runner",
  "title": "Runner Operations",
  "description": "Task queue health, throughput, TTS routing, and live logs for the homelab runner.",
  "tags": ["libnovel", "runner"],
  "timezone": "browser",
  "refresh": "30s",
  "time": { "from": "now-3h", "to": "now" },
  "schemaVersion": 39,
  "panels": [
    {
      "id": 1,
      "type": "stat",
      "title": "Tasks Running",
      "gridPos": { "x": 0, "y": 0, "w": 4, "h": 4 },
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"] },
        "colorMode": "background",
        "graphMode": "none",
        "textMode": "auto"
      },
      "fieldConfig": {
        "defaults": {
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 1 },
              { "color": "red", "value": 3 }
            ]
          },
          "mappings": []
        }
      },
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "runner_tasks_running",
          "legendFormat": "running",
          "instant": true
        }
      ]
    },
    {
      "id": 2,
      "type": "stat",
      "title": "Tasks Completed (total)",
      "description": "Latest value of runner_tasks_completed_total, with a sparkline of the same series over the dashboard time range.",
      "gridPos": { "x": 4, "y": 0, "w": 4, "h": 4 },
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"] },
        "colorMode": "background",
        "graphMode": "area",
        "textMode": "auto"
      },
      "fieldConfig": {
        "defaults": {
          "color": { "fixedColor": "green", "mode": "fixed" },
          "thresholds": { "mode": "absolute", "steps": [] }
        }
      },
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "runner_tasks_completed_total",
          "legendFormat": "completed",
          "instant": false,
          "range": true
        }
      ]
    },
    {
      "id": 3,
      "type": "stat",
      "title": "Tasks Failed (total)",
      "gridPos": { "x": 8, "y": 0, "w": 4, "h": 4 },
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"] },
        "colorMode": "background",
        "graphMode": "none",
        "textMode": "auto"
      },
      "fieldConfig": {
        "defaults": {
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 1 },
              { "color": "red", "value": 5 }
            ]
          }
        }
      },
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "runner_tasks_failed_total",
          "legendFormat": "failed",
          "instant": true
        }
      ]
    },
    {
      "id": 4,
      "type": "stat",
      "title": "Runner Uptime",
      "gridPos": { "x": 12, "y": 0, "w": 4, "h": 4 },
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"] },
        "colorMode": "value",
        "graphMode": "none",
        "textMode": "auto"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "s",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "red", "value": null },
              { "color": "yellow", "value": 60 },
              { "color": "green", "value": 300 }
            ]
          }
        }
      },
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "runner_uptime_seconds",
          "legendFormat": "uptime",
          "instant": true
        }
      ]
    },
    {
      "id": 5,
      "type": "stat",
      "title": "Task Failure Rate",
      "gridPos": { "x": 16, "y": 0, "w": 4, "h": 4 },
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"] },
        "colorMode": "background",
        "graphMode": "none",
        "textMode": "auto"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "percentunit",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 0.05 },
              { "color": "red", "value": 0.2 }
            ]
          }
        }
      },
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "runner_tasks_failed_total / clamp_min(runner_tasks_completed_total + runner_tasks_failed_total, 1)",
          "legendFormat": "failure rate",
          "instant": true
        }
      ]
    },
    {
      "id": 6,
      "type": "stat",
      "title": "Runner Alive",
      "gridPos": { "x": 20, "y": 0, "w": 4, "h": 4 },
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"] },
        "colorMode": "background",
        "graphMode": "none",
        "textMode": "auto"
      },
      "fieldConfig": {
        "defaults": {
          "mappings": [
            { "type": "value", "options": { "1": { "text": "UP", "color": "green" }, "0": { "text": "DOWN", "color": "red" } } }
          ],
          "thresholds": { "mode": "absolute", "steps": [] }
        }
      },
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "up{job=\"libnovel-runner\"}",
          "legendFormat": "runner",
          "instant": true
        }
      ]
    },
    {
      "id": 10,
      "type": "timeseries",
      "title": "Task Throughput (per minute)",
      "description": "Completed and failed series are 5m rates scaled by 60 (tasks per minute, left axis). The running series is the raw runner_tasks_running value, drawn on the right axis.",
      "gridPos": { "x": 0, "y": 4, "w": 12, "h": 8 },
      "options": {
        "tooltip": { "mode": "multi" },
        "legend": { "displayMode": "list", "placement": "bottom" }
      },
      "fieldConfig": {
        "defaults": {
          "unit": "opm",
          "custom": { "lineWidth": 2, "fillOpacity": 10 }
        },
        "overrides": [
          { "matcher": { "id": "byName", "options": "failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
          { "matcher": { "id": "byName", "options": "completed" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
          {
            "matcher": { "id": "byName", "options": "running" },
            "properties": [
              { "id": "unit", "value": "short" },
              { "id": "custom.axisPlacement", "value": "right" }
            ]
          }
        ]
      },
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "rate(runner_tasks_completed_total[5m]) * 60",
          "legendFormat": "completed"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "rate(runner_tasks_failed_total[5m]) * 60",
          "legendFormat": "failed"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "runner_tasks_running",
          "legendFormat": "running"
        }
      ]
    },
    {
      "id": 11,
      "type": "timeseries",
      "title": "Audio Task Span Latency (p50 / p95 / p99)",
      "gridPos": { "x": 12, "y": 4, "w": 12, "h": 8 },
      "description": "End-to-end latency of runner.audio_task spans from Tempo span metrics.",
      "options": {
        "tooltip": { "mode": "multi" },
        "legend": { "displayMode": "list", "placement": "bottom" }
      },
      "fieldConfig": {
        "defaults": {
          "unit": "s",
          "custom": { "lineWidth": 2, "fillOpacity": 10 }
        }
      },
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "histogram_quantile(0.50, sum(rate(traces_spanmetrics_latency_bucket{service=\"runner\", span_name=\"runner.audio_task\"}[5m])) by (le))",
          "legendFormat": "p50"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket{service=\"runner\", span_name=\"runner.audio_task\"}[5m])) by (le))",
          "legendFormat": "p95"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service=\"runner\", span_name=\"runner.audio_task\"}[5m])) by (le))",
          "legendFormat": "p99"
        }
      ]
    },
    {
      "id": 20,
      "type": "timeseries",
      "title": "Scrape Task Span Latency (p50 / p95 / p99)",
      "gridPos": { "x": 0, "y": 12, "w": 12, "h": 8 },
      "description": "End-to-end latency of runner.scrape_task spans from Tempo span metrics.",
      "options": {
        "tooltip": { "mode": "multi" },
        "legend": { "displayMode": "list", "placement": "bottom" }
      },
      "fieldConfig": {
        "defaults": {
          "unit": "s",
          "custom": { "lineWidth": 2, "fillOpacity": 10 }
        }
      },
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "histogram_quantile(0.50, sum(rate(traces_spanmetrics_latency_bucket{service=\"runner\", span_name=\"runner.scrape_task\"}[5m])) by (le))",
          "legendFormat": "p50"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket{service=\"runner\", span_name=\"runner.scrape_task\"}[5m])) by (le))",
          "legendFormat": "p95"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service=\"runner\", span_name=\"runner.scrape_task\"}[5m])) by (le))",
          "legendFormat": "p99"
        }
      ]
    },
    {
      "id": 21,
      "type": "timeseries",
      "title": "Audio vs Scrape Task Rate",
      "gridPos": { "x": 12, "y": 12, "w": 12, "h": 8 },
      "description": "Relative throughput of audio generation vs book scraping.",
      "options": {
        "tooltip": { "mode": "multi" },
        "legend": { "displayMode": "list", "placement": "bottom" }
      },
      "fieldConfig": {
        "defaults": {
          "unit": "ops",
          "custom": { "lineWidth": 2, "fillOpacity": 10 }
        }
      },
      "targets": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "sum(rate(traces_spanmetrics_calls_total{service=\"runner\", span_name=\"runner.audio_task\"}[5m]))",
          "legendFormat": "audio tasks/s"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "expr": "sum(rate(traces_spanmetrics_calls_total{service=\"runner\", span_name=\"runner.scrape_task\"}[5m]))",
          "legendFormat": "scrape tasks/s"
        }
      ]
    },
    {
      "id": 30,
      "type": "logs",
      "title": "Runner Logs (errors & warnings)",
      "description": "Runner log lines whose detected_level is warn, error, fatal or critical. If this Loki instance does not populate detected_level, replace the filter with the level label or field that it does expose.",
      "gridPos": { "x": 0, "y": 20, "w": 24, "h": 10 },
      "options": {
        "showTime": true,
        "showLabels": false,
        "showCommonLabels": false,
        "wrapLogMessage": true,
        "prettifyLogMessage": true,
        "enableLogDetails": true,
        "sortOrder": "Descending",
        "dedupStrategy": "none"
      },
      "targets": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "expr": "{service_name=\"runner\"} | detected_level=~\"warn|error|fatal|critical\"",
          "legendFormat": ""
        }
      ]
    },
    {
      "id": 31,
      "type": "logs",
      "title": "Runner Logs (all)",
      "gridPos": { "x": 0, "y": 30, "w": 24, "h": 10 },
      "options": {
        "showTime": true,
        "showLabels": false,
        "showCommonLabels": false,
        "wrapLogMessage": true,
        "prettifyLogMessage": true,
        "enableLogDetails": true,
        "sortOrder": "Descending",
        "dedupStrategy": "none"
      },
      "targets": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "expr": "{service_name=\"runner\"}",
          "legendFormat": ""
        }
      ]
    }
  ]
}