diff --git a/gorush.json b/gorush.json new file mode 100644 index 0000000..3c7d6aa --- /dev/null +++ b/gorush.json @@ -0,0 +1,1558 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 50, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 5, + "panels": [], + "title": "Global", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "общий процент ошибок в задачах", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "dark-red", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "100 *\r\nsum(rate(gorush_failure_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n/\r\nclamp_min(\r\n sum(rate(gorush_success_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])) + sum(rate(gorush_failure_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])),\r\n 1\r\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Error rate (%)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "глобальный success rate", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "100 *\r\nsum(rate(gorush_success_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n/\r\nclamp_min(\r\n sum(rate(gorush_success_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])) + sum(rate(gorush_failure_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])),\r\n 1\r\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Success rate (%)", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "сколько пушей/сек", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 1, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(gorush_total_push_count{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Push throughput", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "рабочие воркеры в данный момент", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "gorush_busy_workers{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Busy workers", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "rate success tasks" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(gorush_submitted_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])\r\n", + "legendFormat": "rate submitted tasks", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "editorMode": "code", + "expr": "rate(gorush_success_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])", + "hide": false, + "instant": false, + "legendFormat": "rate success tasks", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "editorMode": "code", + "expr": "rate(gorush_failure_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])", + "hide": false, + "instant": false, + "legendFormat": "rate failure tasks", + "range": true, + "refId": "C" + } + ], + "title": "Rates", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "за выбранное окно времени мы догоняем или отстаём", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(increase(gorush_submitted_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n-\r\nsum(increase(gorush_success_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n-\r\nsum(increase(gorush_failure_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Backlog “submitted - (success+failure)”", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "description": "поcледние ошибки", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 20, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "direction": "backward", + "editorMode": "code", + "expr": "{service_name=\"gorush\", cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}\r\n|~ \"(?i)(${search}|error|err=|fatal|panic|exception)\"\r\n", + "queryType": "range", + "refId": "A" + } + ], + "title": "Last errors", + "type": "logs" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 6, + "panels": [], + "title": "Android", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 0, + "y": 25 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(gorush_android_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "legendFormat": "android success rate", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "editorMode": "code", + "expr": "sum(rate(gorush_android_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "hide": false, + "instant": false, + "legendFormat": "android fail rate", + "range": true, + "refId": "B" + } + ], + "title": "Android throughput: success vs fail / sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "no android rate in interval", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 9, + "y": 25 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "100 *\r\nsum(rate(gorush_android_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n/\r\nclamp_min(\r\n sum(rate(gorush_android_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n+ sum(rate(gorush_android_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])),\r\n 1\r\n)\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Android success rate (%)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "dark-red", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "no fails in android", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 11, + "x": 13, + "y": 25 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (pod) (rate(gorush_android_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Android fails by pod", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 12, + "panels": [], + "title": "IOS", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 0, + "y": 35 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum(rate(gorush_ios_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "ios success rate", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum(rate(gorush_ios_error{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "ios fail rate", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "IOS throughput: success vs fail / sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "no ios rate in interval", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 9, + "y": 35 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "100 *\r\nsum(rate(gorush_ios_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n/\r\nclamp_min(\r\n sum(rate(gorush_ios_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n+ sum(rate(gorush_ios_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])),\r\n 1\r\n)\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "IOS success rate (%)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "dark-red", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "no fails in ios", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 11, + "x": 13, + "y": 35 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (pod) (rate(gorush_ios_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "IOS fails by pod", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 16, + "panels": [], + "title": "Huawei", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 0, + "y": 45 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(rate(gorush_huawei_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "huawei success rate", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum(rate(gorush_huawei_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "huawei fail rate", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Huawei throughput: success vs fail / sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "no Huawei rate in interval", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 9, + "y": 45 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "100 *\r\nsum(rate(gorush_huawei_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n/\r\nclamp_min(\r\n sum(rate(gorush_huawei_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n+ sum(rate(gorush_huawei_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])),\r\n 1\r\n)\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Huawei success rate (%)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "dark-red", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "no fails in huawei", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 11, + "x": 13, + "y": 45 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (pod) (rate(gorush_huawei_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Huawei fails by pod", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "local", + "value": "local" + }, + "definition": "label_values(kube_node_info,cluster)", + "label": "Cluster", + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": { + "text": "rancher2", + "value": "rancher2" + }, + "definition": "label_values(gorush_submitted_tasks{cluster=\"$cluster\"},instance)", + "label": "Node", + "name": "node", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gorush_submitted_tasks{cluster=\"$cluster\"},instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": "", + "current": { + "text": "All", + "value": "$__all" + }, + "definition": "label_values(gorush_total_push_count{instance=\"$node\"},pod)", + "description": "", + "includeAll": true, + "label": "Pod", + "name": "pod", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gorush_total_push_count{instance=\"$node\"},pod)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "failed", + "value": "failed" + }, + "description": "search in logs", + "label": "search", + "name": "search", + "options": [ + { + "selected": true, + "text": "failed", + "value": "failed" + } + ], + "query": "failed", + "type": "textbox" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "gorush Dashboard", + "uid": "107d4c76-9dc0-4393-87ea-68c2d73ca507", + "version": 44 +} \ No newline at end of file diff --git a/nginx-metrics.json b/nginx-metrics.json index 3485cae..1b74ee0 100644 --- a/nginx-metrics.json +++ b/nginx-metrics.json @@ -12,27 +12,10 @@ "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "enable": true, - "expr": "sum(changes(nginx_ingress_controller_config_last_reload_successful_timestamp_seconds{instance!=\"unknown\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[30s])) by (controller_class)", - "hide": false, - "iconColor": "rgba(255, 96, 96, 1)", - "limit": 100, - "name": "Config Reloads", - "showIn": 0, - "step": "30s", - "tagKeys": "controller_class", - "tags": [], - "titleFormat": "Config Reloaded", - "type": "tags" } ] }, + "description": "Nvidia GPU Metrics based on the prometheus metrics from github.com/utkuozdemir/nvidia_gpu_exporter", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, @@ -44,1117 +27,13 @@ "type": "prometheus", "uid": "prom" }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "rgb(31, 120, 193)", - "mode": "fixed" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 0, - "y": 0 - }, - "id": 20, - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "round(sum(irate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m])), 0.001)", - "format": "time_series", - "intervalFactor": 1, - "range": true, - "refId": "A", - "step": 4 - } - ], - "title": "Controller Request Volume", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "rgb(31, 120, 193)", - "mode": "fixed" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 6, - "y": 0 - }, - "id": 82, - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",state=\"active\"}[2m]))", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A", - "step": 4 - } - ], - "title": "Controller Connections", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "rgb(31, 120, 193)", - "mode": "fixed" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 0 - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 95 - }, - { - "color": "rgba(50, 172, 45, 0.97)", - "value": 99 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 12, - "y": 0 - }, - "id": 21, - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum(rate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",status!~\"[4-5].*\"}[2m])) / sum(rate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m]))", - "format": "time_series", - "intervalFactor": 1, - "range": true, - "refId": "A", - "step": 4 - } - ], - "title": "Controller Success Rate (non-4|5xx responses)", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "rgb(31, 120, 193)", - "mode": "fixed" - }, - "decimals": 0, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 3, - "x": 18, - "y": 0 - }, - "id": 81, - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "sum" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "avg(irate(nginx_ingress_controller_success{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[1m])) * 60", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A", - "step": 4 - } - ], - "title": "Config Reloads", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "rgb(31, 120, 193)", - "mode": "fixed" - }, - "decimals": 0, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 3, - "x": 21, - "y": 0 - }, - "id": 83, - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "count(nginx_ingress_controller_config_last_reload_successful{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_namespace=~\"$namespace\"} == 0)", - "format": "time_series", - "instant": true, - "intervalFactor": 1, - "refId": "A", - "step": 4 - } - ], - "title": "Last Config Failed", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "reqps" - }, - "overrides": [ - { - "matcher": { - "id": "byValue", - "options": { - "op": "gte", - "reducer": "allIsZero", - "value": 0 - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": true, - "tooltip": true, - "viz": false - } - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 3 - }, - "id": 86, - "options": { - "legend": { - "calcs": [ - "mean" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "round(sum(irate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",exported_namespace=~\"$exported_namespace\",ingress=~\"$ingress\"}[2m])) by (ingress), 0.001)", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "metric": "network", - "refId": "A", - "step": 10 - } - ], - "title": "Ingress Request Volume", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "max - istio-proxy" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890f02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "max - master" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#bf1b00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "max - prometheus" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#bf1b00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byValue", - "options": { - "op": "gte", - "reducer": "allIsNull", - "value": 0 - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": true, - "tooltip": true, - "viz": false - } - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 3 - }, - "id": 87, - "options": { - "legend": { - "calcs": [ - "mean" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "asc" - } - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum(rate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",exported_namespace=~\"$exported_namespace\",ingress=~\"$ingress\",status!~\"[4-5].*\"}[2m])) by (ingress) / sum(rate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",exported_namespace=~\"$exported_namespace\",ingress=~\"$ingress\"}[2m])) by (ingress)", - "format": "time_series", - "instant": false, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "metric": "container_memory_usage:sort_desc", - "refId": "A", - "step": 10 - } - ], - "title": "Ingress Success Rate (non-4|5xx responses)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 10 - }, - "id": 32, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": false, - "width": 200 - }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum (irate (nginx_ingress_controller_request_size_sum{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", - "format": "time_series", - "instant": false, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Received", - "metric": "network", - "refId": "A", - "step": 10 - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "- sum (irate (nginx_ingress_controller_response_size_sum{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", - "format": "time_series", - "hide": false, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Sent", - "metric": "network", - "range": true, - "refId": "B", - "step": 10 - } - ], - "title": "Network I/O pressure", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "max - istio-proxy" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890f02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "max - master" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#bf1b00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "max - prometheus" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#bf1b00", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 10 - }, - "id": 77, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": false, - "width": 200 - }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "desc" - } - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "avg(nginx_ingress_controller_nginx_process_resident_memory_bytes{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}) ", - "format": "time_series", - "instant": false, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "nginx", - "metric": "container_memory_usage:sort_desc", - "refId": "A", - "step": 10 - } - ], - "title": "Average Memory Usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "cores", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "line+area" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "transparent", - "value": 0 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 10 - }, - "id": 79, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": false - }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "desc" - } - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "avg (rate (nginx_ingress_controller_nginx_process_cpu_seconds_total{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m])) ", - "format": "time_series", - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "nginx", - "metric": "container_cpu", - "range": true, - "refId": "A", - "step": 10 - } - ], - "title": "Average CPU Usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "description": "This data is real time, independent of dashboard time range", + "description": "The official product name of the GPU. This is an alphanumeric string. For all products.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, + "decimals": 2, "mappings": [], "thresholds": { "mode": "absolute", @@ -1162,199 +41,37 @@ { "color": "green", "value": 0 - }, - { - "color": "red", - "value": 80 } ] - } + }, + "unit": "none" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "ingress" - }, - "properties": [ - { - "id": "displayName", - "value": "Ingress" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value #A" - }, - "properties": [ - { - "id": "displayName", - "value": "P50 Latency" - }, - { - "id": "unit", - "value": "dtdurations" - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value #B" - }, - "properties": [ - { - "id": "displayName", - "value": "P90 Latency" - }, - { - "id": "unit", - "value": "dtdurations" - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value #C" - }, - "properties": [ - { - "id": "displayName", - "value": "P99 Latency" - }, - { - "id": "unit", - "value": "dtdurations" - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value #D" - }, - "properties": [ - { - "id": "displayName", - "value": "IN" - }, - { - "id": "unit", - "value": "Bps" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 0 - }, - { - "color": "rgba(237, 129, 40, 0.89)" - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Time" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align" - }, - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value #E" - }, - "properties": [ - { - "id": "displayName", - "value": "OUT" - }, - { - "id": "unit", - "value": "Bps" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align" - } - ] - } - ] + "overrides": [] }, "gridPos": { - "h": 8, - "w": 24, + "h": 3, + "w": 4, "x": 0, - "y": 16 + "y": 0 }, - "id": 75, + "id": 23, "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" ], - "show": false + "fields": "", + "values": false }, - "showHeader": true + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true }, "pluginVersion": "12.1.0", "targets": [ @@ -1364,87 +81,400 @@ "uid": "prom" }, "editorMode": "code", - "expr": "histogram_quantile(0.50, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le, ingress))", - "format": "table", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "histogram_quantile(0.90, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le, ingress))", - "format": "table", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le, ingress))", - "format": "table", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "{{ destination_service }}", - "refId": "C" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum(irate(nginx_ingress_controller_request_size_sum{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (ingress)", - "format": "table", - "hide": false, + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", "instant": true, "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "refId": "D" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum(irate(nginx_ingress_controller_response_size_sum{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (ingress)", - "format": "table", - "instant": true, - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "refId": "E" + "legendFormat": "{{name}}", + "refId": "A" } ], - "title": "Ingress Percentile Response Times and Transfer Rates", - "transformations": [ - { - "id": "merge", - "options": { - "reducers": [] - } - } - ], - "type": "table" + "title": "Name", + "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prom" }, + "description": "The current performance state for the GPU. States range from P0 (maximum performance) to P12 (minimum performance).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "": { + "text": "" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "prefix:P" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 4, + "y": 0 + }, + "id": 22, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_pstate{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "P-State", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 6, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "GPU Utilization %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts / The software power limit in watts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 21, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"} / nvidia_smi_power_default_limit_watts{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Power Draw %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Fan Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Core GPU temperature. in degrees C.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 0 + }, + "id": 16, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Temperature", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", "fieldConfig": { "defaults": { "color": { @@ -1459,7 +489,7 @@ "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1473,47 +503,53 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { - "mode": "off" + "mode": "line+area" } }, "mappings": [], + "max": 1, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "transparent", "value": 0 }, + { + "color": "orange", + "value": 0.7 + }, { "color": "red", - "value": 80 + "value": 0.9 } ] }, - "unit": "s" + "unit": "percentunit" }, "overrides": [] }, "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 24 + "h": 5, + "w": 6, + "x": 18, + "y": 0 }, - "id": 91, + "id": 11, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", - "showLegend": true + "showLegend": false }, "tooltip": { "hideZeros": false, @@ -1528,51 +564,14 @@ "type": "prometheus", "uid": "prom" }, - "editorMode": "code", "exemplar": true, - "expr": "histogram_quantile(0.80, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le))", - "format": "time_series", - "hide": false, - "instant": false, + "expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", "interval": "", - "intervalFactor": 1, - "legendFormat": "P80", - "refId": "C" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "exemplar": true, - "expr": "histogram_quantile(0.90, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "P90", - "refId": "D" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "P99", - "refId": "E" + "legendFormat": "{{uuid}}", + "refId": "A" } ], - "title": "Ingress Percentile Response Times (Ingress Namespaces)", + "title": "Memory Utilization %", "type": "timeseries" }, { @@ -1580,65 +579,685 @@ "type": "prometheus", "uid": "prom" }, - "description": "", + "description": "The version of the installed NVIDIA display driver. This is an alphanumeric string.", "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 0, + "y": 3 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{driver_version}}", + "refId": "A" + } + ], + "title": "Driver Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "The BIOS of the GPU board.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 3, + "y": 3 + }, + "id": 34, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{vbios_version}}", + "refId": "A" + } + ], + "title": "Vbios Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Information about factors that are reducing the frequency of clocks. If all throttle reasons are returned as \"Not Active\" it means that clocks are running as high as possible.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "0": { + "text": "Not Active" + }, + "1": { + "text": "Active" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 32, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_gpu_idle{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_gpu_idle{uuid=\"$gpu\"}", + "instant": false, + "interval": "", + "legendFormat": "Idle", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_hw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "HW Thermal Slowdown", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sw_power_cap{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_power_cap{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "SW Power Cap", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_applications_clocks_setting{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_applications_clocks_setting{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "App Clocks Setting", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "HW Power Brake", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "SW Thermal Slowdown", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sync_boost{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sync_boost{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "Sync Boost", + "refId": "G" + } + ], + "title": "Throttle Reasons", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of graphics (shader) clock\n/\nMaximum frequency of graphics (shader) clock.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 5 + }, + "id": 20, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_graphics_clock_hz{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "GPU Clock Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of memory clock / Maximum frequency of memory clock", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 5 + }, + "id": 33, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_memory_clock_hz{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Clock Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Total memory allocated by active contexts / Total installed GPU memory.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 5 + }, + "id": 25, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"} / nvidia_smi_memory_total_bytes{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Allocation %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 5 + }, + "id": 7, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Utilization %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, "scaleDistribution": { "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" } - } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": 0 + }, + { + "color": "orange", + "value": 0.7 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" }, "overrides": [] }, "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 24 + "h": 5, + "w": 6, + "x": 18, + "y": 5 }, - "id": 89, + "id": 10, "options": { - "calculate": false, - "calculation": {}, - "cellGap": 2, - "cellValues": {}, - "color": { - "exponent": 0.5, - "fill": "#b4ff00", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "Warm", - "steps": 128 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, "legend": { - "show": true + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, - "rowsFrame": { - "layout": "auto" - }, - "showValue": "never", "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "reverse": false, - "unit": "s" + "hideZeros": false, + "mode": "multi", + "sort": "none" } }, "pluginVersion": "12.1.0", @@ -1648,39 +1267,62 @@ "type": "prometheus", "uid": "prom" }, - "editorMode": "code", "exemplar": true, - "expr": "sum(increase(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le)", - "format": "heatmap", + "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", "interval": "", - "legendFormat": "{{le}}", - "range": true, + "legendFormat": "", "refId": "A" } ], - "title": "Ingress Request Latency Heatmap (Ingress Namespaces)", - "type": "heatmap" + "title": "GPU Utilization %", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prom" }, + "description": "Total memory allocated by active contexts.", "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" }, "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "inspect": false + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, - "decimals": 2, - "displayName": "", "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1694,84 +1336,29 @@ } ] }, - "unit": "short" + "unit": "decbytes" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Last *" - }, - "properties": [ - { - "id": "displayName", - "value": "TTL" - }, - { - "id": "unit", - "value": "s" - }, - { - "id": "custom.cellOptions", - "value": { - "type": "color-background" - } - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 0 - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 0 - }, - { - "color": "rgba(50, 172, 45, 0.97)", - "value": 691200 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Field" - }, - "properties": [ - { - "id": "displayName", - "value": "Host" - } - ] - } - ] + "overrides": [] }, "gridPos": { - "h": 8, - "w": 24, + "h": 5, + "w": 6, "x": 0, - "y": 31 + "y": 10 }, - "id": 85, + "id": 17, "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "enablePagination": false, - "fields": "", - "reducer": [ - "sum" - ], - "show": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, - "showHeader": true + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } }, "pluginVersion": "12.1.0", "targets": [ @@ -1780,43 +1367,741 @@ "type": "prometheus", "uid": "prom" }, - "editorMode": "code", - "expr": "avg(nginx_ingress_controller_ssl_expire_time_seconds{cluster=~\"$cluster\",kubernetes_pod_name=~\"$controller\",namespace=~\"$namespace\",ingress=~\"$ingress\"}) by (host) - time()", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ host }}", - "metric": "gke_letsencrypt_cert_expiration", - "range": true, - "refId": "A", - "step": 1 + "exemplar": true, + "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" } ], - "title": "Ingress Certificate Expiry", - "transformations": [ - { - "id": "reduce", - "options": { - "includeTimeField": false, - "labelsToFields": false, - "reducers": [ - "lastNotNull" + "title": "Memory Allocation", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Core GPU temperature. in degrees C.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": 0 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 80 + } ] - } + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" } ], - "type": "table" + "title": "Temperature", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Power Draw", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": 0 + }, + { + "color": "orange", + "value": 0.7 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Fan Speed %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of graphics (shader) clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 15 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Graphics Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of video encoder/decoder clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 15 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_video_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Video Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of SM (Streaming Multiprocessor) clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 15 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_sm_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "SM Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of memory clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 15 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Memory Clock Speed", + "type": "timeseries" } ], "preload": false, - "refresh": "5s", + "refresh": "10s", "schemaVersion": 41, "tags": [], "templating": { "list": [ { "current": { - "text": "yandex", - "value": "yandex" + "text": "yandex-prod", + "value": "yandex-prod" }, "definition": "label_values(kube_node_info,cluster)", "label": "Cluster", @@ -1832,122 +2117,45 @@ "type": "query" }, { - "allValue": ".*", "current": { - "text": "All", - "value": "$__all" + "text": "cl1h3aok1ph8647rkppb-aryg", + "value": "cl1h3aok1ph8647rkppb-aryg" }, "datasource": { "type": "prometheus", "uid": "prom" }, - "definition": "label_values(nginx_ingress_controller_config_hash{cluster=~\"$cluster\"},controller_namespace)", - "includeAll": true, - "label": "Namespace", - "name": "namespace", + "definition": "label_values(nvidia_smi_index{cluster=~\"$cluster\"},instance)", + "includeAll": false, + "label": "Host", + "name": "node", "options": [], "query": { "qryType": 1, - "query": "label_values(nginx_ingress_controller_config_hash{cluster=~\"$cluster\"},controller_namespace)", + "query": "label_values(nvidia_smi_index{cluster=~\"$cluster\"},instance)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "type": "query" }, { - "allValue": ".*", "current": { - "text": "All", - "value": "$__all" + "text": "1200859e-158e-e077-b06b-0c526fe106ca", + "value": "1200859e-158e-e077-b06b-0c526fe106ca" }, "datasource": { "type": "prometheus", "uid": "prom" }, - "definition": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\", cluster=\"$cluster\"},controller_class)", - "includeAll": true, - "label": "Controller Class", - "name": "controller_class", + "definition": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)", + "includeAll": false, + "label": "GPU", + "name": "gpu", "options": [], "query": { - "qryType": 1, - "query": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\", cluster=\"$cluster\"},controller_class)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, - "regex": "", - "type": "query" - }, - { - "allValue": ".*", - "current": { - "text": "All", - "value": "$__all" - }, - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "definition": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\", controller_class=~\"$controller_class\", cluster=~\"$cluster\"},controller_pod)", - "includeAll": true, - "label": "Controller", - "name": "controller", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\", controller_class=~\"$controller_class\", cluster=~\"$cluster\"},controller_pod)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, - "regex": "", - "sort": 1, - "type": "query" - }, - { - "allValue": ".*", - "current": { - "text": "All", - "value": "$__all" - }, - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "definition": "label_values(nginx_ingress_controller_requests{namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\", cluster=~\"$cluster\"},exported_namespace)", - "includeAll": true, - "label": "Ingress Namespace", - "name": "exported_namespace", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(nginx_ingress_controller_requests{namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\", cluster=~\"$cluster\"},exported_namespace)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, - "regex": "", - "sort": 1, - "type": "query" - }, - { - "allValue": ".*", - "current": { - "text": "All", - "value": "$__all" - }, - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "definition": "label_values(nginx_ingress_controller_requests{namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\", exported_namespace=~\"$exported_namespace\", cluster=~\"$cluster\"},ingress)", - "includeAll": true, - "label": "Ingress", - "name": "ingress", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(nginx_ingress_controller_requests{namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\", exported_namespace=~\"$exported_namespace\", cluster=~\"$cluster\"},ingress)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" + "query": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)", + "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", @@ -1957,25 +2165,12 @@ ] }, "time": { - "from": "now-1h", + "from": "now-30m", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "2m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ] - }, - "timezone": "browser", - "title": "NGINX Ingress controller metrics", - "uid": "ec8046db-ingress-metrics", - "version": 6 + "timepicker": {}, + "timezone": "", + "title": "Nvidia GPU Metrics", + "uid": "2e8d5337-fdc5-46e9-96db-fb37d6271fb2", + "version": 4 } \ No newline at end of file diff --git a/nvidia_dcgm.json b/nvidia_dcgm.json new file mode 100644 index 0000000..2e64828 --- /dev/null +++ b/nvidia_dcgm.json @@ -0,0 +1,888 @@ +{ + "annotations": { + "list": [ + { + "$$hashKey": "object:192", + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.19+) cluster", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 52, + "links": [], + "panels": [ + { + "datasource": { + "uid": "$datasource" + }, + "description": "avg temp per gpu", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 83 + }, + { + "color": "red", + "value": 87 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 14, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "avg by (instance, gpu) (\r\n DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\", pod=~\"$pod\"}\r\n)\r\n", + "interval": "", + "legendFormat": "GPU: {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Avg. Temp", + "type": "gauge" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "#E0B400", + "mode": "thresholds" + }, + "mappings": [], + "max": 2400, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 1800.0001 + }, + { + "color": "red", + "value": 2200 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 16, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "sum" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum by (instance, gpu) (\r\n DCGM_FI_DEV_POWER_USAGE{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n", + "instant": true, + "interval": "", + "legendFormat": "GPU: {{gpu}}", + "range": false, + "refId": "A" + } + ], + "title": "GPU Power Total", + "type": "gauge" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 12, + "options": { + "dataLinks": [], + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\", pod=~\"$pod\"}\r\n)\r\n", + "instant": false, + "interval": "", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "title": "GPU Temperature", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 6, + "options": { + "dataLinks": [], + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_GPU_UTIL{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Utilization", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 10, + "options": { + "dataLinks": [], + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_POWER_USAGE{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Power Usage", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 18, + "options": { + "dataLinks": [], + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_FB_USED{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Framebuffer Mem Used", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 2, + "options": { + "dataLinks": [], + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Name", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_SM_CLOCK{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n) * 1000000\r\n", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "GPU {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU SM Clocks", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 4, + "options": { + "dataLinks": [], + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "Tensor Core Utilization", + "type": "timeseries" + } + ], + "preload": false, + "refresh": false, + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "Mimir", + "value": "prom" + }, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "$datasource", + "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "includeAll": true, + "multi": true, + "name": "instance", + "options": [], + "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "$datasource", + "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", + "includeAll": true, + "multi": true, + "name": "gpu", + "options": [], + "query": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "All", + "value": "$__all" + }, + "definition": "label_values(DCGM_FI_DEV_GPU_TEMP{instance=\"$instance\"},pod)", + "includeAll": true, + "label": "pod", + "name": "pod", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(DCGM_FI_DEV_GPU_TEMP{instance=\"$instance\"},pod)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "NVIDIA DCGM Exporter Dashboard", + "uid": "nvidia_dcgm_exporter", + "version": 34 +} \ No newline at end of file diff --git a/tusd_metrics.json b/tusd_metrics.json index 1905cb8..e7fda41 100644 --- a/tusd_metrics.json +++ b/tusd_metrics.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 18, + "id": 47, "links": [], "panels": [ { @@ -43,7 +43,7 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "palette-classic-by-name" }, "custom": { "axisBorderShow": false, @@ -91,7 +91,8 @@ "value": 80 } ] - } + }, + "unit": "bytes" }, "overrides": [] }, @@ -140,6 +141,7 @@ "type": "prometheus", "uid": "prom" }, + "description": "старт tusd", "fieldConfig": { "defaults": { "color": { @@ -164,7 +166,7 @@ "overrides": [] }, "gridPos": { - "h": 6, + "h": 7, "w": 14, "x": 10, "y": 1 @@ -201,107 +203,6 @@ "title": "Process Start Time (hours)", "type": "gauge" }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "description": "Показывает производительность GC и его частоту. ", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 14, - "x": 10, - "y": 7 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "editorMode": "code", - "expr": "rate(go_gc_duration_seconds_sum{job=\"integrations/tusd\", instance=~\"$Instance\", pod=~\"$pod\"}[5m])", - "legendFormat": "{{instance}} pod : {{pod}} :GC Time Rate (s/sec)", - "range": true, - "refId": "A" - }, - { - "editorMode": "code", - "expr": "go_memstats_next_gc_bytes{job=\"integrations/tusd\", instance=~\"$Instance\", pod=~\"$pod\"}", - "legendFormat": "{{instance}} pod : {{pod}} : Next GC Threshold (bytes)", - "range": true, - "refId": "B" - } - ], - "title": "GC Duration & Next GC", - "type": "timeseries" - }, { "datasource": { "type": "prometheus", @@ -373,6 +274,197 @@ "title": "Open Connections", "type": "gauge" }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "доля времени, которое процесс тратит на GC (CPU-time на GC)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 10, + "y": 8 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(go_gc_duration_seconds_sum{job=\"integrations/tusd\", instance=~\"$Instance\", pod=~\"$pod\"}[5m]) * 100", + "legendFormat": "{{instance}} pod : {{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Объём heap-памяти, при достижении которого Go runtime запустит следующий garbage collection.\nНе отражает текущее потребление памяти.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "dark-orange", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 17, + "y": 8 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "go_memstats_next_gc_bytes{job=\"integrations/tusd\", instance=~\"$Instance\", pod=~\"$pod\"}", + "legendFormat": "{{instance}} pod : {{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "Go next GC heap target", + "type": "timeseries" + }, { "datasource": { "type": "prometheus", @@ -504,7 +596,7 @@ "h": 12, "w": 14, "x": 10, - "y": 14 + "y": 15 }, "id": 9, "options": { @@ -602,7 +694,7 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 10, "x": 0, "y": 18 @@ -699,7 +791,7 @@ "h": 6, "w": 24, "x": 0, - "y": 26 + "y": 27 }, "id": 10, "options": { @@ -755,6 +847,7 @@ "type": "prometheus", "uid": "prom" }, + "description": "использование CPU by TUSD", "fieldConfig": { "defaults": { "color": { @@ -814,7 +907,7 @@ "h": 7, "w": 8, "x": 0, - "y": 32 + "y": 33 }, "id": 4, "options": { @@ -908,7 +1001,7 @@ "h": 7, "w": 7, "x": 8, - "y": 32 + "y": 33 }, "id": 6, "options": { @@ -1001,7 +1094,8 @@ "value": 80 } ] - } + }, + "unit": "bytes" }, "overrides": [] }, @@ -1009,7 +1103,7 @@ "h": 7, "w": 9, "x": 15, - "y": 32 + "y": 33 }, "id": 8, "options": { @@ -1051,7 +1145,7 @@ "h": 1, "w": 24, "x": 0, - "y": 39 + "y": 40 }, "id": 20, "panels": [], @@ -1114,7 +1208,8 @@ "value": 80 } ] - } + }, + "unit": "ops" }, "overrides": [] }, @@ -1122,7 +1217,7 @@ "h": 8, "w": 12, "x": 0, - "y": 40 + "y": 41 }, "id": 22, "options": { @@ -1207,7 +1302,8 @@ "value": 80 } ] - } + }, + "unit": "ops" }, "overrides": [] }, @@ -1215,7 +1311,7 @@ "h": 8, "w": 12, "x": 12, - "y": 40 + "y": 41 }, "id": 21, "options": { @@ -1301,7 +1397,8 @@ "value": 80 } ] - } + }, + "unit": "ms" }, "overrides": [] }, @@ -1309,7 +1406,7 @@ "h": 5, "w": 24, "x": 0, - "y": 48 + "y": 49 }, "id": 11, "options": { @@ -1351,7 +1448,7 @@ "h": 1, "w": 24, "x": 0, - "y": 53 + "y": 54 }, "id": 23, "panels": [], @@ -1420,9 +1517,9 @@ }, "gridPos": { "h": 8, - "w": 18, + "w": 11, "x": 0, - "y": 54 + "y": 55 }, "id": 26, "options": { @@ -1464,6 +1561,78 @@ "title": "S3 Upload Semaphore Demand vs Limit", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "99-й перцентиль длительности операций записи на диск в S3.\nПоказывает худшие 1% disk write операций и используется для выявления деградаций и длинных хвостов задержек.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 11, + "y": 55 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": true, + "text": { + "titleSize": 11, + "valueSize": 50 + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "editorMode": "code", + "expr": "tusd_s3_disk_write_duration_ms{job=\"integrations/tusd\", instance=~\"$Instance\", pod=~\"$pod\", quantile=\"0.99\"}", + "hide": false, + "instant": false, + "legendFormat": "{{instance}} pod : {{pod}} : tusd Disk Write Time {{quantile}}", + "range": true, + "refId": "C" + } + ], + "title": "S3 disk write latency (p99)", + "type": "stat" + }, { "datasource": { "type": "prometheus", @@ -1487,7 +1656,8 @@ "value": 80 } ] - } + }, + "unit": "ms" }, "overrides": [] }, @@ -1495,7 +1665,7 @@ "h": 8, "w": 6, "x": 18, - "y": 54 + "y": 55 }, "id": 25, "options": { @@ -1590,7 +1760,7 @@ "h": 9, "w": 24, "x": 0, - "y": 62 + "y": 63 }, "id": 24, "options": { @@ -1619,112 +1789,13 @@ "title": "Request Duration (Avg, ms)", "type": "timeseries" }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "description": "Среднюю продолжительность записи на диск", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 15, - "w": 24, - "x": 0, - "y": 71 - }, - "id": 13, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": true, - "text": { - "titleSize": 15, - "valueSize": 25 - }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "tusd_s3_disk_write_duration_ms_sum{job=\"integrations/tusd\", instance=~\"$Instance\", pod=~\"$pod\"}", - "legendFormat": "{{instance}} pod : {{pod}} : Disk Write Duration sum (ms)", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "tusd_s3_disk_write_duration_ms_count{job=\"integrations/tusd\", instance=~\"$Instance\", pod=~\"$pod\"}", - "hide": false, - "instant": false, - "legendFormat": "{{instance}} pod : {{pod}} : Disk Write Duration count (ms)", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "tusd_s3_disk_write_duration_ms{job=\"integrations/tusd\", instance=~\"$Instance\", pod=~\"$pod\"}", - "hide": false, - "instant": false, - "legendFormat": "{{instance}} pod : {{pod}} : tusd Disk Write Time {{quantile}}", - "range": true, - "refId": "C" - } - ], - "title": "S3 Disk operations", - "type": "stat" - }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 86 + "y": 72 }, "id": 19, "panels": [], @@ -1795,7 +1866,7 @@ "h": 8, "w": 12, "x": 0, - "y": 87 + "y": 73 }, "id": 16, "options": { @@ -1888,7 +1959,7 @@ "h": 8, "w": 12, "x": 12, - "y": 87 + "y": 73 }, "id": 17, "options": { @@ -1981,7 +2052,7 @@ "h": 5, "w": 24, "x": 0, - "y": 95 + "y": 81 }, "id": 27, "options": { @@ -2041,7 +2112,9 @@ { "current": { "text": "All", - "value": "$__all" + "value": [ + "$__all" + ] }, "definition": "label_values({cluster=~\"$cluster\", job=\"integrations/tusd\"},instance)", "includeAll": true, @@ -2060,7 +2133,9 @@ { "current": { "text": "All", - "value": "$__all" + "value": [ + "$__all" + ] }, "definition": "label_values({cluster=~\"$cluster\", job=\"integrations/tusd\"},pod)", "includeAll": true, @@ -2085,6 +2160,6 @@ "timepicker": {}, "timezone": "browser", "title": "Tusd Metrics", - "uid": "tusd-metric-new-try", - "version": 3 + "uid": "eeeebd69-5bb9-4715-feeewwds-544", + "version": 17 } \ No newline at end of file