diff --git a/gorush.json b/gorush.json new file mode 100644 index 0000000..3c7d6aa --- /dev/null +++ b/gorush.json @@ -0,0 +1,1558 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 50, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 5, + "panels": [], + "title": "Global", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "общий процент ошибок в задачах", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "dark-red", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + 
"targets": [ + { + "editorMode": "code", + "expr": "100 *\r\nsum(rate(gorush_failure_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n/\r\nclamp_min(\r\n sum(rate(gorush_success_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])) + sum(rate(gorush_failure_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])),\r\n 1\r\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Error rate (%)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "глобальный success rate", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "100 *\r\nsum(rate(gorush_success_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n/\r\nclamp_min(\r\n sum(rate(gorush_success_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])) + sum(rate(gorush_failure_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])),\r\n 1\r\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Success rate (%)", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "сколько пушей/сек", + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 1, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(gorush_total_push_count{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Push throughput", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "рабочие воркеры в данный момент", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "gorush_busy_workers{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}", + "legendFormat": "__auto", + "range": true, + 
"refId": "A" + } + ], + "title": "Busy workers", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "rate success tasks" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(gorush_submitted_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])\r\n", + "legendFormat": "rate submitted tasks", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + 
"editorMode": "code", + "expr": "rate(gorush_success_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])", + "hide": false, + "instant": false, + "legendFormat": "rate success tasks", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "editorMode": "code", + "expr": "rate(gorush_failure_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])", + "hide": false, + "instant": false, + "legendFormat": "rate failure tasks", + "range": true, + "refId": "C" + } + ], + "title": "Rates", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "за выбранное окно времени мы догоняем или отстаём", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + 
"editorMode": "code", + "expr": "sum(increase(gorush_submitted_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n-\r\nsum(increase(gorush_success_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n-\r\nsum(increase(gorush_failure_tasks{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Backlog “submitted - (success+failure)”", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "description": "поcледние ошибки", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 20, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "direction": "backward", + "editorMode": "code", + "expr": "{service_name=\"gorush\", cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}\r\n|~ \"(?i)(${search}|error|err=|fatal|panic|exception)\"\r\n", + "queryType": "range", + "refId": "A" + } + ], + "title": "Last errors", + "type": "logs" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 6, + "panels": [], + "title": "Android", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + 
"fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 0, + "y": 25 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(gorush_android_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "legendFormat": "android success rate", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "editorMode": "code", + "expr": "sum(rate(gorush_android_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "hide": false, + "instant": false, + "legendFormat": "android fail rate", + "range": true, + "refId": "B" + } + ], + "title": "Android throughput: success vs fail / sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "no android rate in interval", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 9, + "y": 25 + 
}, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "100 *\r\nsum(rate(gorush_android_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n/\r\nclamp_min(\r\n sum(rate(gorush_android_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n+ sum(rate(gorush_android_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])),\r\n 1\r\n)\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Android success rate (%)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "dark-red", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "no fails in android", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 11, + 
"x": 13, + "y": 25 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (pod) (rate(gorush_android_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Android fails by pod", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 12, + "panels": [], + "title": "IOS", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 0, + "y": 35 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", 
+ "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum(rate(gorush_ios_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "ios success rate", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum(rate(gorush_ios_error{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "ios fail rate", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "IOS throughput: success vs fail / sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "no ios rate in interval", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 9, + "y": 35 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "100 *\r\nsum(rate(gorush_ios_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n/\r\nclamp_min(\r\n sum(rate(gorush_ios_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n+ 
sum(rate(gorush_ios_error{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval])),\r\n 1\r\n)\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "IOS success rate (%)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "dark-red", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "no fails in ios", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 11, + "x": 13, + "y": 35 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (pod) (rate(gorush_ios_error{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval]))\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "IOS fails by pod", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 16, + "panels": [], + "title": 
"Huawei", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 0, + "y": 45 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(rate(gorush_huawei_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "huawei success rate", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum(rate(gorush_huawei_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + 
"legendFormat": "huawei fail rate", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Huawei throughput: success vs fail / sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "no Huawei rate in interval", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 9, + "y": 45 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "100 *\r\nsum(rate(gorush_huawei_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n/\r\nclamp_min(\r\n sum(rate(gorush_huawei_success{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n+ sum(rate(gorush_huawei_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms])),\r\n 1\r\n)\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Huawei success rate (%)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "dark-red", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "no fails in huawei", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 11, + "x": 13, + "y": 45 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (pod) (rate(gorush_huawei_fail{cluster=~\"$cluster\", instance=~\"$node\", pod=~\"$pod\"}[$__rate_interval_ms]))\r\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Huawei fails by pod", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "local", + "value": "local" + }, + "definition": "label_values(kube_node_info,cluster)", + "label": "Cluster", + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": { + "text": "rancher2", + "value": "rancher2" + }, + "definition": "label_values(gorush_submitted_tasks{cluster=\"$cluster\"},instance)", + "label": "Node", + "name": "node", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gorush_submitted_tasks{cluster=\"$cluster\"},instance)", + "refId": 
"PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": "", + "current": { + "text": "All", + "value": "$__all" + }, + "definition": "label_values(gorush_total_push_count{instance=\"$node\"},pod)", + "description": "", + "includeAll": true, + "label": "Pod", + "name": "pod", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gorush_total_push_count{instance=\"$node\"},pod)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "failed", + "value": "failed" + }, + "description": "search in logs", + "label": "search", + "name": "search", + "options": [ + { + "selected": true, + "text": "failed", + "value": "failed" + } + ], + "query": "failed", + "type": "textbox" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "gorush Dashboard", + "uid": "107d4c76-9dc0-4393-87ea-68c2d73ca507", + "version": 44 +} \ No newline at end of file diff --git a/nvidia_dcgm.json b/nvidia_dcgm.json new file mode 100644 index 0000000..2e64828 --- /dev/null +++ b/nvidia_dcgm.json @@ -0,0 +1,888 @@ +{ + "annotations": { + "list": [ + { + "$$hashKey": "object:192", + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.19+) cluster", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 52, + "links": [], + "panels": [ + { + "datasource": { + "uid": "$datasource" + }, + "description": "avg temp per gpu", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 83 + }, + { + "color": "red", + "value": 87 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 14, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "avg by (instance, gpu) (\r\n DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\", pod=~\"$pod\"}\r\n)\r\n", + "interval": "", + "legendFormat": "GPU: {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Avg. Temp", + "type": "gauge" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "#E0B400", + "mode": "thresholds" + }, + "mappings": [], + "max": 2400, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 1800.0001 + }, + { + "color": "red", + "value": 2200 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 16, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "sum" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum by (instance, gpu) (\r\n DCGM_FI_DEV_POWER_USAGE{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n", + "instant": true, + 
"interval": "", + "legendFormat": "GPU: {{gpu}}", + "range": false, + "refId": "A" + } + ], + "title": "GPU Power Total", + "type": "gauge" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 12, + "options": { + "dataLinks": [], + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\", pod=~\"$pod\"}\r\n)\r\n", + "instant": false, + "interval": "", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "title": "GPU Temperature", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 6, + "options": { + "dataLinks": [], + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_GPU_UTIL{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Utilization", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + 
"gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 10, + "options": { + "dataLinks": [], + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_POWER_USAGE{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Power Usage", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, 
+ "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 18, + "options": { + "dataLinks": [], + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_FB_USED{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Framebuffer Mem Used", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": 
"hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 2, + "options": { + "dataLinks": [], + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Name", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_SM_CLOCK{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n) * 1000000\r\n", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "GPU {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU SM Clocks", + "type": "timeseries" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 4, + "options": { + "dataLinks": [], + "legend": { + 
"calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max by (instance, gpu) (\r\n DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "Tensor Core Utilization", + "type": "timeseries" + } + ], + "preload": false, + "refresh": false, + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "Mimir", + "value": "prom" + }, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "$datasource", + "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "includeAll": true, + "multi": true, + "name": "instance", + "options": [], + "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "$datasource", + "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", + "includeAll": true, + "multi": true, + "name": "gpu", + "options": [], + "query": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "All", + "value": "$__all" + }, + "definition": "label_values(DCGM_FI_DEV_GPU_TEMP{instance=\"$instance\"},pod)", + "includeAll": true, + "label": "pod", + "name": "pod", + "options": [], + "query": { + "qryType": 1, + "query": 
"label_values(DCGM_FI_DEV_GPU_TEMP{instance=\"$instance\"},pod)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "NVIDIA DCGM Exporter Dashboard", + "uid": "nvidia_dcgm_exporter", + "version": 34 +} \ No newline at end of file