{
  "annotations": {
    "list": [
      {
        "$$hashKey": "object:192",
        "builtIn": 1,
        "datasource": {
          "type": "datasource",
          "uid": "grafana"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.19+) cluster",
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
  "id": 52,
  "links": [],
  "panels": [
    {
      "datasource": {
        "uid": "$datasource"
      },
      "description": "avg temp per gpu",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": 0
              },
              {
                "color": "#EAB839",
                "value": 83
              },
              {
                "color": "red",
                "value": 87
              }
            ]
          },
          "unit": "celsius"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 0
      },
      "id": 14,
      "options": {
        "minVizHeight": 75,
        "minVizWidth": 75,
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "mean"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true,
        "sizing": "auto"
      },
      "pluginVersion": "12.1.0",
      "targets": [
        {
          "datasource": {
            "uid": "$datasource"
          },
          "editorMode": "code",
          "expr": "avg by (instance, gpu) (\r\n DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\", pod=~\"$pod\"}\r\n)\r\n",
          "interval": "",
          "legendFormat": "GPU: {{gpu}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "GPU Avg. Temp",
      "type": "gauge"
    },
    {
      "datasource": {
        "uid": "$datasource"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "fixedColor": "#E0B400",
            "mode": "thresholds"
          },
          "mappings": [],
          "max": 2400,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": 0
              },
              {
                "color": "#EAB839",
                "value": 1800.0001
              },
              {
                "color": "red",
                "value": 2200
              }
            ]
          },
          "unit": "watt"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 0
      },
      "id": 16,
      "options": {
        "minVizHeight": 75,
        "minVizWidth": 75,
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "sum"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true,
        "sizing": "auto",
        "text": {}
      },
      "pluginVersion": "12.1.0",
      "targets": [
        {
          "datasource": {
            "uid": "$datasource"
          },
          "editorMode": "code",
          "expr": "sum by (instance, gpu) (\r\n DCGM_FI_DEV_POWER_USAGE{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n",
          "instant": true,
          "interval": "",
          "legendFormat": "GPU: {{gpu}}",
          "range": false,
          "refId": "A"
        }
      ],
      "title": "GPU Power Total",
      "type": "gauge"
    },
    {
      "datasource": {
        "uid": "$datasource"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": 0
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "celsius"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 8
      },
      "id": 12,
      "options": {
        "dataLinks": [],
        "legend": {
          "calcs": [
            "mean",
            "lastNotNull",
            "max"
          ],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.1.0",
      "targets": [
        {
          "datasource": {
            "uid": "$datasource"
          },
          "editorMode": "code",
          "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\", pod=~\"$pod\"}\r\n)\r\n",
          "instant": false,
          "interval": "",
          "legendFormat": "GPU {{gpu}}",
          "refId": "A"
        }
      ],
      "title": "GPU Temperature",
      "type": "timeseries"
    },
    {
      "datasource": {
        "uid": "$datasource"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": 0
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 8
      },
      "id": 6,
      "options": {
        "dataLinks": [],
        "legend": {
          "calcs": [
            "mean",
            "lastNotNull",
            "max"
          ],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true,
          "sortBy": "Max",
          "sortDesc": false
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.1.0",
      "targets": [
        {
          "datasource": {
            "uid": "$datasource"
          },
          "editorMode": "code",
          "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_GPU_UTIL{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n",
          "interval": "",
          "legendFormat": "GPU {{gpu}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "GPU Utilization",
      "type": "timeseries"
    },
    {
      "datasource": {
        "uid": "$datasource"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": 0
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "watt"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 16
      },
      "id": 10,
      "options": {
        "dataLinks": [],
        "legend": {
          "calcs": [
            "mean",
            "lastNotNull",
            "max"
          ],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.1.0",
      "targets": [
        {
          "datasource": {
            "uid": "$datasource"
          },
          "editorMode": "code",
          "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_POWER_USAGE{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n",
          "interval": "",
          "legendFormat": "GPU {{gpu}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "GPU Power Usage",
      "type": "timeseries"
    },
    {
      "datasource": {
        "uid": "$datasource"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": 0
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "mbytes"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 16
      },
      "id": 18,
      "options": {
        "dataLinks": [],
        "legend": {
          "calcs": [
            "mean",
            "lastNotNull",
            "max"
          ],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.1.0",
      "targets": [
        {
          "datasource": {
            "uid": "$datasource"
          },
          "editorMode": "code",
          "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_FB_USED{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n",
          "interval": "",
          "legendFormat": "GPU {{gpu}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "GPU Framebuffer Mem Used",
      "type": "timeseries"
    },
    {
      "datasource": {
        "uid": "$datasource"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": 0
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "hertz"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 24
      },
      "id": 2,
      "options": {
        "dataLinks": [],
        "legend": {
          "calcs": [
            "mean",
            "lastNotNull",
            "max"
          ],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true,
          "sortBy": "Name",
          "sortDesc": true
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.1.0",
      "targets": [
        {
          "datasource": {
            "uid": "$datasource"
          },
          "editorMode": "code",
          "expr": "max by (instance, gpu) (\r\n DCGM_FI_DEV_SM_CLOCK{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n) * 1000000\r\n",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 1,
          "legendFormat": "GPU {{gpu}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "GPU SM Clocks",
      "type": "timeseries"
    },
    {
      "datasource": {
        "uid": "$datasource"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "max": 1,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": 0
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "percentunit"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 24
      },
      "id": 4,
      "options": {
        "dataLinks": [],
        "legend": {
          "calcs": [
            "mean",
            "lastNotNull",
            "max"
          ],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.1.0",
      "targets": [
        {
          "datasource": {
            "uid": "$datasource"
          },
          "editorMode": "code",
          "expr": "max by (instance, gpu) (\r\n DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{\r\n instance=~\"$instance\",\r\n gpu=~\"$gpu\",\r\n pod=~\"$pod\"\r\n }\r\n)\r\n",
          "interval": "",
          "legendFormat": "GPU {{gpu}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Tensor Core Utilization",
      "type": "timeseries"
    }
  ],
  "preload": false,
  "refresh": false,
  "schemaVersion": 41,
  "tags": [],
  "templating": {
    "list": [
      {
        "current": {
          "text": "Mimir",
          "value": "prom"
        },
        "includeAll": false,
        "name": "datasource",
        "options": [],
        "query": "prometheus",
        "refresh": 1,
        "regex": "",
        "type": "datasource"
      },
      {
        "current": {
          "text": [
            "All"
          ],
          "value": [
            "$__all"
          ]
        },
        "datasource": {
          "type": "prometheus",
          "uid": "$datasource"
        },
        "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
        "includeAll": true,
        "multi": true,
        "name": "instance",
        "options": [],
        "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
        "refresh": 1,
        "regex": "",
        "sort": 1,
        "type": "query"
      },
      {
        "current": {
          "text": [
            "All"
          ],
          "value": [
            "$__all"
          ]
        },
        "datasource": {
          "type": "prometheus",
          "uid": "$datasource"
        },
        "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)",
        "includeAll": true,
        "multi": true,
        "name": "gpu",
        "options": [],
        "query": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)",
        "refresh": 1,
        "regex": "",
        "sort": 1,
        "type": "query"
      },
      {
        "allValue": ".*",
        "current": {
          "text": "All",
          "value": "$__all"
        },
        "datasource": {
          "type": "prometheus",
          "uid": "$datasource"
        },
        "definition": "label_values(DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\"},pod)",
        "includeAll": true,
        "label": "pod",
        "name": "pod",
        "options": [],
        "query": {
          "qryType": 1,
          "query": "label_values(DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\"},pod)",
          "refId": "PrometheusVariableQueryEditor-VariableQuery"
        },
        "refresh": 1,
        "regex": "",
        "type": "query"
      }
    ]
  },
  "time": {
    "from": "now-15m",
    "to": "now"
  },
  "timepicker": {},
  "timezone": "",
  "title": "NVIDIA DCGM Exporter Dashboard",
  "uid": "nvidia_dcgm_exporter",
  "version": 34
}