From 50dbd12f1255a3a785c8c85528383d7cc426a147 Mon Sep 17 00:00:00 2001 From: alex Date: Mon, 2 Feb 2026 16:16:37 +0300 Subject: [PATCH] change nginx (add correct cluster + hosts) --- nginx-metrics.json | 3575 +++++++++++++++++++++++--------------------- 1 file changed, 1885 insertions(+), 1690 deletions(-) diff --git a/nginx-metrics.json b/nginx-metrics.json index 3485cae..1b74ee0 100644 --- a/nginx-metrics.json +++ b/nginx-metrics.json @@ -12,27 +12,10 @@ "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "enable": true, - "expr": "sum(changes(nginx_ingress_controller_config_last_reload_successful_timestamp_seconds{instance!=\"unknown\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[30s])) by (controller_class)", - "hide": false, - "iconColor": "rgba(255, 96, 96, 1)", - "limit": 100, - "name": "Config Reloads", - "showIn": 0, - "step": "30s", - "tagKeys": "controller_class", - "tags": [], - "titleFormat": "Config Reloaded", - "type": "tags" } ] }, + "description": "Nvidia GPU Metrics based on the prometheus metrics from github.com/utkuozdemir/nvidia_gpu_exporter", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, @@ -44,1117 +27,13 @@ "type": "prometheus", "uid": "prom" }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "rgb(31, 120, 193)", - "mode": "fixed" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 0, - "y": 0 - }, - "id": 20, - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "round(sum(irate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m])), 0.001)", - "format": "time_series", - "intervalFactor": 1, - "range": true, - "refId": "A", - "step": 4 - } - ], - "title": "Controller Request Volume", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "rgb(31, 120, 193)", - "mode": "fixed" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 6, - "y": 0 - }, - "id": 82, - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",state=\"active\"}[2m]))", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A", - "step": 4 - } - ], - "title": "Controller Connections", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "rgb(31, 120, 193)", - "mode": "fixed" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 0 - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 95 - }, - { - "color": "rgba(50, 172, 45, 0.97)", - "value": 99 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 12, - "y": 0 - }, - "id": 21, - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum(rate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",status!~\"[4-5].*\"}[2m])) / sum(rate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m]))", - "format": "time_series", - "intervalFactor": 1, - "range": true, - "refId": "A", - "step": 4 - } - ], - "title": "Controller Success Rate (non-4|5xx responses)", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "rgb(31, 120, 193)", - "mode": "fixed" - }, - "decimals": 0, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 3, - "x": 18, - "y": 0 - }, - "id": 81, - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "sum" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "avg(irate(nginx_ingress_controller_success{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[1m])) * 60", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A", - "step": 4 - } - ], - "title": "Config Reloads", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "rgb(31, 120, 193)", - "mode": "fixed" - }, - "decimals": 0, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 3, - "x": 21, - "y": 0 - }, - "id": 83, - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "count(nginx_ingress_controller_config_last_reload_successful{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_namespace=~\"$namespace\"} == 0)", - "format": "time_series", - "instant": true, - "intervalFactor": 1, - "refId": "A", - "step": 4 - } - ], - "title": "Last Config Failed", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "reqps" - }, - "overrides": [ - { - "matcher": { - "id": "byValue", - "options": { - "op": "gte", - "reducer": "allIsZero", - "value": 0 - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": true, - "tooltip": true, - "viz": false - } - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 3 - }, - "id": 86, - "options": { - "legend": { - "calcs": [ - "mean" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "round(sum(irate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",exported_namespace=~\"$exported_namespace\",ingress=~\"$ingress\"}[2m])) by (ingress), 0.001)", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "metric": "network", - "refId": "A", - "step": 10 - } - ], - "title": "Ingress Request Volume", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "max - istio-proxy" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890f02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "max - master" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#bf1b00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "max - prometheus" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#bf1b00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byValue", - "options": { - "op": "gte", - "reducer": "allIsNull", - "value": 0 - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": true, - "tooltip": true, - "viz": false - } - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 3 - }, - "id": 87, - "options": { - "legend": { - "calcs": [ - "mean" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "asc" - } - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum(rate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",exported_namespace=~\"$exported_namespace\",ingress=~\"$ingress\",status!~\"[4-5].*\"}[2m])) by (ingress) / sum(rate(nginx_ingress_controller_requests{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",exported_namespace=~\"$exported_namespace\",ingress=~\"$ingress\"}[2m])) by (ingress)", - "format": "time_series", - "instant": false, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "metric": "container_memory_usage:sort_desc", - "refId": "A", - "step": 10 - } - ], - "title": "Ingress Success Rate (non-4|5xx responses)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 10 - }, - "id": 32, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": false, - "width": 200 - }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum (irate (nginx_ingress_controller_request_size_sum{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", - "format": "time_series", - "instant": false, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Received", - "metric": "network", - "refId": "A", - "step": 10 - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "- sum (irate (nginx_ingress_controller_response_size_sum{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", - "format": "time_series", - "hide": false, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Sent", - "metric": "network", - "range": true, - "refId": "B", - "step": 10 - } - ], - "title": "Network I/O pressure", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "max - istio-proxy" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890f02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "max - master" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#bf1b00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "max - prometheus" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#bf1b00", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 10 - }, - "id": 77, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": false, - "width": 200 - }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "desc" - } - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "avg(nginx_ingress_controller_nginx_process_resident_memory_bytes{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}) ", - "format": "time_series", - "instant": false, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "nginx", - "metric": "container_memory_usage:sort_desc", - "refId": "A", - "step": 10 - } - ], - "title": "Average Memory Usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "cores", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "line+area" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "transparent", - "value": 0 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 10 - }, - "id": 79, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": false - }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "desc" - } - }, - "pluginVersion": "12.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "avg (rate (nginx_ingress_controller_nginx_process_cpu_seconds_total{cluster=~\"$cluster\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m])) ", - "format": "time_series", - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "nginx", - "metric": "container_cpu", - "range": true, - "refId": "A", - "step": 10 - } - ], - "title": "Average CPU Usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "description": "This data is real time, independent of dashboard time range", + "description": "The official product name of the GPU. This is an alphanumeric string. For all products.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, + "decimals": 2, "mappings": [], "thresholds": { "mode": "absolute", @@ -1162,199 +41,37 @@ { "color": "green", "value": 0 - }, - { - "color": "red", - "value": 80 } ] - } + }, + "unit": "none" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "ingress" - }, - "properties": [ - { - "id": "displayName", - "value": "Ingress" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value #A" - }, - "properties": [ - { - "id": "displayName", - "value": "P50 Latency" - }, - { - "id": "unit", - "value": "dtdurations" - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value #B" - }, - "properties": [ - { - "id": "displayName", - "value": "P90 Latency" - }, - { - "id": "unit", - "value": "dtdurations" - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value #C" - }, - "properties": [ - { - "id": "displayName", - "value": "P99 Latency" - }, - { - "id": "unit", - "value": "dtdurations" - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value #D" - }, - "properties": [ - { - "id": "displayName", - "value": "IN" - }, - { - "id": "unit", - "value": "Bps" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 0 - }, - { - "color": "rgba(237, 129, 40, 0.89)" - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Time" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align" - }, - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value #E" - }, - "properties": [ - { - "id": "displayName", - "value": "OUT" - }, - { - "id": "unit", - "value": "Bps" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align" - } - ] - } - ] + "overrides": [] }, "gridPos": { - "h": 8, - "w": 24, + "h": 3, + "w": 4, "x": 0, - "y": 16 + "y": 0 }, - "id": 75, + "id": 23, "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" ], - "show": false + "fields": "", + "values": false }, - "showHeader": true + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true }, "pluginVersion": "12.1.0", "targets": [ @@ -1364,87 +81,400 @@ "uid": "prom" }, "editorMode": "code", - "expr": "histogram_quantile(0.50, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le, ingress))", - "format": "table", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "histogram_quantile(0.90, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le, ingress))", - "format": "table", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le, ingress))", - "format": "table", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "{{ destination_service }}", - "refId": "C" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum(irate(nginx_ingress_controller_request_size_sum{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (ingress)", - "format": "table", - "hide": false, + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", "instant": true, "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "refId": "D" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "expr": "sum(irate(nginx_ingress_controller_response_size_sum{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (ingress)", - "format": "table", - "instant": true, - "intervalFactor": 1, - "legendFormat": "{{ ingress }}", - "refId": "E" + "legendFormat": "{{name}}", + "refId": "A" } ], - "title": "Ingress Percentile Response Times and Transfer Rates", - "transformations": [ - { - "id": "merge", - "options": { - "reducers": [] - } - } - ], - "type": "table" + "title": "Name", + "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prom" }, + "description": "The current performance state for the GPU. States range from P0 (maximum performance) to P12 (minimum performance).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "": { + "text": "" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "prefix:P" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 4, + "y": 0 + }, + "id": 22, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_pstate{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "P-State", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 6, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "GPU Utilization %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts / The software power limit in watts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 21, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"} / nvidia_smi_power_default_limit_watts{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Power Draw %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Fan Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Core GPU temperature. in degrees C.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 0 + }, + "id": 16, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Temperature", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", "fieldConfig": { "defaults": { "color": { @@ -1459,7 +489,7 @@ "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1473,47 +503,53 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { - "mode": "off" + "mode": "line+area" } }, "mappings": [], + "max": 1, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "transparent", "value": 0 }, + { + "color": "orange", + "value": 0.7 + }, { "color": "red", - "value": 80 + "value": 0.9 } ] }, - "unit": "s" + "unit": "percentunit" }, "overrides": [] }, "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 24 + "h": 5, + "w": 6, + "x": 18, + "y": 0 }, - "id": 91, + "id": 11, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", - "showLegend": true + "showLegend": false }, "tooltip": { "hideZeros": false, @@ -1528,51 +564,14 @@ "type": "prometheus", "uid": "prom" }, - "editorMode": "code", "exemplar": true, - "expr": "histogram_quantile(0.80, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le))", - "format": "time_series", - "hide": false, - "instant": false, + "expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", "interval": "", - "intervalFactor": 1, - "legendFormat": "P80", - "refId": "C" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "exemplar": true, - "expr": "histogram_quantile(0.90, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "P90", - "refId": "D" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "editorMode": "code", - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "P99", - "refId": "E" + "legendFormat": "{{uuid}}", + "refId": "A" } ], - "title": "Ingress Percentile Response Times (Ingress Namespaces)", + "title": "Memory Utilization %", "type": "timeseries" }, { @@ -1580,65 +579,685 @@ "type": "prometheus", "uid": "prom" }, - "description": "", + "description": "The version of the installed NVIDIA display driver. This is an alphanumeric string.", "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 0, + "y": 3 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{driver_version}}", + "refId": "A" + } + ], + "title": "Driver Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "The BIOS of the GPU board.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 3, + "y": 3 + }, + "id": 34, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{vbios_version}}", + "refId": "A" + } + ], + "title": "Vbios Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Information about factors that are reducing the frequency of clocks. If all throttle reasons are returned as \"Not Active\" it means that clocks are running as high as possible.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "0": { + "text": "Not Active" + }, + "1": { + "text": "Active" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 32, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_gpu_idle{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_gpu_idle{uuid=\"$gpu\"}", + "instant": false, + "interval": "", + "legendFormat": "Idle", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_hw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "HW Thermal Slowdown", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sw_power_cap{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_power_cap{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "SW Power Cap", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_applications_clocks_setting{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_applications_clocks_setting{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "App Clocks Setting", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "HW Power Brake", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "SW Thermal Slowdown", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sync_boost{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sync_boost{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "Sync Boost", + "refId": "G" + } + ], + "title": "Throttle Reasons", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of graphics (shader) clock\n/\nMaximum frequency of graphics (shader) clock.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 5 + }, + "id": 20, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_graphics_clock_hz{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "GPU Clock Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of memory clock / Maximum frequency of memory clock", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 5 + }, + "id": 33, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_memory_clock_hz{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Clock Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Total memory allocated by active contexts / Total installed GPU memory.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 5 + }, + "id": 25, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"} / nvidia_smi_memory_total_bytes{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Allocation %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 5 + }, + "id": 7, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Utilization %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, "scaleDistribution": { "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" } - } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": 0 + }, + { + "color": "orange", + "value": 0.7 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" }, "overrides": [] }, "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 24 + "h": 5, + "w": 6, + "x": 18, + "y": 5 }, - "id": 89, + "id": 10, "options": { - "calculate": false, - "calculation": {}, - "cellGap": 2, - "cellValues": {}, - "color": { - "exponent": 0.5, - "fill": "#b4ff00", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "Warm", - "steps": 128 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, "legend": { - "show": true + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, - "rowsFrame": { - "layout": "auto" - }, - "showValue": "never", "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "reverse": false, - "unit": "s" + "hideZeros": false, + "mode": "multi", + "sort": "none" } }, "pluginVersion": "12.1.0", @@ -1648,39 +1267,62 @@ "type": "prometheus", "uid": "prom" }, - "editorMode": "code", "exemplar": true, - "expr": "sum(increase(nginx_ingress_controller_request_duration_seconds_bucket{cluster=~\"$cluster\",ingress!=\"\",controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\",exported_namespace=~\"$exported_namespace\"}[2m])) by (le)", - "format": "heatmap", + "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", "interval": "", - "legendFormat": "{{le}}", - "range": true, + "legendFormat": "", "refId": "A" } ], - "title": "Ingress Request Latency Heatmap (Ingress Namespaces)", - "type": "heatmap" + "title": "GPU Utilization %", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prom" }, + "description": "Total memory allocated by active contexts.", "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" }, "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "inspect": false + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, - "decimals": 2, - "displayName": "", "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1694,84 +1336,29 @@ } ] }, - "unit": "short" + "unit": "decbytes" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Last *" - }, - "properties": [ - { - "id": "displayName", - "value": "TTL" - }, - { - "id": "unit", - "value": "s" - }, - { - "id": "custom.cellOptions", - "value": { - "type": "color-background" - } - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 0 - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 0 - }, - { - "color": "rgba(50, 172, 45, 0.97)", - "value": 691200 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Field" - }, - "properties": [ - { - "id": "displayName", - "value": "Host" - } - ] - } - ] + "overrides": [] }, "gridPos": { - "h": 8, - "w": 24, + "h": 5, + "w": 6, "x": 0, - "y": 31 + "y": 10 }, - "id": 85, + "id": 17, "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "enablePagination": false, - "fields": "", - "reducer": [ - "sum" - ], - "show": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, - "showHeader": true + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } }, "pluginVersion": "12.1.0", "targets": [ @@ -1780,43 +1367,741 @@ "type": "prometheus", "uid": "prom" }, - "editorMode": "code", - "expr": "avg(nginx_ingress_controller_ssl_expire_time_seconds{cluster=~\"$cluster\",kubernetes_pod_name=~\"$controller\",namespace=~\"$namespace\",ingress=~\"$ingress\"}) by (host) - time()", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ host }}", - "metric": "gke_letsencrypt_cert_expiration", - "range": true, - "refId": "A", - "step": 1 + "exemplar": true, + "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" } ], - "title": "Ingress Certificate Expiry", - "transformations": [ - { - "id": "reduce", - "options": { - "includeTimeField": false, - "labelsToFields": false, - "reducers": [ - "lastNotNull" + "title": "Memory Allocation", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Core GPU temperature. in degrees C.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": 0 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 80 + } ] - } + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" } ], - "type": "table" + "title": "Temperature", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Power Draw", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": 0 + }, + { + "color": "orange", + "value": 0.7 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Fan Speed %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of graphics (shader) clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 15 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Graphics Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of video encoder/decoder clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 15 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_video_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Video Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of SM (Streaming Multiprocessor) clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 15 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_sm_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "SM Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "description": "Current frequency of memory clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 15 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Memory Clock Speed", + "type": "timeseries" } ], "preload": false, - "refresh": "5s", + "refresh": "10s", "schemaVersion": 41, "tags": [], "templating": { "list": [ { "current": { - "text": "yandex", - "value": "yandex" + "text": "yandex-prod", + "value": "yandex-prod" }, "definition": "label_values(kube_node_info,cluster)", "label": "Cluster", @@ -1832,122 +2117,45 @@ "type": "query" }, { - "allValue": ".*", "current": { - "text": "All", - "value": "$__all" + "text": "cl1h3aok1ph8647rkppb-aryg", + "value": "cl1h3aok1ph8647rkppb-aryg" }, "datasource": { "type": "prometheus", "uid": "prom" }, - "definition": "label_values(nginx_ingress_controller_config_hash{cluster=~\"$cluster\"},controller_namespace)", - "includeAll": true, - "label": "Namespace", - "name": "namespace", + "definition": "label_values(nvidia_smi_index{cluster=~\"$cluster\"},instance)", + "includeAll": false, + "label": "Host", + "name": "node", "options": [], "query": { "qryType": 1, - "query": "label_values(nginx_ingress_controller_config_hash{cluster=~\"$cluster\"},controller_namespace)", + "query": "label_values(nvidia_smi_index{cluster=~\"$cluster\"},instance)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "type": "query" }, { - "allValue": ".*", "current": { - "text": "All", - "value": "$__all" + "text": "1200859e-158e-e077-b06b-0c526fe106ca", + "value": "1200859e-158e-e077-b06b-0c526fe106ca" }, "datasource": { "type": "prometheus", "uid": "prom" }, - "definition": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\", cluster=\"$cluster\"},controller_class)", - "includeAll": true, - "label": "Controller Class", - "name": "controller_class", + "definition": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)", + "includeAll": false, + "label": "GPU", + "name": "gpu", "options": [], "query": { - "qryType": 1, - "query": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\", cluster=\"$cluster\"},controller_class)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, - "regex": "", - "type": "query" - }, - { - "allValue": ".*", - "current": { - "text": "All", - "value": "$__all" - }, - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "definition": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\", controller_class=~\"$controller_class\", cluster=~\"$cluster\"},controller_pod)", - "includeAll": true, - "label": "Controller", - "name": "controller", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\", controller_class=~\"$controller_class\", cluster=~\"$cluster\"},controller_pod)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, - "regex": "", - "sort": 1, - "type": "query" - }, - { - "allValue": ".*", - "current": { - "text": "All", - "value": "$__all" - }, - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "definition": "label_values(nginx_ingress_controller_requests{namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\", cluster=~\"$cluster\"},exported_namespace)", - "includeAll": true, - "label": "Ingress Namespace", - "name": "exported_namespace", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(nginx_ingress_controller_requests{namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\", cluster=~\"$cluster\"},exported_namespace)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, - "regex": "", - "sort": 1, - "type": "query" - }, - { - "allValue": ".*", - "current": { - "text": "All", - "value": "$__all" - }, - "datasource": { - "type": "prometheus", - "uid": "prom" - }, - "definition": "label_values(nginx_ingress_controller_requests{namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\", exported_namespace=~\"$exported_namespace\", cluster=~\"$cluster\"},ingress)", - "includeAll": true, - "label": "Ingress", - "name": "ingress", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(nginx_ingress_controller_requests{namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\", exported_namespace=~\"$exported_namespace\", cluster=~\"$cluster\"},ingress)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" + "query": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)", + "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", @@ -1957,25 +2165,12 @@ ] }, "time": { - "from": "now-1h", + "from": "now-30m", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "2m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ] - }, - "timezone": "browser", - "title": "NGINX Ingress controller metrics", - "uid": "ec8046db-ingress-metrics", - "version": 6 + "timepicker": {}, + "timezone": "", + "title": "Nvidia GPU Metrics", + "uid": "2e8d5337-fdc5-46e9-96db-fb37d6271fb2", + "version": 4 } \ No newline at end of file