apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
- name: Loki
type: loki
access: proxy
url: http://loki:3100
- name: Elasticsearch
type: elasticsearch
access: proxy
url: http://elasticsearch:9200
database: "logstash-*"
jsonData:
esVersion: "8.0.0"
timeField: "@timestamp"
apiVersion: 1
providers:
- name: default
orgId: 1
folder: ""
type: file
disableDeletion: false
updateIntervalSeconds: 30
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: true
dashboard:
id: null
title: My Dashboard
tags:
- monitoring
timezone: utc
refresh: 30s
time:
from: now-6h
to: now
panels:
- title: CPU Usage
type: timeseries
gridPos:
h: 8
w: 12
x: 0
y: 0
targets:
- expr: rate(node_cpu_seconds_total{mode!="idle"}[5m])
legendFormat: "{{instance}} {{mode}}"
timeseries: time-series line/area charts
stat: single big value with optional sparkline
gauge: gauge needle for single value
table: tabular data display
bargauge: horizontal/vertical bar gauges
heatmap: density heatmaps
piechart: pie or donut charts
logs: log viewer with label highlighting
traces: distributed trace visualization
canvas: freeform visual elements
geomap: geographic map panel
text: static markdown content
rate(http_requests_total[5m])
sum by (status) (rate(http_requests_total[5m]))
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
topk(10, sum by (path) (rate(http_requests_total[5m])))
avg_over_time(up[1h])
count by (job) (up == 0)
increase(http_requests_total[1h])
label_replace(metric, "dst", "$1", "src", "(.*)")
templating:
list:
- name: datasource
type: datasource
query: prometheus
current:
text: Prometheus
value: Prometheus
- name: job
type: query
datasource: $datasource
query: label_values(up, job)
sort: 1
- name: instance
type: query
datasource: $datasource
query: label_values(up{job="$job"}, instance)
refresh: 2
- name: interval
type: interval
query: 1m,5m,10m,30m,1h
auto: true
auto_count: 30
auto_min: 10s
annotations:
list:
- name: Deployments
datasource: Prometheus
enable: true
expr: time() bool on() (changes(deploy_count_total[1m]) > 0)
titleFormat: Deploy
tags:
- deploy
- name: Outages
type: tags
tags:
- outage
apiVersion: 1
groups:
- orgId: 1
name: system-alerts
interval: 1m
rules:
- uid: abc123
title: High CPU Usage
condition: C
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
- refId: B
datasourceUid: __expr__
model:
type: reduce
expression: A
reducer: last
- refId: C
datasourceUid: __expr__
model:
type: threshold
expression: B
conditions:
- evaluator:
params:
- 90
type: gt
noDataState: OK
execErrState: Alerting
for: 5m
annotations:
summary: "CPU usage above 90% on {{ $labels.instance }}"
notifiers:
- name: Slack
type: slack
uid: slack-1
settings:
url: https://hooks.slack.com/services/XXX/YYY/ZZZ
recipient: "#alerts"
- name: Email
type: email
uid: email-1
settings:
addresses: oncall@example.com
- name: PagerDuty
type: pagerduty
uid: pd-1
settings:
integrationKey: YOUR_KEY
severity: critical
links:
- title: Runbook
type: link
url: https://wiki.example.com/runbooks/${__panel.id}
targetBlank: true
- title: Trace
type: link
url: /d/traces?var_trace_id=${__data.fields.traceID}
asDropdown: true
transformations:
- id: merge
options: {}
- id: organize
options:
excludeByName:
time: true
renameByName:
value: Requests
- id: calculateField
options:
mode: reduceRow
reduce:
reducer: sum
alias: Total
- id: filterByValue
options:
type: include
filters:
- fieldName: Total
config:
id: greater
value:
- 100
http_error_rate: |
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))
p95_latency: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
)
memory_usage: |
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
/
node_memory_MemTotal_bytes * 100
saturation: |
sum(rate(container_cpu_usage_seconds_total[5m]))
/
sum(kube_pod_container_resource_limits{resource="cpu"})
global_shortcuts:
"/": open dashboard search
"Ctrl+K": open command palette
"Ctrl+S": save dashboard
"d+r": refresh all panels
"d+z": toggle kiosk/zen mode
"d+l": toggle panel legend
"t+t": open time picker
"Escape": exit fullscreen/kiosk
"Ctrl+H": hide panel menu
panel_edit:
"e": toggle panel edit mode
"v": toggle panel inspect
"ps": open panel search
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
- name: Loki
type: loki
access: proxy
url: http://loki:3100
- name: Elasticsearch
type: elasticsearch
access: proxy
url: http://elasticsearch:9200
database: "logstash-*"
jsonData:
esVersion: "8.0.0"
timeField: "@timestamp"
apiVersion: 1
providers:
- name: default
orgId: 1
folder: ""
type: file
disableDeletion: false
updateIntervalSeconds: 30
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: true
dashboard:
id: null
title: My Dashboard
tags:
- monitoring
timezone: utc
refresh: 30s
time:
from: now-6h
to: now
panels:
- title: CPU Usage
type: timeseries
gridPos:
h: 8
w: 12
x: 0
y: 0
targets:
- expr: rate(node_cpu_seconds_total{mode!="idle"}[5m])
legendFormat: "{{instance}} {{mode}}"
timeseries: 时序折线/面积图
stat: 单个大数值,可选迷你图
gauge: 仪表盘指针显示单值
table: 表格数据展示
bargauge: 水平/垂直柱状仪表
heatmap: 密度热力图
piechart: 饼图或环形图
logs: 日志查看器,支持标签高亮
traces: 分布式链路追踪可视化
canvas: 自由画布视觉元素
geomap: 地理地图面板
text: 静态 Markdown 内容
rate(http_requests_total[5m])
sum by (status) (rate(http_requests_total[5m]))
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
topk(10, sum by (path) (rate(http_requests_total[5m])))
avg_over_time(up[1h])
count by (job) (up == 0)
increase(http_requests_total[1h])
label_replace(metric, "dst", "$1", "src", "(.*)")
templating:
list:
- name: datasource
type: datasource
query: prometheus
current:
text: Prometheus
value: Prometheus
- name: job
type: query
datasource: $datasource
query: label_values(up, job)
sort: 1
- name: instance
type: query
datasource: $datasource
query: label_values(up{job="$job"}, instance)
refresh: 2
- name: interval
type: interval
query: 1m,5m,10m,30m,1h
auto: true
auto_count: 30
auto_min: 10s
annotations:
list:
- name: Deployments
datasource: Prometheus
enable: true
expr: time() bool on() (changes(deploy_count_total[1m]) > 0)
titleFormat: Deploy
tags:
- deploy
- name: Outages
type: tags
tags:
- outage
apiVersion: 1
groups:
- orgId: 1
name: system-alerts
interval: 1m
rules:
- uid: abc123
title: High CPU Usage
condition: C
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
- refId: B
datasourceUid: __expr__
model:
type: reduce
expression: A
reducer: last
- refId: C
datasourceUid: __expr__
model:
type: threshold
expression: B
conditions:
- evaluator:
params:
- 90
type: gt
noDataState: OK
execErrState: Alerting
for: 5m
annotations:
summary: "CPU usage above 90% on {{ $labels.instance }}"
notifiers:
- name: Slack
type: slack
uid: slack-1
settings:
url: https://hooks.slack.com/services/XXX/YYY/ZZZ
recipient: "#alerts"
- name: Email
type: email
uid: email-1
settings:
addresses: oncall@example.com
- name: PagerDuty
type: pagerduty
uid: pd-1
settings:
integrationKey: YOUR_KEY
severity: critical
links:
- title: Runbook
type: link
url: https://wiki.example.com/runbooks/${__panel.id}
targetBlank: true
- title: Trace
type: link
url: /d/traces?var_trace_id=${__data.fields.traceID}
asDropdown: true
transformations:
- id: merge
options: {}
- id: organize
options:
excludeByName:
time: true
renameByName:
value: Requests
- id: calculateField
options:
mode: reduceRow
reduce:
reducer: sum
alias: Total
- id: filterByValue
options:
type: include
filters:
- fieldName: Total
config:
id: greater
value:
- 100
http_error_rate: |
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))
p95_latency: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
)
memory_usage: |
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
/
node_memory_MemTotal_bytes * 100
saturation: |
sum(rate(container_cpu_usage_seconds_total[5m]))
/
sum(kube_pod_container_resource_limits{resource="cpu"})
global_shortcuts:
"/": 打开仪表板搜索
"Ctrl+K": 打开命令面板
"Ctrl+S": 保存仪表板
"d+r": 刷新所有面板
"d+z": 切换禅模式
"d+l": 切换面板图例
"t+t": 打开时间选择器
"Escape": 退出全屏/禅模式
"Ctrl+H": 隐藏面板菜单
panel_edit:
"e": 切换面板编辑模式
"v": 切换面板检查模式
"ps": 打开面板搜索