环境
一台老centos6 docker1.7.1
没法docker-compose
密码加密生成
vim encrypt.py
import getpass
import bcrypt
password = getpass.getpass("password: ")
hashed_password = bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt())
print(hashed_password.decode())
然后生成加密
[root@i ~]# python3 encrypt.py
password:
$2b$12$WE7uLSpQJt8lSbiJE9VWOO7TMfsKzEahMSZB8DhiSrE.KkIaaTXNe
创建prometheus配置文件
mkdir -p /opt/prometheus
cd /opt/prometheus
#创建 webconfig.yml prometheus.yml
[root@01 prometheus]# cat webconfig.yml
basic_auth_users:
user1: $2b$12$WE7uLSpQJt8lSbiJE9VWOO7TMfsKzEahMSZB8DhiSrE.KkIaaTXNe
[root@01 prometheus]# cat prometheus.yml
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets: ['alertmanagerIP:9093']
basic_auth:
username: user1
password: password
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
#可以相对路径也可以绝对路径
rule_files:
- "/etc/prometheus/first_rules.yml"
# - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label to any timeseries scraped from this config.
- job_name: 'prometheus'
#明文密码 这个如果不配置 Targets页面prometheus显示不健康 401 Unauthorized
basic_auth:
username: user1
password: password
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
#监听的地址
- targets: ['localhost:9090']
#监控的本机 已经安装node-exporter
- job_name: 'centos6_test'
static_configs:
- targets: ['localhost:9100'] #node-exporter地址
labels:
appname: 'centos6' #添加的标签
如果上面的内容复制出来后 YAML 缩进格式不对,可以让 ChatGPT、文心一言等工具帮忙校正格式:
把内容发送过去,然后说"将上面的yaml文件转为正确的格式"
规则rule文件示例
groups:
- name: Host
rules:
- alert: HostMemory Usage
expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 90
for: 1m
labels:
name: Memory
severity: Warning
annotations:
summary: "{{ $labels.appname }}"
description: "宿主机内存使用率超过90%."
value: "{{ $value }}"
- alert: HostCPU Usage
expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.8
for: 1m
labels:
name: CPU
severity: Warning
annotations:
summary: "{{ $labels.appname }}"
description: "宿主机CPU使用率超过80%."
value: "{{ $value }}"
- alert: HostLoad
expr: node_load5 > 20
for: 1m
labels:
name: Load
severity: Warning
annotations:
summary: "{{ $labels.appname }}"
description: "主机负载5分钟超过20."
value: "{{ $value }}"
- alert: HostFilesystem Usage
expr: (node_filesystem_size_bytes{device=~"/dev/.*"} - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 90
for: 1m
labels:
name: Disk
severity: Warning
annotations:
summary: "{{ $labels.appname }}"
description: "宿主机 [{{ $labels.mountpoint }}] 分区使用超90%."
value: "{{ $value }}%"
- alert: HostDiskio writes
expr: irate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10
for: 1m
labels:
name: Diskio
severity: Warning
annotations:
summary: "{{ $labels.appname }}"
description: "宿主机 [{{ $labels.device }}] 磁盘1分钟平均写入IO负载较高."
value: "{{ $value }}iops"
- alert: HostDiskio reads
expr: irate(node_disk_reads_completed_total{job=~"Host"}[1m]) > 10
for: 1m
labels:
name: Diskio
severity: Warning
annotations:
summary: "{{ $labels.appname }}"
description: "宿机 [{{ $labels.device }}] 磁盘1分钟平均读取IO负载较高."
value: "{{ $value }}iops"
- alert: HostNetwork_receive
expr: irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 10
for: 1m
labels:
name: Network_receive
severity: Warning
annotations:
summary: "{{ $labels.appname }}"
description: "宿主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过10Mbps."
value: "{{ $value }}3Mbps"
- alert: HostNetwork_transmit
expr: irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 10
for: 1m
labels:
name: Network_transmit
severity: Warning
annotations:
summary: "{{ $labels.appname }}"
description: "宿主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过10Mbps."
value: "{{ $value }}3Mbps"
规则中使用 node_filesystem_size_bytes{device=~"/dev/.*"} 过滤设备,是为了防止同一个挂载点查询出两条告警(一条 device 为 /dev/vda1,一条 device 为 rootfs),不加过滤时收到的告警如下:
[RESOLVED] HostFilesystem Usage
Alerts Resolved
[WARNING] centos6
Description: 宿主机 [/] 分区使用超过80%.
Graph: $📈
Details:
alertname: HostFilesystem Usage
appname: centos6
device: /dev/vda1
fstype: ext4
instance: 172.30.171.61:9100
job: centos6_test
mountpoint: /
name: Disk
[WARNING] centos6
Description: 宿主机 [/] 分区使用超过80%.
Graph: $📈
Details:
alertname: HostFilesystem Usage
appname: centos6
device: rootfs
fstype: rootfs
instance: 172.30.171.61:9100
job: centos6_test
mountpoint: /
name: Disk
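promtool检查配置文件格式是否正确(注意:进入容器后要使用容器内的路径。按照下文 docker run 的 -v /opt/prometheus/:/etc/prometheus/ 挂载,容器内的配置文件是 /etc/prometheus/prometheus.yml,而不是宿主机上的 /opt/prometheus/prometheus.yml)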
docker exec -it prometheus sh
promtool check config /opt/prometheus/prometheus.yml
docker启动prometheus
原神启动 --net=host 看自己需求(使用 --net=host 时容器直接使用宿主机网络,-p 端口映射不起作用,两者选其一即可)
web.enable-lifecycle 可以配置热重载
web.external-url 配置外网地址 不然收到告警点开是容器id:9090 类似http://e135841acce2:9090
docker run -d -p 9090:9090 \
-v /opt/prometheus/:/etc/prometheus/ \
-v "/etc/localtime:/etc/localtime" \
--name prometheus --net=host prom/prometheus:latest \
--config.file=/etc/prometheus/prometheus.yml \
--web.enable-lifecycle \
--web.config.file=/etc/prometheus/webconfig.yml \
--web.external-url='http://外网地址:9090'
触发配置重载
curl -X POST -u user1:password http://localhost:9090/-/reload
启动钉钉webhook
旧版一键启动
docker run -d -p 8060:8060 \
--name webhook1 timonwong/prometheus-webhook-dingtalk:v1.4.0 \
--ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=511cb5651xxxae2267xxxxx649c86c511579bf5dd7dbc"
prometheus-webhook-dingtalk最新版是2.1,需要配置文件启动,图省事用的v1.4.0
新版启动
可以参考文章: https://blog.csdn.net/qq_34936628/article/details/125790444
2.1新版配置文件config.yml
## Request timeout
# timeout: 5s
## Customizable templates path
# 此处进行告警模板的指定,不要同时使用 --template.file= 进行指定,否则会报错:只会识别 template file,而不去识别指定的 dingtalk webhook 配置文件
# templates:
# - /usr/local/prometheus-webhook-dingtalk/template/*.tmp
targets:
webhook1:
url: https://oapi.dingtalk.com/robot/send?access_token=f50b6f2f31f6192fcfa0f27ddceaac07f1b7b5d27c3289cf04301014fa1b386d
# secret for signature
secret: SEC5691a8d16e0fc926f81f0348d4bd4ce0cce5c207e1b71cadca79598b3905018c
新版启动
docker run -d --restart always -p 8060:8060 \
--name webhook-dingding \
-v /opt/prometheus/config.yml:/etc/prometheus/config.yml \
-v /etc/localtime:/etc/localtime \
timonwong/prometheus-webhook-dingtalk
docker启动alertmanager
alertmanager.yml
/opt/alertmanager/alertmanager.yml
url为上面webhook1容器地址
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 30s
group_interval: 1m
repeat_interval: 2m
receiver: webhook1
receivers:
- name: 'webhook1'
webhook_configs:
- url: 'http://172.30.171.61:8060/dingtalk/webhook1/send'
send_resolved: true # 表示服务恢复后会收到恢复告警
# 抑制规则(inhibit_rules,本示例未配置)说明:当已存在的告警匹配 source_match / source_match_re,而新产生的告警匹配 target_match / target_match_re,并且两者在 equal 中定义的标签值完全相同时,则启动抑制机制,新的告警不会发送
webconfig.yml 使用上面的那个就行
cp /opt/prometheus/webconfig.yml /opt/alertmanager/
启动
web.external-url 配置外网地址 不然收到告警点开是容器id:9093 类似http://e135841acce2:9093
docker run -d -p 9093:9093 --name alertmanager \
-v /opt/alertmanager/:/etc/alertmanager/ \
prom/alertmanager:latest \
--config.file=/etc/alertmanager/alertmanager.yml \
--web.config.file=/etc/alertmanager/webconfig.yml \
--web.external-url='http://外网地址:9093'
重载配置
如果修改了配置文件
curl -X POST -u user1:password http://localhost:9093/-/reload
安装node-exporter
https://github.com/prometheus/node_exporter/releases/
下载最新的
例如 https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz
cd /usr/local/
wget https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz
tar xf node_exporter-1.7.0.linux-amd64.tar.gz -C /usr/local/
mv node_exporter-1.7.0.linux-amd64/ node_exporter
cd node_exporter
sudo chmod -R 777 node_exporter
nohup /usr/local/node_exporter/node_exporter >/dev/null 2>&1 &
添加成开机启动项可以参考网上教程
centos6 可以参考https://zhuanlan.zhihu.com/p/670516751
安装grafana
mkdir /opt/grafana-storage && chmod 777 -R /opt/grafana-storage
docker run -d -p 3000:3000 --name=grafana -v /opt/grafana-storage:/var/lib/grafana grafana/grafana
添加prometheus数据源
浏览器打开 本机ip:3000 登录grafana
connections>>datasources>>Add data source
选prometheus
Authentication 选 Basic authentication,输入前面 prometheus 配置的账号密码
save&test
导入模板
打开 serverIP:3000/dashboard/import
主机监控展示看板Node-exporter导入 8919 模板
容器监控展示看板cadvisor-exporter导入193 模板
应用监控展示看板jmx-exporter导入8563 模板
Redis监控展示看板Redis-exporter导入2751 模板
进程监控展示看板Process-exporter导入249 模板
图文可以参考https://www.jianshu.com/p/dde0dc1761ec
测试
可以编辑前面的/opt/prometheus/first_rules.yml (这个是宿主机路径)
进入容器是/etc/prometheus/first_rules.yml
90改成10 或者1
expr: (node_filesystem_size_bytes{device=~"/dev/.*"} - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 10
重载
curl -X POST -u user1:password http://localhost:9090/-/reload
过个几分钟看下钉钉有没收到告警
近期评论