docker部署启动prometheus+alertmanager+grafana+钉钉告警并增加密码验证配置文件

内容目录

环境

一台老的 CentOS 6 机器,Docker 版本为 1.7.1
无法使用 docker-compose

密码加密生成

vim encrypt.py

import getpass
import bcrypt

password = getpass.getpass("password: ")
hashed_password = bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt())
print(hashed_password.decode())

然后运行脚本生成加密后的密码

[root@i ~]# python3 encrypt.py 
password: 
$2b$12$WE7uLSpQJt8lSbiJE9VWOO7TMfsKzEahMSZB8DhiSrE.KkIaaTXNe

创建prometheus配置文件

mkdir -p /opt/prometheus
cd /opt/prometheus
#创建 webconfig.yml prometheus.yml
[root@01 prometheus]# cat webconfig.yml 
basic_auth_users:
  user1: $2b$12$WE7uLSpQJt8lSbiJE9VWOO7TMfsKzEahMSZB8DhiSrE.KkIaaTXNe
[root@01 prometheus]# cat prometheus.yml 
# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets: ['alertmanagerIP:9093']
    basic_auth:
      username: user1
      password: password

      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
#可以相对路径也可以绝对路径
rule_files:
  - "/etc/prometheus/first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label  to any timeseries scraped from this config.
  - job_name: 'prometheus'
  #明文密码 这个如果不配置 Targets页面prometheus显示不健康 401 Unauthorized
    basic_auth:
      username: user1
      password: password
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
    #监听的地址
    - targets: ['localhost:9090']
    #监控的本机 已经安装node-exporter
  - job_name: 'centos6_test'
    static_configs:
      - targets: ['localhost:9100']  #node-exporter地址
        labels:
          appname: 'centos6' #添加的标签

如果上面的内容复制出来后 YAML 格式不对,可以让 OpenAI(ChatGPT)或文心一言帮忙校正格式
把内容发送过去后输入:将上面的yaml文件转为正确的格式

规则rule文件示例

groups:
- name: Host
  rules:
  - alert: HostMemory Usage
    expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 >  90
    for: 1m
    labels:
      name: Memory
      severity: Warning
    annotations:
      summary: "{{ $labels.appname }}"
      description: "宿主机内存使用率超过90%."
      value: "{{ $value }}"
  - alert: HostCPU Usage
    expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.8
    for: 1m
    labels:
      name: CPU
      severity: Warning
    annotations:
      summary: "{{ $labels.appname }}"
      description: "宿主机CPU使用率超过80%."
      value: "{{ $value }}"
  - alert: HostLoad
    expr: node_load5 > 20
    for: 1m
    labels:
      name: Load
      severity: Warning
    annotations:
      summary: "{{ $labels.appname }}"
      description: "主机负载5分钟超过20."
      value: "{{ $value }}"
  - alert: HostFilesystem Usage
    expr: (node_filesystem_size_bytes{device=~"/dev/.*"} - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 90
    for: 1m
    labels:
      name: Disk
      severity: Warning
    annotations:
      summary: "{{ $labels.appname }}"
      description: "宿主机 [{{ $labels.mountpoint }}] 分区使用超90%."
      value: "{{ $value }}%"
  - alert: HostDiskio writes
    expr: irate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10
    for: 1m
    labels:
      name: Diskio
      severity: Warning
    annotations:
      summary: "{{ $labels.appname }}"
      description: "宿主机 [{{ $labels.device }}] 磁盘1分钟平均写入IO负载较高."
      value: "{{ $value }}iops"
  - alert: HostDiskio reads
    expr: irate(node_disk_reads_completed_total{job=~"Host"}[1m]) > 10
    for: 1m
    labels:
      name: Diskio
      severity: Warning
    annotations:
      summary: "{{ $labels.appname }}"
      description: "宿主机 [{{ $labels.device }}] 磁盘1分钟平均读取IO负载较高."
      value: "{{ $value }}iops"
  - alert: HostNetwork_receive
    expr: irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 10
    for: 1m
    labels:
      name: Network_receive
      severity: Warning
    annotations:
      summary: "{{ $labels.appname }}"
      description: "宿主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过10Mbps."
      value: "{{ $value }}3Mbps"
  - alert: HostNetwork_transmit
    expr: irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 10
    for: 1m
    labels:
      name: Network_transmit
      severity: Warning
    annotations:
      summary: "{{ $labels.appname }}"
      description: "宿主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过10Mbps."
      value: "{{ $value }}3Mbps"

表达式里用 node_filesystem_size_bytes{device=~"/dev/.*"} 过滤设备,是为了防止同一个挂载点查询出两条告警(一条 device 为 /dev/vda1,一条为 rootfs),不过滤时收到的告警如下:

[RESOLVED] HostFilesystem Usage
Alerts Resolved
[WARNING] centos6
Description: 宿主机 [/] 分区使用超过80%.
Graph: $📈
Details:

alertname: HostFilesystem Usage
appname: centos6
device: /dev/vda1
fstype: ext4
instance: 172.30.171.61:9100
job: centos6_test
mountpoint: /
name: Disk

[WARNING] centos6
Description: 宿主机 [/] 分区使用超过80%.
Graph: $📈
Details:

alertname: HostFilesystem Usage
appname: centos6
device: rootfs
fstype: rootfs
instance: 172.30.171.61:9100
job: centos6_test
mountpoint: /
name: Disk

promtool检查配置文件格式是否正确

docker exec -it prometheus sh
promtool check config /etc/prometheus/prometheus.yml

docker启动prometheus

原神启动 --net=host 看自己需求
web.enable-lifecycle 可以配置热重载
web.external-url 配置外网地址 不然收到告警点开是容器id:9090 类似http://e135841acce2:9090

docker run -d -p 9090:9090 \
-v  /opt/prometheus/:/etc/prometheus/  \
-v "/etc/localtime:/etc/localtime"  \
--name prometheus --net=host prom/prometheus:latest  \
--config.file=/etc/prometheus/prometheus.yml  \
--web.enable-lifecycle  \
--web.config.file=/etc/prometheus/webconfig.yml \
--web.external-url='http://外网地址:9090'

触发配置重载

curl -X POST -u user1:password http://localhost:9090/-/reload

启动钉钉webhook

旧版一键启动

docker run -d -p 8060:8060 \
--name webhook1  timonwong/prometheus-webhook-dingtalk:v1.4.0 \
--ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=511cb5651xxxae2267xxxxx649c86c511579bf5dd7dbc" 

prometheus-webhook-dingtalk最新版是2.1,需要配置文件启动,图省事用的v1.4.0

新版启动

可以参考文章: https://blog.csdn.net/qq_34936628/article/details/125790444

2.1新版配置文件config.yml

## Request timeout
# timeout: 5s

## Customizable templates path
#    此处进行告警模板的指定,不要同时使用 --template.file= 进行指定,否则会报错,只会识别 templatefile 而不去识别指定dingdingwebhook的 配置文件
# templates:
#    - /usr/local/prometheus-webhook-dingtalk/template/*.tmp
targets:
    webhook1:
        url: https://oapi.dingtalk.com/robot/send?access_token=f50b6f2f31f6192fcfa0f27ddceaac07f1b7b5d27c3289cf04301014fa1b386d
        # secret for signature
        secret: SEC5691a8d16e0fc926f81f0348d4bd4ce0cce5c207e1b71cadca79598b3905018c

新版启动

 docker run -d --restart always -p 8060:8060  \
--name webhook-dingding  \
-v /opt/prometheus/config.yml:/etc/prometheus/config.yml  \
-v /etc/localtime:/etc/localtime  \
 timonwong/prometheus-webhook-dingtalk

docker启动alertmanager

alertmanager.yml

/opt/alertmanager/alertmanager.yml
url为上面webhook1容器地址

global:
    resolve_timeout: 5m
route:
    group_by: ['alertname']
    group_wait: 30s
    group_interval: 1m
    repeat_interval: 2m
    receiver: webhook1
receivers:
  - name: 'webhook1'
    webhook_configs:
      - url: 'http://172.30.171.61:8060/dingtalk/webhook1/send'
        send_resolved: true     # 表示服务恢复后会收到恢复告警
#    当已经发送的告警通知匹配到target_match和target_match_re规则,当有新的告警规则如果满足source_match或者定义的匹配规则,并且已发送的告警与新产生的告警中equal定义的标签完全相同,则启动抑制机制,新的告警不会发送    

webconfig.yml 使用上面的那个就行

cp /opt/prometheus/webconfig.yml  /opt/alertmanager/

启动

web.external-url 配置外网地址 不然收到告警点开是容器id:9093 类似http://e135841acce2:9093

docker run -d -p 9093:9093 --name alertmanager \
 -v /opt/alertmanager/:/etc/alertmanager/ \
 prom/alertmanager:latest \
 --config.file=/etc/alertmanager/alertmanager.yml \
 --web.config.file=/etc/alertmanager/webconfig.yml \
 --web.external-url='http://外网地址:9093'

重载配置

如果修改了配置文件

curl -X POST -u user1:password http://localhost:9093/-/reload

安装node-exporter

https://github.com/prometheus/node_exporter/releases/
下载最新的
例如 https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz

cd /usr/local/ 
wget https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz
tar xf node_exporter-1.7.0.linux-amd64.tar.gz -C /usr/local/
mv node_exporter-1.7.0.linux-amd64/ node_exporter
cd node_exporter
sudo chmod -R 777 node_exporter
nohup /usr/local/node_exporter/node_exporter >/dev/null 2>&1 &

添加成开机启动项可以参考网上教程
centos6 可以参考https://zhuanlan.zhihu.com/p/670516751

安装grafana

mkdir /opt/grafana-storage && chmod 777 -R /opt/grafana-storage
docker run -d   -p 3000:3000   --name=grafana   -v /opt/grafana-storage:/var/lib/grafana   grafana/grafana

添加prometheus数据源

本机ip:3000 登录grafana
connections>>datasources>>Add data source
选prometheus
Authentication 选 Basic authentication,输入前面 prometheus 配置的账号密码
save&test

导入模板

打开 serverIP:3000/dashboard/import
主机监控展示看板Node-exporter导入 8919 模板
容器监控展示看板cadvisor-exporter导入193 模板
应用监控展示看板jmx-exporter导入8563 模板
Redis监控展示看板Redis-exporter导入2751 模板
进程监控展示看板Process-exporter导入249 模板

图文可以参考https://www.jianshu.com/p/dde0dc1761ec

测试

可以编辑前面的/opt/prometheus/first_rules.yml (这个是宿主机路径)
进入容器是/etc/prometheus/first_rules.yml
90改成10 或者1

 expr: (node_filesystem_size_bytes{device=~"/dev/.*"} - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 10

重载

curl -X POST -u user1:password http://localhost:9090/-/reload

过个几分钟看下钉钉有没收到告警

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注