prometheus 监控实战篇

Posted on 2023年9月28日 by hackdl

prometheus 监控

prometheus 监控
- 1.上传tar包
- 2.解压到对应文件夹
- 3.配置开机自启动
- 4.配置Prometheus
- 5.blackbox_exporter 监控网站状态
- 6.alertmanager监控规则路径
  - 6.1目录结构
  - 6.2监控ceph
  - 6.3监控jenkins
  - 6.4监控postgresql
  - 6.5发送邮件模板

1.上传tar包

[root@promethes prometheus]# cd /opt
[root@promethes opt]# ll
total 131628
-rw-r--r--. 1 root root 29254678 Sep  6 17:36 alertmanager-0.25.0.linux-amd64.tar.gz
-rw-r--r--. 1 root root 10649117 Aug 22 10:58 blackbox_exporter-0.22.0.linux-amd64.tar.gz
-rw-r--r--. 1 root root 94876162 Aug 22 14:45 prometheus-2.46.0.linux-amd64.tar.gz
[root@promethes opt]#

2.解压到对应文件夹

[root@promethes opt]# tar xf alertmanager-0.25.0.linux-amd64.tar.gz  -C /usr/local/
[root@promethes local]# ll
total 92660
drwxr-xr-x. 4 3434 3434      123 Sep  6 17:51 alertmanager
drwxr-xr-x. 3 root root       19 Aug 25 14:48 alertmanager_bak
drwxr-xr-x. 2 root root        6 May 16  2022 bin
lrwxrwxrwx. 1 root root       37 Aug 22 11:01 blackbox_exporter -> blackbox_exporter-0.22.0.linux-amd64/
drwxr-xr-x. 2 3434 3434      104 Sep  6 15:26 blackbox_exporter-0.22.0.linux-amd64
drwxr-xr-x. 2 root root        6 May 16  2022 etc
drwxr-xr-x. 2 root root        6 May 16  2022 games
drwxr-xr-x. 2 root root        6 May 16  2022 include
drwxr-xr-x. 2 root root        6 May 16  2022 lib
drwxr-xr-x. 3 root root       17 Aug 10 10:42 lib64
drwxr-xr-x. 2 root root        6 May 16  2022 libexec
drwxr-xr-x. 5 root root      145 Sep  7 11:10 prometheus
drwxr-xr-x. 5 root root      171 Sep  6 16:37 prometheus_bak
-rw-r--r--. 1 root root 94879997 Sep  6 16:33 prometheus.tar
drwxr-xr-x. 2 root root        6 May 16  2022 sbin
drwxr-xr-x. 5 root root       49 Aug 10 10:42 share
drwxr-xr-x. 2 root root        6 May 16  2022 src
[root@promethes local]#

3.配置开机自启动

[root@promethes system]# pwd 
/usr/lib/systemd/system
[root@promethes system]# cat prometheus.service 
[Unit]
Description=https://prometheus.io
[Service]
Restart=on-failure 
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml 
[Install]
WantedBy=multi-user.target

[root@promethes system]# cat alertmanager.service 
[Unit]
Description=AlertManager Server Service daemon
#Wants=network-online.target
#After=network-online.target

[Service]
#User=root
#Group=root
Type=simple
#Restart=on-failure
ExecStart=/usr/local/alertmanager/alertmanager \
    --config.file "/usr/local/alertmanager/alertmanager.yml"
#    --storage.path="/usr/local/alertmanager/data" 
#    --data.retention=120h
#    --alerts.gc-interval=30m 
#    --web.external-url "http://172.30.3.23:9093"
#    --web.listen-address=":9093"

[Install]
WantedBy=multi-user.target

[root@promethes system]#

[root@promethes system]# cat blackbox_exporter.service 
[Unit]
Description=blackbox_exporter
After=network.target

[Service]
User=root
Type=simple
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target
[root@promethes system]#

4.配置Prometheus

[root@promethes prometheus]# pwd 
/usr/local/prometheus
[root@promethes prometheus]# ll
total 236276
drwxr-xr-x. 2 root root        38 Jul 25 21:06 console_libraries
drwxr-xr-x. 2 root root       173 Jul 25 21:06 consoles
drwxr-xr-x. 2 root root        26 Sep  6 16:56 etc.d
-rw-r--r--. 1 root root     11357 Jul 25 21:06 LICENSE
-rw-r--r--. 1 root root      3773 Jul 25 21:06 NOTICE
-rwxr-xr-x. 1 root root 123611355 Jul 25 20:34 prometheus
-rw-r--r--. 1 root root      2749 Sep  7 11:10 prometheus.yml
-rwxr-xr-x. 1 root root 118310964 Jul 25 20:36 promtool

[root@promethes prometheus]# cat prometheus.yml 
# my global config
global:
  scrape_interval:     60s # Scrape targets every 60 seconds (same as the 1 minute default).
  evaluation_interval: 60s # Evaluate rules every 60 seconds (same as the 1 minute default).
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: '？jivest'

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
        - 172.30.3.22:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
#监控规则路径
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  #- rules.yml
  - "/usr/local/alertmanager/rules/*.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
    - targets: ['localhost:9090']
   # 网站监控
  - job_name: 'http_status'
    metrics_path: /probe
    params:
      module: [http_2xx]     # 在blackbox_exporter中定义的模块名
    file_sd_configs:         # 因需要监控的地址很多，我们这里将所有地址独立出来
      - files: 
        - '/usr/local/prometheus/etc.d/job_web.yaml' #监控网站地址
        refresh_interval: 15s
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance        
      - target_label: __address__
        replacement: 172.30.3.22:9115

  - job_name: node
    # grab stats about the local machine by default.
    static_configs:
      - targets:
          - 172.30.3.？:9099
          - 172.30.3.17？:9099
          - 172.30.3.19？:9099



[root@promethes prometheus]#

5.blackbox_exporter 监控网站状态

[root@promethes blackbox_exporter]# ll
total 20284
-rwxr-xr-x. 1 3434 3434 20745692 Aug  2  2022 blackbox_exporter
-rw-r--r--. 1 3434 3434     1503 Aug 22 16:24 blackbox.yml
-rw-r--r--. 1 root root      910 Aug 22 16:21 blackbox.yml.bak
-rw-r--r--. 1 3434 3434    11357 Aug  2  2022 LICENSE
-rw-r--r--. 1 3434 3434       94 Aug  2  2022 NOTICE
[root@promethes blackbox_exporter]# pwd 
/usr/local/blackbox_exporter
[root@promethes blackbox_exporter]# cat blackbox.yml
modules:
  http_2xx:
    prober: http
    timeout: 5s  # 探针检测超时时间
    http:
      valid_status_codes: [] # 有效的状态码，留空时默认接受所有2xx，也可以自己定义，比如你的站点304也可能是正常的
      method: GET            # http使用get请求
      fail_if_body_not_matches_regexp: [] # 对返回结果进行正则匹配，如果未匹配成功则认为失败
      tls_config:
        insecure_skip_verify: true        # 不安全的https跳过确认，如某些证书不合法或者过期，如果你在浏览器访问，那浏览器会让你确认是否继续，这里也是类似的意思。
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  grpc:
    prober: grpc
    grpc:
      tls: true
      preferred_ip_protocol: "ip4"
  grpc_plain:
    prober: grpc
    grpc:
      tls: false
      service: "service1"
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^SSH-2.0-"
      - send: "SSH-2.0-blackbox-ssh-check"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
      - send: "NICK prober"
      - send: "USER prober prober prober :prober"
      - expect: "PING :([^ ]+)"
        send: "PONG ${1}"
      - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
  icmp_ttl5:
    prober: icmp
    timeout: 5s
    icmp:
      ttl: 5
[root@promethes blackbox_exporter]#

6.alertmanager监控规则路径

6.1目录结构

[root@promethes alertmanager]# ll
total 61016
-rwxr-xr-x. 1 3434 3434 34546840 Dec 22  2022 alertmanager
-rw-r--r--. 1 3434 3434     2074 Sep  6 17:51 alertmanager.yml
-rwxr-xr-x. 1 3434 3434 27906085 Dec 22  2022 amtool
-rw-r--r--. 1 3434 3434    11357 Dec 22  2022 LICENSE
-rw-r--r--. 1 3434 3434      457 Dec 22  2022 NOTICE
drwxr-xr-x. 2 root root      103 Sep  7 15:41 rules
drwxr-xr-x. 2 root root       24 Sep  6 17:43 templates
[root@promethes alertmanager]# pwd
/usr/local/alertmanager

[root@promethes alertmanager]# cat alertmanager.yml 
# Sample configuration.
# See https://prometheus.io/docs/alerting/configuration/ for documentation.

global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'smtp.exmail.qq.com:587'
  smtp_from: '*@mails.*jivest.com'
  smtp_hello: '*jivest.com'
  smtp_auth_username: '*@mails.youjivest.com'
  smtp_auth_password: '？？？？？？？？'
# The directory from which notification templates are read.
templates: 
- '/usr/local/alertmanager/templates/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  group_by: ['alertname', 'cluster', 'service']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This way ensures that you get multiple alerts for the same group that start
  # firing shortly after another are batched together on the first 
  # notification.
  group_wait: 30s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend them.
  repeat_interval: 3h

  # A default receiver
  receiver: team-X

  # All the above attributes are inherited by all child routes and can 
  # overwritten on each.

  # The child route trees.
  routes:

# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is 
# already critical.
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  # Apply inhibition if the alertname is the same.
  equal: ['alertname', 'cluster', 'service']


receivers:
- name: 'team-X'
  email_configs:
  - to: 'chenhu@youjivest.com'
#  wechat_configs:
#  - agent_id: '1000002'
#    to_user: '@all'

[root@promethes alertmanager]#

告警模板大全

[root@promethes prometheus]# cd /usr/local/alertmanager/rules
[root@promethes rules]# ll
total 4
-rw-r--r--. 1 root root 3535 Sep  6 17:42 rules.yml
[root@promethes rules]# cat rules.yml 
groups:
- name: 系统盘空间
  rules:
  - alert: node_filesystem_avail_bytes
    expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}

6.2监控ceph

[root@promethes rules]# cat ceph-exporter.yml 
groups:

- name: EmbeddedExporter

  rules:

    - alert: CephState
      expr: 'ceph_health_status != 0'
      for: 30m
      labels:
        severity: critical
      annotations:
        summary: Ceph State (instance {{ $labels.instance }})
        description: "Ceph instance unhealthy\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CephMonitorClockSkew
      expr: 'abs(ceph_monitor_clock_skew_seconds) > 0.2'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Ceph monitor clock skew (instance {{ $labels.instance }})
        description: "Ceph monitor clock skew detected. Please check ntp and hardware clock settings\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

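    - alert: CephMonitorLowSpace
      expr: 'ceph_monitor_avail_percent < 10'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Ceph monitor low space (instance {{ $labels.instance }})
        description: "Ceph monitor storage is low.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CephOsdDown
      expr: 'ceph_osd_up == 0'
      for: 30m
      labels:
        severity: critical
      annotations:
        summary: Ceph OSD Down (instance {{ $labels.instance }})
        description: "Ceph Object Storage Daemon Down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CephHighOsdLatency
      expr: 'ceph_osd_perf_apply_latency_seconds > 5'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Ceph high OSD latency (instance {{ $labels.instance }})
        description: "Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"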

    - alert: CephOsdLowSpace
      expr: 'ceph_osd_utilization > 90'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Ceph OSD low space (instance {{ $labels.instance }})
        description: "Ceph Object Storage Daemon is going out of space. Please add more disks.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

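    - alert: CephOsdReweighted
      expr: 'ceph_osd_weight < 1'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Ceph OSD reweighted (instance {{ $labels.instance }})
        description: "Ceph Object Storage Daemon takes too much time to resize.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CephPgDown
      expr: 'ceph_pg_down > 0'
      for: 30m
      labels:
        severity: critical
      annotations:
        summary: Ceph PG down (instance {{ $labels.instance }})
        description: "Some Ceph placement groups are down. Please ensure that all the data are available.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"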

    - alert: CephPgIncomplete
      expr: 'ceph_pg_incomplete > 0'
      for: 30m
      labels:
        severity: critical
      annotations:
        summary: Ceph PG incomplete (instance {{ $labels.instance }})
        description: "Some Ceph placement groups are incomplete. Please ensure that all the data are available.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CephPgInconsistent
      expr: 'ceph_pg_inconsistent > 0'
      for: 30m
      labels:
        severity: warning
      annotations:
        summary: Ceph PG inconsistent (instance {{ $labels.instance }})
        description: "Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CephPgActivationLong
      expr: 'ceph_pg_activating > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Ceph PG activation long (instance {{ $labels.instance }})
        description: "Some Ceph placement groups are too long to activate.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CephPgBackfillFull
      expr: 'ceph_pg_backfill_toofull > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Ceph PG backfill full (instance {{ $labels.instance }})
        description: "Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CephPgUnavailable
      expr: 'ceph_pg_total - ceph_pg_active > 0'
      for: 30m
      labels:
        severity: critical
      annotations:
        summary: Ceph PG unavailable (instance {{ $labels.instance }})
        description: "Some Ceph placement groups are unavailable.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
[root@promethes rules]#

6.3监控jenkins

[root@promethes rules]# cat jenkins-plugin.yml 
groups:

- name: MetricPlugin

  rules:

    - alert: JenkinsOffline
      expr: 'jenkins_node_offline_value > 1'
      for: 30m
      labels:
        severity: critical
      annotations:
        summary: Jenkins offline (instance {{ $labels.instance }})
        description: "Jenkins offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

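    - alert: JenkinsHealthcheck
      expr: 'jenkins_health_check_score < 1'
      for: 30m
      labels:
        severity: critical
      annotations:
        summary: Jenkins healthcheck (instance {{ $labels.instance }})
        description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JenkinsOutdatedPlugins
      expr: 'sum(jenkins_plugins_withUpdate) by (instance) > 3'
      for: 1d
      labels:
        severity: warning
      annotations:
        summary: Jenkins outdated plugins (instance {{ $labels.instance }})
        description: "{{ $value }} plugins need update\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"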

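    - alert: JenkinsBuildsHealthScore
      expr: 'default_jenkins_builds_health_score < 1'
      for: 30m
      labels:
        severity: critical
      annotations:
        summary: Jenkins builds health score (instance {{ $labels.instance }})
        description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JenkinsRunFailureTotal
      expr: 'delta(jenkins_runs_failure_total[1h]) > 100'
      for: 30m
      labels:
        severity: warning
      annotations:
        summary: Jenkins run failure total (instance {{ $labels.instance }})
        description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"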

    - alert: JenkinsBuildTestsFailing
      expr: 'default_jenkins_builds_last_build_tests_failing > 0'
      for: 30m
      labels:
        severity: warning
      annotations:
        summary: Jenkins build tests failing (instance {{ $labels.instance }})
        description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: JenkinsLastBuildFailed
      expr: 'default_jenkins_builds_last_build_result_ordinal == 2'
      for: 30m
      labels:
        severity: warning
      annotations:
        summary: Jenkins last build failed (instance {{ $labels.instance }})
        description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
[root@promethes rules]#

6.4监控postgresql

[root@promethes rules]# cat postgres-exporter.yml 
groups:

- name: PostgresExporter

  rules:

    - alert: PostgresqlDown
      expr: 'pg_up == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Postgresql down (instance {{ $labels.instance }})
        description: "Postgresql instance is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

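    - alert: PostgresqlRestarted
      expr: 'time() - pg_postmaster_start_time_seconds < 60'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Postgresql restarted (instance {{ $labels.instance }})
        description: "Postgresql restarted\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlExporterError
      expr: 'pg_exporter_last_scrape_error > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Postgresql exporter error (instance {{ $labels.instance }})
        description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"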

    - alert: PostgresqlTableNotAutoVacuumed
      expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
        description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlTableNotAutoAnalyzed
      expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
        description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlTooManyConnections
      expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Postgresql too many connections (instance {{ $labels.instance }})
        description: "PostgreSQL instance has too many connections (> 80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

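    - alert: PostgresqlNotEnoughConnections
      expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Postgresql not enough connections (instance {{ $labels.instance }})
        description: "PostgreSQL instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"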

    - alert: PostgresqlDeadLocks
      expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Postgresql dead locks (instance {{ $labels.instance }})
        description: "PostgreSQL has dead-locks\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlHighRollbackRate
      expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Postgresql high rollback rate (instance {{ $labels.instance }})
        description: "Ratio of transactions being aborted compared to committed is > 2 %\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

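    - alert: PostgresqlCommitRateLow
      expr: 'rate(pg_stat_database_xact_commit[1m]) < 10'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Postgresql commit rate low (instance {{ $labels.instance }})
        description: "Postgresql seems to be processing very few transactions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlLowXidConsumption
      expr: 'rate(pg_txid_current[1m]) < 5'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Postgresql low XID consumption (instance {{ $labels.instance }})
        description: "Postgresql seems to be consuming transaction IDs very slowly\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlHighRateStatementTimeout
      expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
        description: "Postgres transactions showing high rate of statement timeouts\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"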

    - alert: PostgresqlHighRateDeadlock
      expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
        description: "Postgres detected deadlocks\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlUnusedReplicationSlot
      expr: 'pg_replication_slots_active == 0'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Postgresql unused replication slot (instance {{ $labels.instance }})
        description: "Unused Replication Slots\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlTooManyDeadTuples
      expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
        description: "PostgreSQL dead tuples is too large\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlConfigurationChanged
      expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Postgresql configuration changed (instance {{ $labels.instance }})
        description: "Postgres Database configuration change has occurred\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlSslCompressionActive
      expr: 'sum(pg_stat_ssl_compression) > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Postgresql SSL compression active (instance {{ $labels.instance }})
        description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlTooManyLocksAcquired
      expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
        description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlBloatIndexHigh(>80%)
      expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
        description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlBloatTableHigh(>80%)
      expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
        description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
[root@promethes rules]#

6.5发送邮件模板

[root@promethes templates]# cat email.tmpl 
{{ define "test.html" }}   
{{ range .Alerts }}
=========start==========

告警程序: {{ .Labels.job }} 

告警级别: {{ .Labels.severity }} 级 

告警类型: {{ .Labels.alertname }} 

故障主机: {{ .Labels.instance }} 

告警主题: {{ .Annotations.summary }} 

触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} 

=========end==========

{{ end }}
{{ end }}

[root@promethes templates]#

服务器托管，北京服务器托管，服务器租用 http://www.fwqtg.net
机房租用，北京机房租用，IDC机房托管， http://www.fwqtg.net

相关推荐: flutter系列之:做一个图像滤镜

目录简介我们的目标带滤镜的图片打造filter按钮打造可滑动按钮最后要解决的问题简介很多时候，我们需要一些特效功能，比如给图片做个滤镜什么的，如果是h5页面，那么我们可以很容易的通过css滤镜来实现这个功能。那么如果在flutter中，如果要…

服务器托管，北京服务器托管，服务器租用，机房机柜带宽租用

服务器托管

咨询：董先生

电话13051898268 QQ/微信93663045！

上一篇: RDIFramework.NET ━ .NET快速信息化系统开发框架记录所有操作的Sql
下一篇: 【深度学习】3-3 神经网络的学习- 导数&梯度