Prometheus rules for Telegraf - krreddy123/Prometheus GitHub Wiki

Prometheus rules for Telegraf

  # Alerting rules for Windows pods, built on the win_proc_* metrics referenced
  # in the expressions below. Every rule must hold for 5 minutes before firing
  # and is labelled severity=critical.
  # NOTE(review): this list item belongs under the top-level `groups:` key of a
  # Prometheus rule file; that key is not visible in this excerpt.
  - name: WINDOWS POD
    rules:
      # Per-pod CPU usage above 80 %.
      - alert: Windows pod CPU usage
        # The divisor 4 is the number of CPU cores; adjust it to the node size.
        # NOTE(review): irate() is meant for counters - confirm that this
        # metric is collected as a raw counter and not as a formatted gauge.
        expr: (avg(irate(win_proc_Percent_Processor_Time[5m])) by (pod) / 4) * 100 > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          title: "Windows pod CPU usage great then 80%."
          description: "Windows pod {{ $labels.pod }} CPU usage on node great then 80%."

      # Per-pod working set above 90 % of the summed container memory limits.
      - alert: Windows pod RAM usage
        # Folded scalar: the lines below join with single spaces into one
        # PromQL expression. Both sides are grouped by (pod, namespace) so the
        # division matches series one-to-one.
        expr: >-
          (sum( win_proc_Working_Set ) by (pod, namespace)
          / sum( kube_pod_container_resource_limits_memory_bytes ) by (pod, namespace) )
          * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          title: "Windows pod RAM usage great then 90 % of it's limit lasts 5 minutes"
          description: "Windows pod {{ $labels.pod }} RAM usage great then 90 % of it's limit lasts 5 minutes."

      # More than 300 threads in the IIS worker process, summed per host.
      - alert: Windows pod IIS thread count
        # NOTE(review): Prometheus regex matchers are fully anchored, so
        # `=~"w3wp"` matches only the exact value "w3wp"; confirm that no
        # worker instances are reported under a suffixed name.
        expr: sum(win_proc_Thread_Count{exported_instance=~"w3wp"}) by (host) > 300
        for: 5m
        labels:
          severity: critical
        annotations:
          title: >-
            Windows pod monitoring detected more then 300 threads lasts 5 minutes
            for application IIS w3svc process
          # `sum ... by (host)` keeps only the `host` label, so the message
          # must reference it; `$labels.pod` would always render empty here.
          description: >-
            Windows pod {{ $labels.host }} monitoring detected more then 300 threads
            lasts 5 minutes for IIS w3svc process

      # More than 800 threads across all processes, summed per host.
      - alert: Windows pod thread count
        expr: sum(win_proc_Thread_Count) by (host) > 800
        for: 5m
        labels:
          severity: critical
        annotations:
          title: "Windows pod monitoring detected more then 800 threads lasts 5 minutes"
          # `sum ... by (host)` keeps only the `host` label, so the message
          # must reference it; `$labels.pod` would always render empty here.
          description: "Windows pod {{ $labels.host }} monitoring detected more then 800 threads lasts 5 minutes"