Monitor Examples - fleXRPL/datadog-monitor-deployer GitHub Wiki

Monitor Examples

This page provides examples of different types of monitors that can be created using the DataDog Monitor Deployer.

Metric Monitors

CPU Usage Monitor

# Metric monitor: alerts when average user-CPU usage stays high.
monitors:
  - name: 'High CPU Usage Alert'
    type: 'metric alert'
    # 5-minute average of system.cpu.user across every scope ({*}),
    # compared against 80.
    query: 'avg(last_5m):avg:system.cpu.user{*} > 80'
    # Notification body. The {{#is_alert}} / {{#is_recovery}} sections are
    # conditional template blocks rendered only for that state transition.
    message: |
      CPU usage is above 80%

      {{#is_alert}}
      System is experiencing high CPU usage.
      Please investigate immediately.
      {{/is_alert}}

      {{#is_recovery}}
      CPU usage has returned to normal levels.
      {{/is_recovery}}
    tags:
      - 'env:production'
      - 'service:web'
    options:
      # Also notify when the metric stops reporting.
      notify_no_data: true
      # NOTE(review): Datadog documents evaluation_delay in seconds, so this
      # would be a 15-minute delay - confirm that is intended.
      evaluation_delay: 900
      thresholds:
        # critical mirrors the value used in the query above.
        critical: 80
        warning: 70

Memory Usage Monitor

# Metric monitor: alerts when the share of memory in use exceeds 90%.
monitors:
  - name: "Memory Usage Alert"
    type: "metric alert"
    # used / total * 100 -> percentage of memory IN USE, averaged over 5 minutes.
    query: "avg(last_5m):avg:system.mem.used{*} / avg:system.mem.total{*} * 100 > 90"
    # {{value}} is the result of the query above, i.e. the used-memory
    # percentage, so the message labels it as usage rather than as
    # available memory.
    message: |
      Memory usage is above 90%

      {{#is_alert}}
      Memory usage: {{value}}%
      Please check for memory leaks or increase capacity.
      {{/is_alert}}
    tags:
      - "env:production"
      - "service:app"
    options:
      thresholds:
        # critical mirrors the value used in the query above.
        critical: 90
        warning: 80

Log Monitors

Error Rate Monitor

# Log monitor: alerts on a burst of error-status log events.
monitors:
  - name: 'High Error Rate Alert'
    type: 'log alert'
    # Counts logs matching status:error across all indexes ("*") over the
    # last 5 minutes and compares the count against 100.
    query: 'logs("status:error").index("*").rollup("count").last("5m") > 100'
    # Notification body; the {{#is_alert}} section is rendered only on alert.
    message: |
      High number of errors detected

      {{#is_alert}}
      Error count: {{value}}
      Please check the logs for more details.
      {{/is_alert}}
    tags:
      - 'env:production'
      - 'service:api'
    options:
      thresholds:
        # critical mirrors the value used in the query above.
        critical: 100
        warning: 50

Missing Logs Monitor

# Log monitor: alerts when the "main" index receives no logs at all.
monitors:
  - name: 'Missing Logs Alert'
    type: 'log alert'
    # Counts every log ("*") in the "main" index over the last 10 minutes;
    # a count below 1 means nothing arrived.
    query: 'logs("*").index("main").rollup("count").last("10m") < 1'
    # Notification body; the {{#is_alert}} section is rendered only on alert.
    message: |
      No logs received in the last 10 minutes

      {{#is_alert}}
      Please check if the logging service is running.
      {{/is_alert}}
    tags:
      - 'env:production'
      - 'service:logging'
    options:
      notify_no_data: true
      # NOTE(review): Datadog documents no_data_timeframe in minutes - confirm
      # 10 is the intended window relative to the 10m query timeframe.
      no_data_timeframe: 10

APM Monitors

Latency Monitor

# APM monitor: alerts on slow HTTP request handling in production.
monitors:
  - name: 'High Latency Alert'
    type: 'query alert'
    # 5-minute average of trace.http.request.duration, scoped to
    # env:production, compared against 2.
    query: 'avg(last_5m):avg:trace.http.request.duration{env:production} > 2'
    # Notification body; the message presents {{value}} in seconds.
    message: |
      High API latency detected

      {{#is_alert}}
      Average response time: {{value}}s
      Please check the API performance.
      {{/is_alert}}
    tags:
      - 'env:production'
      - 'service:api'
    options:
      thresholds:
        # critical mirrors the value used in the query above.
        critical: 2
        warning: 1

API Error Rate Monitor

# APM monitor: alerts when the share of failed servlet requests is too high.
monitors:
  - name: 'API Error Rate Alert'
    type: 'query alert'
    # errors / hits * 100 -> error percentage over the last 5 minutes for
    # env:production, compared against 5.
    query: 'sum(last_5m):sum:trace.servlet.request.errors{env:production} / sum:trace.servlet.request.hits{env:production} * 100 > 5'
    # Notification body; {{value}} is the percentage computed by the query.
    message: |
      High API error rate detected

      {{#is_alert}}
      Error rate: {{value}}%
      Please investigate API issues.
      {{/is_alert}}
    tags:
      - 'env:production'
      - 'service:api'

Process Monitors

Process Check

# Process monitor: alerts when no nginx process is found.
monitors:
  - name: 'Process Check Alert'
    type: 'process alert'
    # Counts processes matching "nginx" over every scope ("*") for the last
    # 5 minutes; a count below 1 means none are running.
    query: 'processes("nginx").over("*").rollup("count").last("5m") < 1'
    # Notification body; the {{#is_alert}} section is rendered only on alert.
    message: |
      Nginx process not running

      {{#is_alert}}
      No Nginx processes found.
      Please check the service status.
      {{/is_alert}}
    tags:
      - 'env:production'
      - 'service:web'
    options:
      notify_no_data: true
      # NOTE(review): Datadog documents no_data_timeframe in minutes - confirm.
      no_data_timeframe: 10

Network Monitors

Network Traffic Monitor

# Metric monitor: alerts on unusually high inbound network throughput.
monitors:
  - name: "High Network Traffic Alert"
    type: "metric alert"
    # 5-minute average of system.net.bytes_rcvd across every scope ({*}),
    # compared against 1,000,000,000.
    query: "avg(last_5m):avg:system.net.bytes_rcvd{*} > 1000000000"
    # system.net.bytes_rcvd is reported by Datadog as a per-second rate, so
    # {{value}} is labelled bytes/s rather than a plain byte count.
    message: |
      High network traffic detected

      {{#is_alert}}
      Current traffic: {{value}} bytes/s
      Please investigate network usage.
      {{/is_alert}}
    tags:
      - "env:production"
      - "service:network"

Composite Monitors

Service Health Monitor

# Composite monitor: combines two existing monitors into one alert.
monitors:
  - name: 'Service Health Alert'
    type: 'composite'
    # Reference existing monitor IDs; "&&" requires both to be triggered.
    query: '12345 && 67890'
    # Notification body; the {{#is_alert}} section is rendered only on alert.
    message: |
      Service health check failed

      {{#is_alert}}
      Both CPU and Memory thresholds exceeded.
      Critical service degradation detected.
      {{/is_alert}}
    tags:
      - 'env:production'
      - 'service:critical'

Event Monitors

Deployment Monitor

# Event monitor: alerts as soon as a failed deployment event is seen.
monitors:
  - name: "Failed Deployment Alert"
    type: "event alert"
    # The status:failed restriction lives inside the events("...") search
    # string: Datadog's documented event query grammar is
    # events(<search>).rollup(...)[.by(...)].last(...) <op> <n> and has no
    # trailing .filter() call. Results are still grouped by status.
    # NOTE(review): newer Datadog accounts use the "event-v2 alert" type for
    # this query form - confirm which type the deployer expects.
    query: 'events("deployment status:failed").rollup("count").by("status").last("5m") > 0'
    # Notification body; the {{#is_alert}} section is rendered only on alert.
    message: |
      Deployment failure detected

      {{#is_alert}}
      Please check deployment logs and rollback if necessary.
      {{/is_alert}}
    tags:
      - "env:production"
      - "team:devops"

Additional Resources