Monitoring & Logging Setup

#AutoSDLC #Monitoring #Logging #Observability

Overview

The Monitoring & Logging Setup provides comprehensive observability for the AutoSDLC system, including agent performance tracking, system health monitoring, distributed tracing, and centralized logging. This guide covers the implementation of monitoring infrastructure for both development and production environments.

Architecture

Monitoring Stack

graph TB
    subgraph "Data Sources"
        AG[Agents]
        API[API Servers]
        MCP[MCP Servers]
        DB[Databases]
        K8S[Kubernetes]
    end
    
    subgraph "Collection Layer"
        PROM[Prometheus]
        LOKI[Loki]
        TEMPO[Tempo]
        OT[OpenTelemetry]
    end
    
    subgraph "Storage"
        TSDB[(Time Series DB)]
        LOGS[(Log Storage)]
        TRACES[(Trace Storage)]
    end
    
    subgraph "Visualization"
        GRAF[Grafana]
        ALERT[Alertmanager]
        DASH[Dashboards]
    end
    
    AG & API & MCP & DB & K8S --> OT
    OT --> PROM & LOKI & TEMPO
    PROM --> TSDB
    LOKI --> LOGS
    TEMPO --> TRACES
    TSDB & LOGS & TRACES --> GRAF
    PROM --> ALERT
    GRAF --> DASH

Key Components

monitoring_stack:
  metrics:
    prometheus:
      version: "2.45.0"
      retention: "30d"
      scrape_interval: "15s"
      
  logging:
    loki:
      version: "2.9.0"
      retention: "90d"
      ingestion_rate: "10MB/s"
      
  tracing:
    tempo:
      version: "2.3.0"
      retention: "7d"
      
  visualization:
    grafana:
      version: "10.2.0"
      dashboards:
        - system-overview
        - agent-performance
        - api-metrics
        - error-analysis
        
  alerting:
    alertmanager:
      version: "0.26.0"
      receivers:
        - pagerduty
        - slack
        - email

Metrics Collection

Agent Metrics

// agents/base/metrics.ts
import { Counter, Histogram, Gauge } from 'prom-client';

export class AgentMetrics {
  // Task metrics
  private tasksReceived = new Counter({
    name: 'agent_tasks_received_total',
    help: 'Total number of tasks received',
    labelNames: ['agent_type', 'task_type']
  });
  
  private tasksCompleted = new Counter({
    name: 'agent_tasks_completed_total',
    help: 'Total number of tasks completed',
    labelNames: ['agent_type', 'task_type', 'status']
  });
  
  private taskDuration = new Histogram({
    name: 'agent_task_duration_seconds',
    help: 'Task execution duration in seconds',
    labelNames: ['agent_type', 'task_type'],
    buckets: [0.1, 0.5, 1, 5, 10, 30, 60, 300, 600]
  });
  
  // TDD metrics
  private testsRun = new Counter({
    name: 'agent_tests_run_total',
    help: 'Total number of test runs',
    labelNames: ['agent_type', 'phase']
  });
  
  private testResults = new Gauge({
    name: 'agent_test_results',
    help: 'Test results (passing/total)',
    labelNames: ['agent_type', 'result_type']
  });
  
  private tddPhase = new Gauge({
    name: 'agent_tdd_phase',
    help: 'Current TDD phase (1=red, 2=green, 3=refactor)',
    labelNames: ['agent_type']
  });
  
  // Resource metrics
  private cpuUsage = new Gauge({
    name: 'agent_cpu_usage_percent',
    help: 'CPU usage percentage',
    labelNames: ['agent_type', 'agent_id']
  });
  
  private memoryUsage = new Gauge({
    name: 'agent_memory_usage_bytes',
    help: 'Memory usage in bytes',
    labelNames: ['agent_type', 'agent_id']
  });
  
  // Communication metrics
  private messagesSent = new Counter({
    name: 'agent_messages_sent_total',
    help: 'Total messages sent',
    labelNames: ['agent_type', 'target_agent', 'message_type']
  });
  
  private messagesReceived = new Counter({
    name: 'agent_messages_received_total',
    help: 'Total messages received',
    labelNames: ['agent_type', 'source_agent', 'message_type']
  });
  
  private messageLatency = new Histogram({
    name: 'agent_message_latency_seconds',
    help: 'Message handling latency',
    labelNames: ['agent_type', 'message_type'],
    buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
  });
  
  // Status update metrics
  private statusUpdates = new Counter({
    name: 'agent_status_updates_total',
    help: 'Total status updates written to Agent_Output.md',
    labelNames: ['agent_type']
  });
  
  private lastStatusUpdate = new Gauge({
    name: 'agent_last_status_update_timestamp',
    help: 'Timestamp of last status update',
    labelNames: ['agent_type', 'agent_id']
  });
  
  recordTaskReceived(agentType: string, taskType: string): void {
    this.tasksReceived.inc({ agent_type: agentType, task_type: taskType });
  }
  
  recordTaskCompleted(
    agentType: string,
    taskType: string,
    status: 'success' | 'failure',
    duration: number
  ): void {
    this.tasksCompleted.inc({ 
      agent_type: agentType, 
      task_type: taskType, 
      status 
    });
    
    this.taskDuration.observe(
      { agent_type: agentType, task_type: taskType },
      duration
    );
  }
  
  recordTDDPhase(agentType: string, phase: 'red' | 'green' | 'refactor'): void {
    const phaseMap = { red: 1, green: 2, refactor: 3 };
    this.tddPhase.set({ agent_type: agentType }, phaseMap[phase]);
  }
  
  recordTestResults(
    agentType: string,
    passing: number,
    total: number
  ): void {
    this.testResults.set({ agent_type: agentType, result_type: 'passing' }, passing);
    this.testResults.set({ agent_type: agentType, result_type: 'total' }, total);
  }
}
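
For Prometheus to scrape these metrics, each agent exposes them over HTTP. A minimal sketch, assuming the default prom-client registry and the port/path used by the scrape configuration in the next section (the file path is illustrative):

// agents/base/metrics-server.ts (illustrative sketch)
import http from 'http';
import { register, collectDefaultMetrics } from 'prom-client';

collectDefaultMetrics(); // adds process-level CPU, memory, and GC metrics

// Serve the default registry on /metrics; port 9090 matches the
// prometheus.io/port annotation expected by the scrape config below
http.createServer(async (req, res) => {
  if (req.url === '/metrics') {
    res.setHeader('Content-Type', register.contentType);
    res.end(await register.metrics());
  } else {
    res.statusCode = 404;
    res.end();
  }
}).listen(9090);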

System Metrics

# prometheus/prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # Agent metrics
  - job_name: 'autosdlc-agents'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - autosdlc-agents
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - source_labels: [__meta_kubernetes_pod_label_agent_type]
        action: replace
        target_label: agent_type
        
  # MCP Server metrics
  - job_name: 'mcp-servers'
    static_configs:
      - targets: 
          - 'mcp-server-1:9090'
          - 'mcp-server-2:9090'
          - 'mcp-server-3:9090'
          
  # Node exporter for system metrics
  - job_name: 'node-exporter'
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
        
  # Kubernetes metrics
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

Custom Metrics

// services/MetricsCollector.ts
import { Counter, Gauge, Histogram } from 'prom-client';

export class MetricsCollector {
  private customMetrics = new Map<string, Counter | Gauge | Histogram>();
  
  // TDD Compliance Metrics
  registerTDDMetrics(): void {
    this.customMetrics.set('tdd_compliance_rate', new Gauge({
      name: 'autosdlc_tdd_compliance_rate',
      help: 'Percentage of implementations following TDD',
      labelNames: ['project', 'sprint']
    }));
    
    this.customMetrics.set('tests_written_first', new Counter({
      name: 'autosdlc_tests_written_first_total',
      help: 'Number of features with tests written first',
      labelNames: ['agent', 'feature_type']
    }));
    
    this.customMetrics.set('red_phase_violations', new Counter({
      name: 'autosdlc_red_phase_violations_total',
      help: 'Number of times implementation started without all tests red',
      labelNames: ['agent', 'reason']
    }));
  }
  
  // Agent Collaboration Metrics
  registerCollaborationMetrics(): void {
    this.customMetrics.set('agent_handoffs', new Counter({
      name: 'autosdlc_agent_handoffs_total',
      help: 'Number of task handoffs between agents',
      labelNames: ['from_agent', 'to_agent', 'task_type']
    }));
    
    this.customMetrics.set('agent_wait_time', new Histogram({
      name: 'autosdlc_agent_wait_time_seconds',
      help: 'Time agents spend waiting for other agents',
      labelNames: ['waiting_agent', 'blocking_agent'],
      buckets: [1, 5, 10, 30, 60, 300, 600, 1800, 3600]
    }));
  }
  
  // Code Quality Metrics
  registerQualityMetrics(): void {
    this.customMetrics.set('code_complexity', new Gauge({
      name: 'autosdlc_code_complexity',
      help: 'Cyclomatic complexity of generated code',
      labelNames: ['agent', 'module']
    }));
    
    this.customMetrics.set('test_coverage', new Gauge({
      name: 'autosdlc_test_coverage_percent',
      help: 'Test coverage percentage',
      labelNames: ['project', 'module']
    }));
    
    this.customMetrics.set('review_iterations', new Histogram({
      name: 'autosdlc_review_iterations',
      help: 'Number of review iterations before approval',
      labelNames: ['reviewer_agent', 'coder_agent'],
      buckets: [1, 2, 3, 4, 5, 10]
    }));
  }
  
  getMetric(name: string): Counter | Gauge | Histogram | undefined {
    return this.customMetrics.get(name);
  }
}
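
A hypothetical call site for these custom metrics, using the getMetric accessor (label values are illustrative):

// Sketch: recording a tests-written-first event
// (assumes Counter is imported from prom-client)
const collector = new MetricsCollector();
collector.registerTDDMetrics();

const testsFirst = collector.getMetric('tests_written_first') as Counter;
testsFirst.inc({ agent: 'coder', feature_type: 'api_endpoint' });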

Logging Infrastructure

Structured Logging

// utils/Logger.ts
import winston from 'winston';
import { LogstashTransport } from 'winston-logstash-transport';
// Task, LogContext, TDDDetails, AgentCommunication, and ErrorContext are
// assumed to be defined in the shared agent type definitions.

export class StructuredLogger {
  private logger: winston.Logger;
  
  constructor(component: string, agentType?: string) {
    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.errors({ stack: true }),
        winston.format.json()
      ),
      defaultMeta: {
        component,
        agentType,
        environment: process.env.NODE_ENV,
        version: process.env.APP_VERSION
      },
      transports: [
        // Console transport for development
        new winston.transports.Console({
          format: winston.format.combine(
            winston.format.colorize(),
            winston.format.simple()
          )
        }),
        
        // Ship logs to the aggregation pipeline (fronted by Loki) in production
        new LogstashTransport({
          host: process.env.LOKI_HOST || 'loki',
          port: parseInt(process.env.LOKI_PORT || '5000'),
          ssl_enable: true,
          max_connect_retries: -1,
          timeout_connect_retries: 60000
        })
      ]
    });
  }
  
  // Structured log methods
  logTaskReceived(task: Task, context: LogContext): void {
    this.logger.info('Task received', {
      event: 'task_received',
      taskId: task.id,
      taskType: task.type,
      priority: task.priority,
      ...context
    });
  }
  
  logTDDPhaseChange(phase: string, details: TDDDetails): void {
    this.logger.info('TDD phase change', {
      event: 'tdd_phase_change',
      phase,
      testsTotal: details.testsTotal,
      testsPassing: details.testsPassing,
      testsFailing: details.testsFailing,
      coverage: details.coverage,
      timestamp: new Date().toISOString()
    });
  }
  
  logAgentCommunication(comm: AgentCommunication): void {
    this.logger.info('Agent communication', {
      event: 'agent_communication',
      from: comm.from,
      to: comm.to,
      messageType: comm.messageType,
      correlationId: comm.correlationId,
      duration: comm.duration
    });
  }
  
  logError(error: Error, context: ErrorContext): void {
    this.logger.error('Error occurred', {
      event: 'error',
      errorType: error.name,
      errorMessage: error.message,
      errorStack: error.stack,
      ...context,
      timestamp: new Date().toISOString()
    });
  }
}
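
A hypothetical call site, assuming Task and LogContext come from the shared type definitions and that the same correlation ID is reused across agents so logs can be joined with traces and messages:

// Hypothetical call site inside an agent's task handler
const log = new StructuredLogger('coder-agent', 'coder');

log.logTaskReceived(
  { id: 'task-42', type: 'implement_feature', priority: 'high' } as Task,
  { correlationId: 'corr-9f1e' } // reuse this ID in traces and messages too
);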

Log Aggregation

# loki/loki-config.yaml
auth_enabled: false

server:
  http_listen_port: 3100
  log_level: info

ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 5m
  chunk_retain_period: 30s
  max_transfer_retries: 0

schema_config:
  configs:
    - from: 2025-01-01
      store: boltdb
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 168h

storage_config:
  boltdb:
    directory: /loki/index
  filesystem:
    directory: /loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h
  ingestion_rate_mb: 10
  ingestion_burst_size_mb: 20

chunk_store_config:
  max_look_back_period: 0s

table_manager:
  retention_deletes_enabled: true
  retention_period: 2160h # 90 days

Agent Log Patterns

// agents/base/LogPatterns.ts
export const AgentLogPatterns = {
  // Task lifecycle
  TASK_START: {
    pattern: /Task started: (\w+) \(ID: ([\w-]+)\)/,
    fields: ['taskType', 'taskId']
  },
  
  TASK_COMPLETE: {
    pattern: /Task completed: (\w+) \(ID: ([\w-]+)\) - Duration: (\d+)ms/,
    fields: ['taskType', 'taskId', 'duration']
  },
  
  // TDD patterns
  TDD_RED_VERIFIED: {
    pattern: /TDD Red phase verified: (\d+)\/(\d+) tests failing/,
    fields: ['failingTests', 'totalTests']
  },
  
  TDD_GREEN_ACHIEVED: {
    pattern: /TDD Green phase achieved: All (\d+) tests passing/,
    fields: ['totalTests']
  },
  
  TDD_REFACTOR_COMPLETE: {
    pattern: /TDD Refactor complete: (\d+) improvements made/,
    fields: ['improvements']
  },
  
  // Agent status
  STATUS_UPDATE: {
    pattern: /Agent status updated: (\w+) - File: Agent_Output\.md/,
    fields: ['status']
  },
  
  // Communication
  MESSAGE_SENT: {
    pattern: /Message sent to (\w+): Type=(\w+), ID=([\w-]+)/,
    fields: ['targetAgent', 'messageType', 'messageId']
  },
  
  MESSAGE_RECEIVED: {
    pattern: /Message received from (\w+): Type=(\w+), ID=([\w-]+)/,
    fields: ['sourceAgent', 'messageType', 'messageId']
  }
};

// Parsed log entry shape (defined here so the parser is self-contained)
export interface ParsedLog {
  eventType: string;
  fields: Record<string, string>;
  timestamp: string | null;
  raw: string;
}

// Log parser for analysis
export class AgentLogParser {
  parseLogLine(line: string): ParsedLog | null {
    for (const [eventType, config] of Object.entries(AgentLogPatterns)) {
      const match = line.match(config.pattern);
      if (match) {
        const fields: Record<string, string> = {};
        config.fields.forEach((field, index) => {
          fields[field] = match[index + 1];
        });
        
        return {
          eventType,
          fields,
          timestamp: this.extractTimestamp(line),
          raw: line
        };
      }
    }
    return null;
  }
  
  private extractTimestamp(line: string): string | null {
    // Assumes ISO-8601 timestamps (e.g. 2025-06-09T12:00:00Z) in the line
    const match = line.match(/\d{4}-\d{2}-\d{2}T[\d:.]+Z?/);
    return match ? match[0] : null;
  }
}
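
For example, feeding the parser a green-phase line (timestamp and counts are illustrative):

// Sketch: parsing a single agent log line
const parser = new AgentLogParser();
const parsed = parser.parseLogLine(
  '2025-06-09T12:00:00Z TDD Green phase achieved: All 42 tests passing'
);
// parsed => { eventType: 'TDD_GREEN_ACHIEVED', fields: { totalTests: '42' },
//             timestamp: '2025-06-09T12:00:00Z', raw: '...' }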

Distributed Tracing

OpenTelemetry Setup

// tracing/setup.ts
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-grpc';

export function initializeTracing(serviceName: string, agentType?: string) {
  const traceExporter = new OTLPTraceExporter({
    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://tempo:4317',
  });

  const sdk = new NodeSDK({
    resource: new Resource({
      [SemanticResourceAttributes.SERVICE_NAME]: serviceName,
      [SemanticResourceAttributes.SERVICE_VERSION]: process.env.APP_VERSION || '1.0.0',
      'agent.type': agentType,
      'environment': process.env.NODE_ENV
    }),
    traceExporter,
    instrumentations: [
      getNodeAutoInstrumentations({
        '@opentelemetry/instrumentation-fs': {
          enabled: false, // Disable to reduce noise
        },
      }),
    ],
  });

  sdk.start();
  
  return sdk;
}
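
Tracing must be initialized before other modules are imported so the auto-instrumentations can patch them. A sketch of an agent entrypoint (the service name is illustrative):

// Hypothetical agent entrypoint
const sdk = initializeTracing('autosdlc-coder-agent', 'coder');

// Flush buffered spans on shutdown so the final trace is not lost
process.on('SIGTERM', async () => {
  await sdk.shutdown();
  process.exit(0);
});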

Agent Tracing

// agents/base/Tracing.ts
import { trace, context, Span, SpanStatusCode } from '@opentelemetry/api';

export class AgentTracing {
  private tracer;
  private agentType: string;
  
  constructor(agentType: string) {
    this.agentType = agentType;
    this.tracer = trace.getTracer(`autosdlc-${agentType}-agent`);
  }
  
  async traceTask<T>(
    taskType: string,
    taskId: string,
    fn: () => Promise<T>
  ): Promise<T> {
    const span = this.tracer.startSpan(`agent.task.${taskType}`, {
      attributes: {
        'task.id': taskId,
        'task.type': taskType,
        'agent.type': this.agentType
      }
    });
    
    try {
      const result = await context.with(
        trace.setSpan(context.active(), span),
        fn
      );
      
      span.setStatus({ code: SpanStatusCode.OK });
      return result;
    } catch (error) {
      const err = error as Error;
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: err.message
      });
      span.recordException(err);
      throw error;
    } finally {
      span.end();
    }
  }
  
  async traceTDDPhase<T>(
    phase: 'red' | 'green' | 'refactor',
    fn: () => Promise<T>
  ): Promise<T> {
    const span = this.tracer.startSpan(`tdd.phase.${phase}`, {
      attributes: {
        'tdd.phase': phase,
        'agent.type': this.agentType
      }
    });
    
    return context.with(
      trace.setSpan(context.active(), span),
      async () => {
        try {
          const result = await fn();
          span.addEvent(`${phase} phase completed`);
          return result;
        } finally {
          span.end();
        }
      }
    );
  }
  
  traceAgentCommunication(
    targetAgent: string,
    messageType: string,
    correlationId: string
  ): Span {
    const span = this.tracer.startSpan('agent.communication', {
      attributes: {
        'communication.target': targetAgent,
        'communication.type': messageType,
        'communication.correlation_id': correlationId,
        'agent.type': this.agentType
      }
    });
    
    return span;
  }
}
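
Putting the helpers together, a task span can wrap the three TDD phase spans so a full red-green-refactor cycle appears as a single trace (runTests, implement, and refactor are hypothetical functions):

// Sketch: one trace covering a full TDD cycle
const tracing = new AgentTracing('coder');

await tracing.traceTask('implement_feature', 'task-42', async () => {
  await tracing.traceTDDPhase('red', () => runTests());       // expect failures
  await tracing.traceTDDPhase('green', () => implement());    // make tests pass
  await tracing.traceTDDPhase('refactor', () => refactor());  // clean up
});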

Dashboards

System Overview Dashboard

{
  "dashboard": {
    "title": "AutoSDLC System Overview",
    "panels": [
      {
        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
        "title": "Active Agents",
        "targets": [{
          "expr": "sum(up{job='autosdlc-agents'}) by (agent_type)"
        }],
        "type": "graph"
      },
      {
        "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
        "title": "Task Processing Rate",
        "targets": [{
          "expr": "sum(rate(agent_tasks_completed_total[5m])) by (agent_type)"
        }],
        "type": "graph"
      },
      {
        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
        "title": "TDD Phase Distribution",
        "targets": [{
          "expr": "agent_tdd_phase"
        }],
        "type": "heatmap"
      },
      {
        "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
        "title": "Test Coverage",
        "targets": [{
          "expr": "autosdlc_test_coverage_percent"
        }],
        "type": "gauge"
      },
      {
        "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
        "title": "Agent Communication Flow",
        "targets": [{
          "expr": "sum(rate(agent_messages_sent_total[5m])) by (agent_type, target_agent)"
        }],
        "type": "nodeGraph"
      }
    ]
  }
}
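
Dashboards like this can be kept in version control and pushed through Grafana's HTTP API rather than edited by hand. A sketch, assuming a service-account token in GRAFANA_API_TOKEN and Node 18+ for the built-in fetch:

// Sketch: push a dashboard JSON file to Grafana's /api/dashboards/db endpoint
import { readFile } from 'fs/promises';

async function pushDashboard(path: string): Promise<void> {
  const { dashboard } = JSON.parse(await readFile(path, 'utf8'));
  const res = await fetch('http://grafana:3000/api/dashboards/db', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${process.env.GRAFANA_API_TOKEN}`,
    },
    body: JSON.stringify({ dashboard, overwrite: true }),
  });
  if (!res.ok) throw new Error(`Grafana API returned ${res.status}`);
}

pushDashboard('grafana/dashboards/system-overview.json').catch(console.error);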

Agent Performance Dashboard

# grafana/dashboards/agent-performance.yaml
dashboard:
  title: "Agent Performance Metrics"
  
  variables:
    - name: agent_type
      type: query
      query: "label_values(agent_type)"
      
    - name: time_range
      type: interval
      options: ["5m", "15m", "1h", "6h", "24h", "7d"]
      
  rows:
    - title: "Task Metrics"
      panels:
        - title: "Task Completion Rate"
          query: |
            sum(rate(agent_tasks_completed_total{agent_type="$agent_type"}[5m])) 
            by (task_type, status)
            
        - title: "Task Duration (p95)"
          query: |
            histogram_quantile(0.95, 
              rate(agent_task_duration_seconds_bucket{agent_type="$agent_type"}[5m])
            ) by (task_type)
            
        - title: "Task Queue Depth"
          query: |
            agent_task_queue_size{agent_type="$agent_type"}
            
    - title: "TDD Metrics"
      panels:
        - title: "TDD Compliance"
          query: |
            autosdlc_tdd_compliance_rate{agent_type="$agent_type"}
            
        - title: "Test Results"
          query: |
            agent_test_results{agent_type="$agent_type"}
            
        - title: "Red Phase Violations"
          query: |
            sum(rate(autosdlc_red_phase_violations_total{agent="$agent_type"}[5m]))
            
    - title: "Resource Usage"
      panels:
        - title: "CPU Usage"
          query: |
            avg(agent_cpu_usage_percent{agent_type="$agent_type"}) by (agent_id)
            
        - title: "Memory Usage"
          query: |
            avg(agent_memory_usage_bytes{agent_type="$agent_type"}) by (agent_id)
            
        - title: "Status Update Frequency"
          query: |
            rate(agent_status_updates_total{agent_type="$agent_type"}[5m])

Alerting

Alert Rules

# prometheus/alerts.yml
groups:
  - name: agent_alerts
    interval: 30s
    rules:
      # Agent down
      - alert: AgentDown
        expr: up{job="autosdlc-agents"} == 0
        for: 2m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Agent {{ $labels.agent_type }} is down"
          description: "Agent {{ $labels.instance }} has been down for more than 2 minutes"
          
      # High task failure rate
      - alert: HighTaskFailureRate
        expr: |
          sum(rate(agent_tasks_completed_total{status="failure"}[5m])) by (agent_type)
          /
          sum(rate(agent_tasks_completed_total[5m])) by (agent_type)
          > 0.1
        for: 5m
        labels:
          severity: warning
          team: development
        annotations:
          summary: "High task failure rate for {{ $labels.agent_type }}"
          description: "Task failure rate is {{ $value | humanizePercentage }} for {{ $labels.agent_type }}"
          
      # TDD violations
      - alert: TDDViolation
        expr: rate(autosdlc_red_phase_violations_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
          team: quality
        annotations:
          summary: "TDD violation detected"
          description: "Agent {{ $labels.agent }} attempted to implement without all tests red"
          
      # Agent not updating status
      - alert: AgentStatusStale
        expr: |
          time() - agent_last_status_update_timestamp > 300
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Agent {{ $labels.agent_type }} status is stale"
          description: "Agent has not updated Agent_Output.md for more than 5 minutes"
          
      # Low test coverage
      - alert: LowTestCoverage
        expr: autosdlc_test_coverage_percent < 80
        for: 10m
        labels:
          severity: warning
          team: quality
        annotations:
          summary: "Test coverage below threshold"
          description: "Test coverage for {{ $labels.project }}/{{ $labels.module }} is {{ $value }}%"
          
      # High memory usage
      - alert: AgentHighMemoryUsage
        expr: |
          agent_memory_usage_bytes / (1024 * 1024 * 1024) > 14
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High memory usage for {{ $labels.agent_type }}"
          description: "Agent {{ $labels.agent_id }} is using {{ $value | humanize }}GB of memory"

Alertmanager Configuration

# alertmanager/config.yml
global:
  resolve_timeout: 5m
  
route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default'
  
  routes:
    - match:
        severity: critical
      receiver: pagerduty
      continue: true
      
    - match:
        severity: warning
      receiver: slack
      
    - match:
        team: quality
      receiver: quality-team
      
receivers:
  - name: 'default'
    webhook_configs:
      - url: 'http://autosdlc-webhook:9093/alerts'
        
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: '<pagerduty-service-key>'
        
  - name: 'slack'
    slack_configs:
      - api_url: '<slack-webhook-url>'
        channel: '#autosdlc-alerts'
        title: 'AutoSDLC Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}'
        
  - name: 'quality-team'
    email_configs:
      - to: '[email protected]'
        from: '[email protected]'
        smarthost: 'smtp.autosdlc.com:587'
        auth_username: '[email protected]'
        auth_password: '<password>'
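
The 'default' receiver above posts to http://autosdlc-webhook:9093/alerts. A minimal sketch of such a receiver, assuming Alertmanager's standard webhook payload (a JSON body with status, labels, and annotations per alert):

// Sketch: minimal Alertmanager webhook receiver
import http from 'http';

http.createServer((req, res) => {
  if (req.method === 'POST' && req.url === '/alerts') {
    let body = '';
    req.on('data', (chunk) => (body += chunk));
    req.on('end', () => {
      const payload = JSON.parse(body);
      for (const alert of payload.alerts) {
        // e.g. notify the Orchestrator or open a GitHub issue here
        console.log(`[${alert.status}] ${alert.labels.alertname}: ${alert.annotations.summary}`);
      }
      res.statusCode = 200;
      res.end('ok');
    });
  } else {
    res.statusCode = 404;
    res.end();
  }
}).listen(9093);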

Log Analysis

Log Queries

# Loki queries for common scenarios

# Find all TDD violations
{component="agent"} |= "red phase violation" | json

# Track agent communication patterns
{component="agent"} |~ "Message (sent|received)" | json | line_format "{{.timestamp}} {{.from}} -> {{.to}}: {{.messageType}}"

# Monitor test execution
{component="agent", agent_type="coder"} |= "test" | json | line_format "{{.timestamp}} Tests: {{.testsPassing}}/{{.testsTotal}}"

# Error analysis
{component="agent"} |= "error" | json | line_format "{{.timestamp}} [{{.agent_type}}] {{.errorMessage}}"

# Agent status updates
{component="agent"} |= "Agent_Output.md" | json | line_format "{{.timestamp}} [{{.agent_type}}] Status: {{.status}}"

# Performance issues
{component="agent"} | json | duration > 5s

Log Aggregation Rules

# loki/rules.yml
groups:
  - name: agent_logs
    interval: 1m
    rules:
      - record: agent:errors:rate5m
        expr: |
          sum(rate({component="agent"} |= "error" [5m])) by (agent_type)
          
      - record: agent:tdd_violations:rate5m
        expr: |
          sum(rate({component="agent"} |= "red phase violation" [5m])) by (agent_type)
          
      - record: agent:communication:rate5m
        expr: |
          sum(rate({component="agent"} |~ "Message (sent|received)" [5m])) by (agent_type)

Performance Monitoring

SLI/SLO Definition

# slo/definitions.yml
slos:
  - name: "Agent Task Success Rate"
    sli:
      query: |
        sum(rate(agent_tasks_completed_total{status="success"}[5m]))
        /
        sum(rate(agent_tasks_completed_total[5m]))
    target: 0.99
    window: 30d
    
  - name: "TDD Compliance"
    sli:
      query: |
        avg(autosdlc_tdd_compliance_rate)
    target: 1.0  # 100% compliance required
    window: 7d
    
  - name: "Agent Availability"
    sli:
      query: |
        avg(up{job="autosdlc-agents"})
    target: 0.999
    window: 30d
    
  - name: "Task Processing Latency"
    sli:
      query: |
        histogram_quantile(0.95, agent_task_duration_seconds_bucket) < 300
    target: 0.95
    window: 7d
    
  - name: "Test Coverage"
    sli:
      query: |
        min(autosdlc_test_coverage_percent) >= 80
    target: 1.0
    window: 7d
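
The targets translate directly into error budgets. For the 99% task-success SLO, a back-of-the-envelope check (the measured rate is illustrative):

// Sketch: error-budget arithmetic for the task-success SLO
const target = 0.99;                // SLO target from the definition above
const measuredSuccessRate = 0.994;  // illustrative value from the SLI query

const errorBudget = 1 - target;                                 // 1% may fail
const budgetConsumed = (1 - measuredSuccessRate) / errorBudget; // 0.006 / 0.01
console.log(`Error budget consumed: ${(budgetConsumed * 100).toFixed(1)}%`); // 60.0%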

Performance Optimization

// monitoring/PerformanceAnalyzer.ts
// TimeRange, PerformanceReport, AgentMetrics, and Bottleneck are assumed to be
// defined in the monitoring type definitions.
export class PerformanceAnalyzer {
  async analyzeAgentPerformance(
    agentType: string,
    timeRange: TimeRange
  ): Promise<PerformanceReport> {
    const metrics = await this.fetchMetrics(agentType, timeRange);
    
    return {
      summary: {
        avgTaskDuration: this.calculateAverage(metrics.taskDurations),
        p95TaskDuration: this.calculatePercentile(metrics.taskDurations, 95),
        successRate: metrics.successCount / metrics.totalCount,
        throughput: metrics.totalCount / timeRange.duration
      },
      
      bottlenecks: this.identifyBottlenecks(metrics),
      
      recommendations: this.generateRecommendations(metrics),
      
      trends: {
        taskDuration: this.analyzeTrend(metrics.taskDurationHistory),
        errorRate: this.analyzeTrend(metrics.errorRateHistory),
        resourceUsage: this.analyzeTrend(metrics.resourceHistory)
      }
    };
  }
  
  private identifyBottlenecks(metrics: AgentMetrics): Bottleneck[] {
    const bottlenecks: Bottleneck[] = [];
    
    // High wait times
    if (metrics.avgWaitTime > 60) {
      bottlenecks.push({
        type: 'agent_waiting',
        severity: 'high',
        description: `Agent spends avg ${metrics.avgWaitTime}s waiting for other agents`,
        recommendation: 'Consider scaling blocking agents'
      });
    }
    
    // Memory pressure
    if (metrics.memoryUsage > 0.8 * metrics.memoryLimit) {
      bottlenecks.push({
        type: 'memory_pressure',
        severity: 'medium',
        description: `Memory usage at ${(metrics.memoryUsage / metrics.memoryLimit * 100).toFixed(1)}% of limit`,
        recommendation: 'Increase memory limit or optimize memory usage'
      });
    }
    
    // Test execution time
    if (metrics.avgTestDuration > 300) {
      bottlenecks.push({
        type: 'slow_tests',
        severity: 'medium',
        description: `Average test duration ${metrics.avgTestDuration}s exceeds threshold`,
        recommendation: 'Optimize test suite or parallelize test execution'
      });
    }
    
    return bottlenecks;
  }
}

Deployment

Kubernetes Deployment

# k8s/monitoring/monitoring-stack.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: prometheus
  namespace: monitoring
spec:
  serviceName: prometheus
  replicas: 2
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
      - name: prometheus
        image: prom/prometheus:v2.45.0
        args:
          - '--config.file=/etc/prometheus/prometheus.yml'
          - '--storage.tsdb.path=/prometheus'
          - '--storage.tsdb.retention.time=30d'
          - '--web.enable-lifecycle'
        ports:
        - containerPort: 9090
        volumeMounts:
        - name: config
          mountPath: /etc/prometheus
        - name: storage
          mountPath: /prometheus
        resources:
          requests:
            memory: 2Gi
            cpu: 1
          limits:
            memory: 4Gi
            cpu: 2
      volumes:
      - name: config
        configMap:
          name: prometheus-config
  volumeClaimTemplates:
  - metadata:
      name: storage
    spec:
      accessModes: ["ReadWriteOnce"]
      resources:
        requests:
          storage: 100Gi

---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: loki
  namespace: monitoring
spec:
  serviceName: loki
  replicas: 3
  selector:
    matchLabels:
      app: loki
  template:
    metadata:
      labels:
        app: loki
    spec:
      containers:
      - name: loki
        image: grafana/loki:2.9.0
        args:
          - '-config.file=/etc/loki/loki.yaml'
        ports:
        - containerPort: 3100
        volumeMounts:
        - name: config
          mountPath: /etc/loki
        - name: storage
          mountPath: /loki
        resources:
          requests:
            memory: 2Gi
            cpu: 1
          limits:
            memory: 4Gi
            cpu: 2
      volumes:
      - name: config
        configMap:
          name: loki-config
  volumeClaimTemplates:
  - metadata:
      name: storage
    spec:
      accessModes: ["ReadWriteOnce"]
      resources:
        requests:
          storage: 50Gi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
spec:
  replicas: 2
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
      - name: grafana
        image: grafana/grafana:10.2.0
        ports:
        - containerPort: 3000
        env:
        - name: GF_SECURITY_ADMIN_PASSWORD
          valueFrom:
            secretKeyRef:
              name: grafana-secrets
              key: admin-password
        volumeMounts:
        - name: datasources
          mountPath: /etc/grafana/provisioning/datasources
        - name: dashboards
          mountPath: /etc/grafana/provisioning/dashboards
        - name: dashboard-files
          mountPath: /var/lib/grafana/dashboards
        resources:
          requests:
            memory: 512Mi
            cpu: 500m
          limits:
            memory: 1Gi
            cpu: 1
      volumes:
      - name: datasources
        configMap:
          name: grafana-datasources
      - name: dashboards
        configMap:
          name: grafana-dashboard-providers
      - name: dashboard-files
        configMap:
          name: grafana-dashboards

Best Practices

1. Metrics Design

  • Use consistent naming conventions
  • Include relevant labels
  • Avoid high cardinality
  • Document metric purposes

2. Logging Standards

  • Use structured logging
  • Include correlation IDs
  • Log at appropriate levels
  • Avoid logging sensitive data

3. Tracing Strategy

  • Trace critical paths
  • Include business context
  • Set sampling appropriately
  • Link traces to logs

4. Dashboard Design

  • Focus on actionable metrics
  • Use consistent layouts
  • Include drill-down capability
  • Provide context and thresholds

5. Alert Management

  • Define clear SLOs
  • Avoid alert fatigue
  • Include runbooks
  • Test alert paths

Troubleshooting

High Memory Usage

# Check Prometheus memory usage
kubectl top pod -n monitoring | grep prometheus

# Analyze cardinality (number of distinct metric names)
curl -s http://prometheus:9090/api/v1/label/__name__/values | jq '. | length'

# Check the metrics with the most series
curl -s -G http://prometheus:9090/api/v1/query \
  --data-urlencode 'query=topk(10, count by (__name__)({__name__=~".+"}))' | jq

Missing Metrics

# Check agent metrics endpoint
kubectl exec -it <agent-pod> -n autosdlc-agents -- curl localhost:9090/metrics

# Verify Prometheus scraping
curl -s http://prometheus:9090/api/v1/targets | jq '.data.activeTargets[] | select(.labels.job=="autosdlc-agents")'

Log Ingestion Issues

# Check Loki ingestion rate
curl -s http://loki:3100/metrics | grep -E "loki_ingester_chunks_created_total|loki_ingester_streams_created_total"

# Verify log format
kubectl logs <agent-pod> -n autosdlc-agents | jq -r 'select(.level=="error")'

Tags: #AutoSDLC #Monitoring #Logging #Observability #Metrics

Last Updated: 2025-06-09 | Next: Security Guidelines →
