Monitoring Setup
#AutoSDLC #Monitoring #Logging #Observability
← Back to Index | ← Deployment Guide
The Monitoring & Logging Setup provides comprehensive observability for the AutoSDLC system, including agent performance tracking, system health monitoring, distributed tracing, and centralized logging. This guide covers the implementation of monitoring infrastructure for both development and production environments.
graph TB
subgraph "Data Sources"
AG[Agents]
API[API Servers]
MCP[MCP Servers]
DB[Databases]
K8S[Kubernetes]
end
subgraph "Collection Layer"
PROM[Prometheus]
LOKI[Loki]
TEMPO[Tempo]
OT[OpenTelemetry]
end
subgraph "Storage"
TSDB[(Time Series DB)]
LOGS[(Log Storage)]
TRACES[(Trace Storage)]
end
subgraph "Visualization"
GRAF[Grafana]
ALERT[Alertmanager]
DASH[Dashboards]
end
AG & API & MCP & DB & K8S --> OT
OT --> PROM & LOKI & TEMPO
PROM --> TSDB
LOKI --> LOGS
TEMPO --> TRACES
TSDB & LOGS & TRACES --> GRAF
PROM --> ALERT
GRAF --> DASH
monitoring_stack:
metrics:
prometheus:
version: "2.45.0"
retention: "30d"
scrape_interval: "15s"
logging:
loki:
version: "2.9.0"
retention: "90d"
ingestion_rate: "10MB/s"
tracing:
tempo:
version: "2.3.0"
retention: "7d"
visualization:
grafana:
version: "10.2.0"
dashboards:
- system-overview
- agent-performance
- api-metrics
- error-analysis
alerting:
alertmanager:
version: "0.26.0"
receivers:
- pagerduty
- slack
- email
// agents/base/metrics.ts
import { register, Counter, Histogram, Gauge } from 'prom-client';
export class AgentMetrics {
// Task metrics
private tasksReceived = new Counter({
name: 'agent_tasks_received_total',
help: 'Total number of tasks received',
labelNames: ['agent_type', 'task_type']
});
private tasksCompleted = new Counter({
name: 'agent_tasks_completed_total',
help: 'Total number of tasks completed',
labelNames: ['agent_type', 'task_type', 'status']
});
private taskDuration = new Histogram({
name: 'agent_task_duration_seconds',
help: 'Task execution duration in seconds',
labelNames: ['agent_type', 'task_type'],
buckets: [0.1, 0.5, 1, 5, 10, 30, 60, 300, 600]
});
// TDD metrics
private testsRun = new Counter({
name: 'agent_tests_run_total',
help: 'Total number of test runs',
labelNames: ['agent_type', 'phase']
});
private testResults = new Gauge({
name: 'agent_test_results',
help: 'Test results (passing/total)',
labelNames: ['agent_type', 'result_type']
});
private tddPhase = new Gauge({
name: 'agent_tdd_phase',
help: 'Current TDD phase (1=red, 2=green, 3=refactor)',
labelNames: ['agent_type']
});
// Resource metrics
private cpuUsage = new Gauge({
name: 'agent_cpu_usage_percent',
help: 'CPU usage percentage',
labelNames: ['agent_type', 'agent_id']
});
private memoryUsage = new Gauge({
name: 'agent_memory_usage_bytes',
help: 'Memory usage in bytes',
labelNames: ['agent_type', 'agent_id']
});
// Communication metrics
private messagesSent = new Counter({
name: 'agent_messages_sent_total',
help: 'Total messages sent',
labelNames: ['agent_type', 'target_agent', 'message_type']
});
private messagesReceived = new Counter({
name: 'agent_messages_received_total',
help: 'Total messages received',
labelNames: ['agent_type', 'source_agent', 'message_type']
});
private messageLatency = new Histogram({
name: 'agent_message_latency_seconds',
help: 'Message handling latency',
labelNames: ['agent_type', 'message_type'],
buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
});
// Status update metrics
private statusUpdates = new Counter({
name: 'agent_status_updates_total',
help: 'Total status updates written to Agent_Output.md',
labelNames: ['agent_type']
});
private lastStatusUpdate = new Gauge({
name: 'agent_last_status_update_timestamp',
help: 'Timestamp of last status update',
labelNames: ['agent_type', 'agent_id']
});
recordTaskReceived(agentType: string, taskType: string): void {
this.tasksReceived.inc({ agent_type: agentType, task_type: taskType });
}
recordTaskCompleted(
agentType: string,
taskType: string,
status: 'success' | 'failure',
duration: number
): void {
this.tasksCompleted.inc({
agent_type: agentType,
task_type: taskType,
status
});
this.taskDuration.observe(
{ agent_type: agentType, task_type: taskType },
duration
);
}
recordTDDPhase(agentType: string, phase: 'red' | 'green' | 'refactor'): void {
const phaseMap = { red: 1, green: 2, refactor: 3 };
this.tddPhase.set({ agent_type: agentType }, phaseMap[phase]);
}
recordTestResults(
agentType: string,
passing: number,
total: number
): void {
this.testResults.set({ agent_type: agentType, result_type: 'passing' }, passing);
this.testResults.set({ agent_type: agentType, result_type: 'total' }, total);
}
}
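These metrics only become useful once Prometheus can scrape them. A minimal sketch of the exposition endpoint, assuming prom-client's default registry and an Express HTTP server (neither is mandated by AutoSDLC), might look like this:

// example (hypothetical): exposing agent metrics for Prometheus to scrape
import express from 'express';
import { register } from 'prom-client';

const app = express();

app.get('/metrics', async (_req, res) => {
  res.set('Content-Type', register.contentType);
  res.end(await register.metrics());
});

// Port 9090 matches the port used by the scrape configuration below
app.listen(9090, () => console.log('Metrics endpoint listening on :9090'));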
# prometheus/prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
# Agent metrics
- job_name: 'autosdlc-agents'
kubernetes_sd_configs:
- role: pod
namespaces:
names:
- autosdlc-agents
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- source_labels: [__meta_kubernetes_pod_label_agent_type]
action: replace
target_label: agent_type
# MCP Server metrics
- job_name: 'mcp-servers'
static_configs:
- targets:
- 'mcp-server-1:9090'
- 'mcp-server-2:9090'
- 'mcp-server-3:9090'
# Node exporter for system metrics
- job_name: 'node-exporter'
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
# Kubernetes metrics
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
// services/MetricsCollector.ts
import { Counter, Gauge, Histogram } from 'prom-client';
export class MetricsCollector {
private customMetrics = new Map<string, any>();
// TDD Compliance Metrics
registerTDDMetrics(): void {
this.customMetrics.set('tdd_compliance_rate', new Gauge({
name: 'autosdlc_tdd_compliance_rate',
help: 'Percentage of implementations following TDD',
labelNames: ['project', 'sprint']
}));
this.customMetrics.set('tests_written_first', new Counter({
name: 'autosdlc_tests_written_first_total',
help: 'Number of features with tests written first',
labelNames: ['agent', 'feature_type']
}));
this.customMetrics.set('red_phase_violations', new Counter({
name: 'autosdlc_red_phase_violations_total',
help: 'Number of times implementation started without all tests red',
labelNames: ['agent', 'reason']
}));
}
// Agent Collaboration Metrics
registerCollaborationMetrics(): void {
this.customMetrics.set('agent_handoffs', new Counter({
name: 'autosdlc_agent_handoffs_total',
help: 'Number of task handoffs between agents',
labelNames: ['from_agent', 'to_agent', 'task_type']
}));
this.customMetrics.set('agent_wait_time', new Histogram({
name: 'autosdlc_agent_wait_time_seconds',
help: 'Time agents spend waiting for other agents',
labelNames: ['waiting_agent', 'blocking_agent'],
buckets: [1, 5, 10, 30, 60, 300, 600, 1800, 3600]
}));
}
// Code Quality Metrics
registerQualityMetrics(): void {
this.customMetrics.set('code_complexity', new Gauge({
name: 'autosdlc_code_complexity',
help: 'Cyclomatic complexity of generated code',
labelNames: ['agent', 'module']
}));
this.customMetrics.set('test_coverage', new Gauge({
name: 'autosdlc_test_coverage_percent',
help: 'Test coverage percentage',
labelNames: ['project', 'module']
}));
this.customMetrics.set('review_iterations', new Histogram({
name: 'autosdlc_review_iterations',
help: 'Number of review iterations before approval',
labelNames: ['reviewer_agent', 'coder_agent'],
buckets: [1, 2, 3, 4, 5, 10]
}));
}
}
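As a rough usage sketch, the collector is typically wired up once at service startup; note that the class above does not yet expose an accessor for updating individual metrics:

// example (hypothetical): registering the custom metric families at startup
const collector = new MetricsCollector();
collector.registerTDDMetrics();
collector.registerCollaborationMetrics();
collector.registerQualityMetrics();
// Updating a specific metric would require an accessor on MetricsCollector
// (e.g. a hypothetical getMetric('test_coverage')), which is not part of the class above.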
// utils/Logger.ts
import winston from 'winston';
import { LogstashTransport } from 'winston-logstash-transport';
export class StructuredLogger {
private logger: winston.Logger;
constructor(component: string, agentType?: string) {
this.logger = winston.createLogger({
level: process.env.LOG_LEVEL || 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }),
winston.format.json()
),
defaultMeta: {
component,
agentType,
environment: process.env.NODE_ENV,
version: process.env.APP_VERSION
},
transports: [
// Console transport for development
new winston.transports.Console({
format: winston.format.combine(
winston.format.colorize(),
winston.format.simple()
)
}),
// Logstash-compatible transport for shipping production logs into the Loki pipeline
new LogstashTransport({
host: process.env.LOKI_HOST || 'loki',
port: parseInt(process.env.LOKI_PORT || '5000'),
ssl_enable: true,
max_connect_retries: -1,
timeout_connect_retries: 60000
})
]
});
}
// Structured log methods
logTaskReceived(task: Task, context: LogContext): void {
this.logger.info('Task received', {
event: 'task_received',
taskId: task.id,
taskType: task.type,
priority: task.priority,
...context
});
}
logTDDPhaseChange(phase: string, details: TDDDetails): void {
this.logger.info('TDD phase change', {
event: 'tdd_phase_change',
phase,
testsTotal: details.testsTotal,
testsPassing: details.testsPassing,
testsFailing: details.testsFailing,
coverage: details.coverage,
timestamp: new Date().toISOString()
});
}
logAgentCommunication(comm: AgentCommunication): void {
this.logger.info('Agent communication', {
event: 'agent_communication',
from: comm.from,
to: comm.to,
messageType: comm.messageType,
correlationId: comm.correlationId,
duration: comm.duration
});
}
logError(error: Error, context: ErrorContext): void {
this.logger.error('Error occurred', {
event: 'error',
errorType: error.name,
errorMessage: error.message,
errorStack: error.stack,
...context,
timestamp: new Date().toISOString()
});
}
}
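A short usage sketch follows; the Task, LogContext, and TDDDetails shapes below are assumptions for illustration, not definitions from this guide:

// example (hypothetical): logging a task with a correlation ID so logs can be joined with traces
const logger = new StructuredLogger('coder-agent', 'coder');

logger.logTaskReceived(
  { id: 'task-123', type: 'implement_feature', priority: 'high' } as Task,
  { correlationId: 'corr-456', workflowId: 'wf-789' } as LogContext
);

logger.logTDDPhaseChange('red', {
  testsTotal: 12,
  testsPassing: 0,
  testsFailing: 12,
  coverage: 0
} as TDDDetails);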
# loki/loki-config.yaml
auth_enabled: false
server:
http_listen_port: 3100
log_level: info
ingester:
lifecycler:
address: 127.0.0.1
ring:
kvstore:
store: inmemory
replication_factor: 1
final_sleep: 0s
chunk_idle_period: 5m
chunk_retain_period: 30s
max_transfer_retries: 0
schema_config:
configs:
- from: 2025-01-01
store: boltdb
object_store: filesystem
schema: v11
index:
prefix: index_
period: 168h
storage_config:
boltdb:
directory: /loki/index
filesystem:
directory: /loki/chunks
limits_config:
enforce_metric_name: false
reject_old_samples: true
reject_old_samples_max_age: 168h
ingestion_rate_mb: 10
ingestion_burst_size_mb: 20
chunk_store_config:
max_look_back_period: 0s
table_manager:
retention_deletes_enabled: true
retention_period: 2160h # 90 days
// agents/base/LogPatterns.ts
export const AgentLogPatterns = {
// Task lifecycle
TASK_START: {
pattern: /Task started: (\w+) \(ID: ([\w-]+)\)/,
fields: ['taskType', 'taskId']
},
TASK_COMPLETE: {
pattern: /Task completed: (\w+) \(ID: ([\w-]+)\) - Duration: (\d+)ms/,
fields: ['taskType', 'taskId', 'duration']
},
// TDD patterns
TDD_RED_VERIFIED: {
pattern: /TDD Red phase verified: (\d+)\/(\d+) tests failing/,
fields: ['failingTests', 'totalTests']
},
TDD_GREEN_ACHIEVED: {
pattern: /TDD Green phase achieved: All (\d+) tests passing/,
fields: ['totalTests']
},
TDD_REFACTOR_COMPLETE: {
pattern: /TDD Refactor complete: (\d+) improvements made/,
fields: ['improvements']
},
// Agent status
STATUS_UPDATE: {
pattern: /Agent status updated: (\w+) - File: Agent_Output\.md/,
fields: ['status']
},
// Communication
MESSAGE_SENT: {
pattern: /Message sent to (\w+): Type=(\w+), ID=([\w-]+)/,
fields: ['targetAgent', 'messageType', 'messageId']
},
MESSAGE_RECEIVED: {
pattern: /Message received from (\w+): Type=(\w+), ID=([\w-]+)/,
fields: ['sourceAgent', 'messageType', 'messageId']
}
};
// Log parser for analysis
interface ParsedLog {
eventType: string;
fields: Record<string, string>;
timestamp: string | null;
raw: string;
}
export class AgentLogParser {
parseLogLine(line: string): ParsedLog | null {
for (const [eventType, config] of Object.entries(AgentLogPatterns)) {
const match = line.match(config.pattern);
if (match) {
const fields: Record<string, string> = {};
config.fields.forEach((field, index) => {
fields[field] = match[index + 1];
});
return {
eventType,
fields,
timestamp: this.extractTimestamp(line),
raw: line
};
}
}
return null;
}
// Assumes ISO-8601 timestamps at the start of structured log lines
private extractTimestamp(line: string): string | null {
const match = line.match(/\d{4}-\d{2}-\d{2}T[\d:.]+Z?/);
return match ? match[0] : null;
}
}
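For example, feeding a raw agent log line through the parser (the timestamp format is assumed to be ISO-8601, matching the extractTimestamp helper above):

// example (hypothetical): extracting a TDD event from a raw log line
const parser = new AgentLogParser();
const parsed = parser.parseLogLine(
  '2025-06-09T12:00:00Z TDD Red phase verified: 12/12 tests failing'
);
// parsed?.eventType === 'TDD_RED_VERIFIED'
// parsed?.fields === { failingTests: '12', totalTests: '12' }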
// tracing/setup.ts
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-grpc';
export function initializeTracing(serviceName: string, agentType?: string) {
const traceExporter = new OTLPTraceExporter({
url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://tempo:4317',
});
const sdk = new NodeSDK({
resource: new Resource({
[SemanticResourceAttributes.SERVICE_NAME]: serviceName,
[SemanticResourceAttributes.SERVICE_VERSION]: process.env.APP_VERSION || '1.0.0',
'agent.type': agentType,
'environment': process.env.NODE_ENV
}),
traceExporter,
instrumentations: [
getNodeAutoInstrumentations({
'@opentelemetry/instrumentation-fs': {
enabled: false, // Disable to reduce noise
},
}),
],
});
sdk.start();
return sdk;
}
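Tracing has to be initialized before the agent starts handling work; a minimal startup sketch (the service name and signal handling below are illustrative assumptions):

// example (hypothetical): initializing tracing at agent startup
const sdk = initializeTracing('autosdlc-coder-agent', 'coder');

// Flush pending spans before the pod is terminated
process.on('SIGTERM', async () => {
  await sdk.shutdown();
  process.exit(0);
});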
// agents/base/Tracing.ts
import { trace, context, SpanStatusCode, Span } from '@opentelemetry/api';
export class AgentTracing {
private tracer;
private agentType: string;
constructor(agentType: string) {
this.agentType = agentType;
this.tracer = trace.getTracer(`autosdlc-${agentType}-agent`);
}
async traceTask<T>(
taskType: string,
taskId: string,
fn: () => Promise<T>
): Promise<T> {
const span = this.tracer.startSpan(`agent.task.${taskType}`, {
attributes: {
'task.id': taskId,
'task.type': taskType,
'agent.type': this.agentType
}
});
try {
const result = await context.with(
trace.setSpan(context.active(), span),
fn
);
span.setStatus({ code: SpanStatusCode.OK });
return result;
} catch (error) {
const err = error instanceof Error ? error : new Error(String(error));
span.setStatus({
code: SpanStatusCode.ERROR,
message: err.message
});
span.recordException(err);
throw error;
} finally {
span.end();
}
}
async traceTDDPhase<T>(
phase: 'red' | 'green' | 'refactor',
fn: () => Promise<T>
): Promise<T> {
const span = this.tracer.startSpan(`tdd.phase.${phase}`, {
attributes: {
'tdd.phase': phase,
'agent.type': this.agentType
}
});
return context.with(
trace.setSpan(context.active(), span),
async () => {
try {
const result = await fn();
span.addEvent(`${phase} phase completed`);
return result;
} finally {
span.end();
}
}
);
}
traceAgentCommunication(
targetAgent: string,
messageType: string,
correlationId: string
): Span {
const span = this.tracer.startSpan('agent.communication', {
attributes: {
'communication.target': targetAgent,
'communication.type': messageType,
'communication.correlation_id': correlationId,
'agent.type': this.agentType
}
});
return span;
}
}
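Putting the helpers together, a full TDD cycle can be traced as nested spans; writeFailingTests, implementFeature, and refactorCode below are hypothetical stand-ins for the Coder Agent's actual work:

// example (hypothetical): wrapping a full TDD cycle in nested spans
const tracing = new AgentTracing('coder');

await tracing.traceTask('implement_feature', 'task-123', async () => {
  await tracing.traceTDDPhase('red', () => writeFailingTests());     // hypothetical helpers
  await tracing.traceTDDPhase('green', () => implementFeature());
  await tracing.traceTDDPhase('refactor', () => refactorCode());
});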
{
"dashboard": {
"title": "AutoSDLC System Overview",
"panels": [
{
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
"title": "Active Agents",
"targets": [{
"expr": "sum(up{job='autosdlc-agents'}) by (agent_type)"
}],
"type": "graph"
},
{
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
"title": "Task Processing Rate",
"targets": [{
"expr": "sum(rate(agent_tasks_completed_total[5m])) by (agent_type)"
}],
"type": "graph"
},
{
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
"title": "TDD Phase Distribution",
"targets": [{
"expr": "agent_tdd_phase"
}],
"type": "heatmap"
},
{
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
"title": "Test Coverage",
"targets": [{
"expr": "autosdlc_test_coverage_percent"
}],
"type": "gauge"
},
{
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
"title": "Agent Communication Flow",
"targets": [{
"expr": "sum(rate(agent_messages_sent_total[5m])) by (agent_type, target_agent)"
}],
"type": "nodeGraph"
}
]
}
}
# grafana/dashboards/agent-performance.yaml
dashboard:
title: "Agent Performance Metrics"
variables:
- name: agent_type
type: query
query: "label_values(agent_type)"
- name: time_range
type: interval
options: ["5m", "15m", "1h", "6h", "24h", "7d"]
rows:
- title: "Task Metrics"
panels:
- title: "Task Completion Rate"
query: |
sum(rate(agent_tasks_completed_total{agent_type="$agent_type"}[5m]))
by (task_type, status)
- title: "Task Duration (p95)"
query: |
histogram_quantile(0.95,
rate(agent_task_duration_seconds_bucket{agent_type="$agent_type"}[5m])
) by (task_type)
- title: "Task Queue Depth"
query: |
agent_task_queue_size{agent_type="$agent_type"}
- title: "TDD Metrics"
panels:
- title: "TDD Compliance"
query: |
autosdlc_tdd_compliance_rate{agent_type="$agent_type"}
- title: "Test Results"
query: |
agent_test_results{agent_type="$agent_type"}
- title: "Red Phase Violations"
query: |
sum(rate(autosdlc_red_phase_violations_total{agent="$agent_type"}[5m]))
- title: "Resource Usage"
panels:
- title: "CPU Usage"
query: |
avg(agent_cpu_usage_percent{agent_type="$agent_type"}) by (agent_id)
- title: "Memory Usage"
query: |
avg(agent_memory_usage_bytes{agent_type="$agent_type"}) by (agent_id)
- title: "Status Update Frequency"
query: |
rate(agent_status_updates_total{agent_type="$agent_type"}[5m])
# prometheus/alerts.yml
groups:
- name: agent_alerts
interval: 30s
rules:
# Agent down
- alert: AgentDown
expr: up{job="autosdlc-agents"} == 0
for: 2m
labels:
severity: critical
team: platform
annotations:
summary: "Agent {{ $labels.agent_type }} is down"
description: "Agent {{ $labels.instance }} has been down for more than 2 minutes"
# High task failure rate
- alert: HighTaskFailureRate
expr: |
sum(rate(agent_tasks_completed_total{status="failure"}[5m])) by (agent_type)
/
sum(rate(agent_tasks_completed_total[5m])) by (agent_type)
> 0.1
for: 5m
labels:
severity: warning
team: development
annotations:
summary: "High task failure rate for {{ $labels.agent_type }}"
description: "Task failure rate is {{ $value | humanizePercentage }} for {{ $labels.agent_type }}"
# TDD violations
- alert: TDDViolation
expr: rate(autosdlc_red_phase_violations_total[5m]) > 0
for: 1m
labels:
severity: warning
team: quality
annotations:
summary: "TDD violation detected"
description: "Agent {{ $labels.agent }} attempted to implement without all tests red"
# Agent not updating status
- alert: AgentStatusStale
expr: |
time() - agent_last_status_update_timestamp > 300
for: 5m
labels:
severity: warning
team: platform
annotations:
summary: "Agent {{ $labels.agent_type }} status is stale"
description: "Agent has not updated Agent_Output.md for more than 5 minutes"
# Low test coverage
- alert: LowTestCoverage
expr: autosdlc_test_coverage_percent < 80
for: 10m
labels:
severity: warning
team: quality
annotations:
summary: "Test coverage below threshold"
description: "Test coverage for {{ $labels.project }}/{{ $labels.module }} is {{ $value }}%"
# High memory usage
- alert: AgentHighMemoryUsage
expr: |
agent_memory_usage_bytes / (1024 * 1024 * 1024) > 14
for: 5m
labels:
severity: warning
team: platform
annotations:
summary: "High memory usage for {{ $labels.agent_type }}"
description: "Agent {{ $labels.agent_id }} is using {{ $value | humanize }}GB of memory"
# alertmanager/config.yml
global:
resolve_timeout: 5m
route:
group_by: ['alertname', 'cluster', 'service']
group_wait: 10s
group_interval: 10s
repeat_interval: 12h
receiver: 'default'
routes:
- match:
severity: critical
receiver: pagerduty
continue: true
- match:
severity: warning
receiver: slack
- match:
team: quality
receiver: quality-team
receivers:
- name: 'default'
webhook_configs:
- url: 'http://autosdlc-webhook:9093/alerts'
- name: 'pagerduty'
pagerduty_configs:
- service_key: '<pagerduty-service-key>'
- name: 'slack'
slack_configs:
- api_url: '<slack-webhook-url>'
channel: '#autosdlc-alerts'
title: 'AutoSDLC Alert'
text: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}"
- name: 'quality-team'
email_configs:
- to: '[email protected]'
from: '[email protected]'
smarthost: 'smtp.autosdlc.com:587'
auth_username: '[email protected]'
auth_password: '<password>'
# Loki queries for common scenarios
# Find all TDD violations
{component="agent"} |= "red phase violation" | json
# Track agent communication patterns
{component="agent"} |~ "Message (sent|received)" | json | line_format "{{.timestamp}} {{.from}} -> {{.to}}: {{.messageType}}"
# Monitor test execution
{component="agent", agent_type="coder"} |= "test" | json | line_format "{{.timestamp}} Tests: {{.testsPassing}}/{{.testsTotal}}"
# Error analysis
{component="agent"} |= "error" | json | line_format "{{.timestamp}} [{{.agent_type}}] {{.errorMessage}}"
# Agent status updates
{component="agent"} |= "Agent_Output.md" | json | line_format "{{.timestamp}} [{{.agent_type}}] Status: {{.status}}"
# Performance issues
{component="agent"} | json | duration > 5s
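These queries can also be run programmatically against Loki's HTTP API; a small sketch (the endpoint and parameters follow Loki's documented /loki/api/v1/query_range API, the helper itself is illustrative):

// example (hypothetical): running a LogQL query against Loki's HTTP API
async function queryLoki(lokiUrl: string, logql: string, lookbackMinutes = 15) {
  const params = new URLSearchParams({
    query: logql,
    start: new Date(Date.now() - lookbackMinutes * 60_000).toISOString(),
    end: new Date().toISOString(),
    limit: '100'
  });
  const res = await fetch(`${lokiUrl}/loki/api/v1/query_range?${params}`);
  return (await res.json()).data.result;
}

// usage: find recent TDD violations
// const streams = await queryLoki('http://loki:3100', '{component="agent"} |= "red phase violation" | json');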
# loki/rules.yml
groups:
- name: agent_logs
interval: 1m
rules:
- record: agent:errors:rate5m
expr: |
sum(rate({component="agent"} |= "error" [5m])) by (agent_type)
- record: agent:tdd_violations:rate5m
expr: |
sum(rate({component="agent"} |= "red phase violation" [5m])) by (agent_type)
- record: agent:communication:rate5m
expr: |
sum(rate({component="agent"} |~ "Message (sent|received)" [5m])) by (agent_type)
# slo/definitions.yml
slos:
- name: "Agent Task Success Rate"
sli:
query: |
sum(rate(agent_tasks_completed_total{status="success"}[5m]))
/
sum(rate(agent_tasks_completed_total[5m]))
target: 0.99
window: 30d
- name: "TDD Compliance"
sli:
query: |
avg(autosdlc_tdd_compliance_rate)
target: 1.0 # 100% compliance required
window: 7d
- name: "Agent Availability"
sli:
query: |
avg(up{job="autosdlc-agents"})
target: 0.999
window: 30d
- name: "Task Processing Latency"
sli:
query: |
histogram_quantile(0.95, sum(rate(agent_task_duration_seconds_bucket[5m])) by (le)) < 300
target: 0.95
window: 7d
- name: "Test Coverage"
sli:
query: |
min(autosdlc_test_coverage_percent) >= 80
target: 1.0
window: 7d
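For intuition, the 99.9% availability target above corresponds to roughly 43 minutes of error budget per 30-day window; a small sketch of that arithmetic:

// example: converting an SLO target into an error budget for a rolling window
function errorBudgetMinutes(target: number, windowDays: number): number {
  const totalMinutes = windowDays * 24 * 60;
  return totalMinutes * (1 - target);
}

errorBudgetMinutes(0.999, 30); // ≈ 43.2 minutes of allowed unavailability per 30 days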
// monitoring/PerformanceAnalyzer.ts
export class PerformanceAnalyzer {
async analyzeAgentPerformance(
agentType: string,
timeRange: TimeRange
): Promise<PerformanceReport> {
const metrics = await this.fetchMetrics(agentType, timeRange);
return {
summary: {
avgTaskDuration: this.calculateAverage(metrics.taskDurations),
p95TaskDuration: this.calculatePercentile(metrics.taskDurations, 95),
successRate: metrics.successCount / metrics.totalCount,
throughput: metrics.totalCount / timeRange.duration
},
bottlenecks: this.identifyBottlenecks(metrics),
recommendations: this.generateRecommendations(metrics),
trends: {
taskDuration: this.analyzeTrend(metrics.taskDurationHistory),
errorRate: this.analyzeTrend(metrics.errorRateHistory),
resourceUsage: this.analyzeTrend(metrics.resourceHistory)
}
};
}
private identifyBottlenecks(metrics: AgentMetrics): Bottleneck[] {
const bottlenecks: Bottleneck[] = [];
// High wait times
if (metrics.avgWaitTime > 60) {
bottlenecks.push({
type: 'agent_waiting',
severity: 'high',
description: `Agent spends avg ${metrics.avgWaitTime}s waiting for other agents`,
recommendation: 'Consider scaling blocking agents'
});
}
// Memory pressure
if (metrics.memoryUsage > 0.8 * metrics.memoryLimit) {
bottlenecks.push({
type: 'memory_pressure',
severity: 'medium',
description: `Memory usage at ${(metrics.memoryUsage / metrics.memoryLimit * 100).toFixed(1)}% of limit`,
recommendation: 'Increase memory limit or optimize memory usage'
});
}
// Test execution time
if (metrics.avgTestDuration > 300) {
bottlenecks.push({
type: 'slow_tests',
severity: 'medium',
description: `Average test duration ${metrics.avgTestDuration}s exceeds threshold`,
recommendation: 'Optimize test suite or parallelize test execution'
});
}
return bottlenecks;
}
}
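The fetchMetrics() call above is not shown; one plausible implementation queries the Prometheus HTTP API directly. A sketch under that assumption (the helper name and the flattening of the first series are illustrative choices):

// example (hypothetical): pulling raw samples from the Prometheus HTTP API
async function queryPrometheusRange(
  promUrl: string,
  query: string,
  start: Date,
  end: Date,
  stepSeconds = 60
): Promise<number[]> {
  const params = new URLSearchParams({
    query,
    start: (start.getTime() / 1000).toString(),
    end: (end.getTime() / 1000).toString(),
    step: `${stepSeconds}s`
  });
  const res = await fetch(`${promUrl}/api/v1/query_range?${params}`);
  const body = await res.json();
  // Take the first series and keep only the sample values
  return body.data.result[0]?.values.map(([, v]: [number, string]) => parseFloat(v)) ?? [];
}

// usage: p95 task duration for the coder agent over the last hour
// const samples = await queryPrometheusRange(
//   'http://prometheus:9090',
//   'histogram_quantile(0.95, sum(rate(agent_task_duration_seconds_bucket{agent_type="coder"}[5m])) by (le))',
//   new Date(Date.now() - 3_600_000),
//   new Date()
// );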
# k8s/monitoring/monitoring-stack.yaml
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: prometheus
namespace: monitoring
spec:
serviceName: prometheus
replicas: 2
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
serviceAccountName: prometheus
containers:
- name: prometheus
image: prom/prometheus:v2.45.0
args:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=30d'
- '--web.enable-lifecycle'
ports:
- containerPort: 9090
volumeMounts:
- name: config
mountPath: /etc/prometheus
- name: storage
mountPath: /prometheus
resources:
requests:
memory: 2Gi
cpu: 1
limits:
memory: 4Gi
cpu: 2
volumes:
- name: config
configMap:
name: prometheus-config
volumeClaimTemplates:
- metadata:
name: storage
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 100Gi
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: loki
namespace: monitoring
spec:
serviceName: loki
replicas: 3
selector:
matchLabels:
app: loki
template:
metadata:
labels:
app: loki
spec:
containers:
- name: loki
image: grafana/loki:2.9.0
args:
- '-config.file=/etc/loki/loki.yaml'
ports:
- containerPort: 3100
volumeMounts:
- name: config
mountPath: /etc/loki
- name: storage
mountPath: /loki
resources:
requests:
memory: 2Gi
cpu: 1
limits:
memory: 4Gi
cpu: 2
volumes:
- name: config
configMap:
name: loki-config
volumeClaimTemplates:
- metadata:
name: storage
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 50Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
namespace: monitoring
spec:
replicas: 2
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
containers:
- name: grafana
image: grafana/grafana:10.2.0
ports:
- containerPort: 3000
env:
- name: GF_SECURITY_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-secrets
key: admin-password
volumeMounts:
- name: datasources
mountPath: /etc/grafana/provisioning/datasources
- name: dashboards
mountPath: /etc/grafana/provisioning/dashboards
- name: dashboard-files
mountPath: /var/lib/grafana/dashboards
resources:
requests:
memory: 512Mi
cpu: 500m
limits:
memory: 1Gi
cpu: 1
volumes:
- name: datasources
configMap:
name: grafana-datasources
- name: dashboards
configMap:
name: grafana-dashboard-providers
- name: dashboard-files
configMap:
name: grafana-dashboards
**Metrics**
- Use consistent naming conventions
- Include relevant labels
- Avoid high cardinality
- Document metric purposes

**Logging**
- Use structured logging
- Include correlation IDs
- Log at appropriate levels
- Avoid logging sensitive data

**Tracing**
- Trace critical paths
- Include business context
- Set sampling appropriately
- Link traces to logs

**Dashboards**
- Focus on actionable metrics
- Use consistent layouts
- Include drill-down capability
- Provide context and thresholds

**Alerting**
- Define clear SLOs
- Avoid alert fatigue
- Include runbooks
- Test alert paths
# Check Prometheus memory usage
kubectl top pod -n monitoring | grep prometheus
# Analyze cardinality
curl -s http://prometheus:9090/api/v1/label/__name__/values | jq '.data | length'
# Check which metric names contribute the most series
curl -s http://prometheus:9090/api/v1/status/tsdb | jq '.data.seriesCountByMetricName'
# Check agent metrics endpoint
kubectl exec -it <agent-pod> -n autosdlc-agents -- curl localhost:9090/metrics
# Verify Prometheus scraping
curl -s http://prometheus:9090/api/v1/targets | jq '.data.activeTargets[] | select(.labels.job=="autosdlc-agents")'
# Check Loki ingestion rate
curl -s http://loki:3100/metrics | grep -E "loki_ingester_chunks_created_total|loki_ingester_streams_created_total"
# Verify log format
kubectl logs <agent-pod> -n autosdlc-agents | jq -r 'select(.level=="error")'
Tags: #AutoSDLC #Monitoring #Logging #Observability #Metrics
Last Updated: 2025-06-09
Next: Security Guidelines →