Monitoring & Observability - yuzvak/flashsale-service GitHub Wiki
Monitoring & Observability
Overview
The flash sale service implements comprehensive monitoring using Prometheus metrics and Grafana dashboards, following the Four Golden Signals methodology.
Metrics Architecture
Four Golden Signals Implementation
1. Latency
HTTPRequestDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "Duration of HTTP requests in seconds",
Buckets: prometheus.DefBuckets,
},
[]string{"handler", "method", "status_code"},
)
2. Traffic
HTTPRequestsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "http_requests_total",
Help: "Total number of HTTP requests",
},
[]string{"handler", "method", "status_code"},
)
3. Errors
- HTTP error rates by status code (4xx, 5xx)
- Business logic errors (checkout failures, purchase failures)
- Database and Redis operation failures
4. Saturation
- Database connection pool utilization
- Memory usage and goroutine counts
- Redis connection pool metrics
Business Metrics
Flash Sale Specific Metrics
// Sale progress tracking
SaleItemsTotal = promauto.NewGauge(prometheus.GaugeOpts{
Name: "sale_items_total",
Help: "Total number of items in sale",
})
SaleItemsSold = promauto.NewGauge(prometheus.GaugeOpts{
Name: "sale_items_sold",
Help: "Number of items sold in sale",
})
// Checkout flow metrics
CheckoutAttemptsTotal = promauto.NewCounter(prometheus.CounterOpts{
Name: "checkout_attempts_total",
Help: "Total number of checkout attempts",
})
CheckoutSuccessTotal = promauto.NewCounter(prometheus.CounterOpts{
Name: "checkout_success_total",
Help: "Total number of successful checkouts",
})
// Purchase flow metrics
PurchaseAttemptsTotal = promauto.NewCounter(prometheus.CounterOpts{
Name: "purchase_attempts_total",
Help: "Total number of purchase attempts",
})
PurchaseSuccessTotal = promauto.NewCounter(prometheus.CounterOpts{
Name: "purchase_success_total",
Help: "Total number of successful purchases",
})
Database Monitoring
Query Performance
DBQueryDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "db_query_duration_seconds",
Help: "Duration of database queries in seconds",
Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
},
[]string{"query_type", "table"},
)
Connection Pool Monitoring
func (c *DBMetricsCollector) collectMetrics() {
stats := c.db.Stats()
DBConnectionsActive.Set(float64(stats.InUse))
DBConnectionsIdle.Set(float64(stats.Idle))
}
Redis Monitoring
Command Performance
RedisCommandDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "redis_command_duration_seconds",
Help: "Duration of Redis commands in seconds",
Buckets: []float64{0.0001, 0.0005, 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1},
},
[]string{"command"},
)
Distributed Lock Metrics
RedisLockAttemptsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "redis_lock_attempts_total",
Help: "Total number of distributed lock attempts",
},
[]string{"lock_type"},
)
RedisLockDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "redis_lock_duration_seconds",
Help: "Duration of lock hold time in seconds",
Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5},
},
[]string{"lock_type"},
)
Health Monitoring
Health Check Endpoint
type HealthData struct {
ServicesStatus ServicesStatus `json:"services_status"`
Uptime string `json:"uptime"`
Memory MemoryMetrics `json:"memory"`
Goroutines int `json:"goroutines"`
}
type ServicesStatus struct {
App string `json:"app"`
Database string `json:"database"`
Redis string `json:"redis"`
}
Memory Metrics
type MemoryMetrics struct {
Alloc uint64 `json:"alloc"`
TotalAlloc uint64 `json:"total_alloc"`
Sys uint64 `json:"sys"`
NumGC uint32 `json:"num_gc"`
}
Instrumentation Patterns
HTTP Middleware
type HTTPMetricsMiddleware struct {
next http.Handler
}
func (m *HTTPMetricsMiddleware) ServeHTTP(w http.ResponseWriter, r *http.Request) {
start := time.Now()
wrapped := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}
m.next.ServeHTTP(wrapped, r)
duration := time.Since(start).Seconds()
statusCode := strconv.Itoa(wrapped.statusCode)
handlerName := extractHandlerName(r.URL.Path)
HTTPRequestDuration.WithLabelValues(handlerName, r.Method, statusCode).Observe(duration)
HTTPRequestsTotal.WithLabelValues(handlerName, r.Method, statusCode).Inc()
}
Database Instrumentation
func InstrumentQuery(ctx context.Context, db *sql.DB, queryType, table, query string, args ...interface{}) (*sql.Rows, error) {
end := TimeDBQuery(queryType, table)
defer end()
return db.QueryContext(ctx, query, args...)
}
Redis Hook Implementation
type RedisHook struct{}
func (RedisHook) BeforeProcess(ctx context.Context, cmd redis.Cmder) (context.Context, error) {
return context.WithValue(ctx, "start_time", time.Now()), nil
}
func (RedisHook) AfterProcess(ctx context.Context, cmd redis.Cmder) error {
start, ok := ctx.Value("start_time").(time.Time)
if !ok {
return nil
}
duration := time.Since(start).Seconds()
RedisCommandDuration.WithLabelValues(cmd.Name()).Observe(duration)
return nil
}
Grafana Dashboard
Dashboard Structure
- Four Golden Signals: Latency, Traffic, Errors, Saturation
- Flash Sale Specific: Items sold rate, checkout/purchase rates
- Database & Cache: Query performance, connection pools
- System Resources: Memory, CPU, goroutines
Key Panels
- P95/P50 response time trends
- Request rate by handler
- Error rate breakdown
- Sale progress visualization
- Lock contention metrics
Alerting Strategy
Critical Alerts
- Service health check failures
- High error rates (>5%)
- Database connection exhaustion
- Redis connectivity issues
Performance Alerts
- P95 response time >2s
- High lock contention
- Memory usage >80%
- Goroutine count anomalies
Deployment Configuration
Docker Compose Setup
services:
prometheus:
image: prom/prometheus:v2.30.3
volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
grafana:
image: grafana/grafana:8.2.2
volumes:
- ./grafana-dashboards:/etc/grafana/provisioning/dashboards
- ./grafana-datasources:/etc/grafana/provisioning/datasources
ports:
- "3000:3000"
Prometheus Configuration
scrape_configs:
- job_name: "flashsale"
metrics_path: /metrics
scrape_interval: 5s
static_configs:
- targets: ["host.docker.internal:8080"]
labels:
instance: "flashsale-service"
environment: "development"