Monitoring - RumenDamyanov/js-chess GitHub Wiki

Monitoring & Logging

Comprehensive monitoring, logging, and observability setup for the chess showcase application.

Overview

This guide covers monitoring and observability for:

  • Application performance monitoring (APM)
  • Error tracking and alerting
  • User behavior analytics
  • Infrastructure monitoring
  • Log aggregation and analysis
  • Real-time metrics and dashboards
  • Uptime monitoring

Application Performance Monitoring

Performance Metrics Collection

// shared/monitoring/PerformanceTracker.js
/**
 * Collects browser performance metrics (navigation, paint, LCP, CLS, long
 * tasks), custom timers and chess-specific measurements, stores them in
 * memory and forwards each one to `/api/metrics`.
 *
 * Recording is a no-op unless `NODE_ENV` is `production`.
 */
export class PerformanceTracker {
  constructor() {
    this.metrics = new Map();
    this.startTimes = new Map();
    this.observers = [];
    this.isEnabled = process.env.NODE_ENV === 'production';

    this.initializeObservers();
  }

  /**
   * Registers one PerformanceObserver per entry type. Does nothing when the
   * tracker is disabled or when running outside a browser.
   */
  initializeObservers() {
    if (!this.isEnabled || typeof window === 'undefined') return;
    if (!('PerformanceObserver' in window)) return;

    // Navigation timing
    this.observeEntries('navigation', (entry) => this.recordNavigationMetrics(entry));

    // Paint timing
    this.observeEntries('paint', (entry) => this.recordPaintMetrics(entry));

    // Largest Contentful Paint
    this.observeEntries('largest-contentful-paint', (entry) => {
      this.recordMetric('largest_contentful_paint', entry.renderTime || entry.loadTime);
    });

    // Layout Shift: accumulate only shifts not caused by recent user input.
    let clsValue = 0;
    this.observeEntries('layout-shift', (entry) => {
      if (!entry.hadRecentInput) {
        clsValue += entry.value;
        this.recordMetric('cumulative_layout_shift', clsValue);
      }
    });

    // Long Tasks
    this.observeEntries('longtask', (entry) => this.recordLongTask(entry));
  }

  /**
   * Observes a single entry type and remembers the observer for `destroy()`.
   *
   * `buffered: true` replays entries that were emitted before this tracker
   * was constructed (paint and navigation entries usually are). Registration
   * is isolated per type so one unsupported type cannot prevent the others
   * from being observed.
   *
   * @param {string} type - Performance entry type to observe.
   * @param {(entry: PerformanceEntry) => void} handleEntry - Called per entry.
   */
  observeEntries(type, handleEntry) {
    try {
      const observer = new PerformanceObserver((list) => {
        for (const entry of list.getEntries()) {
          handleEntry(entry);
        }
      });
      observer.observe({ type, buffered: true });
      this.observers.push(observer);
    } catch (error) {
      // Entry type not supported by this browser: skip it, keep the rest.
    }
  }

  /**
   * Breaks a navigation timing entry into `navigation.*` phase metrics.
   *
   * @param {PerformanceNavigationTiming} entry
   */
  recordNavigationMetrics(entry) {
    // PerformanceNavigationTiming has no `navigationStart` (that belongs to
    // the legacy `performance.timing` object); its time origin is `startTime`.
    const origin = entry.navigationStart ?? entry.startTime ?? 0;

    const metrics = {
      dns_lookup: entry.domainLookupEnd - entry.domainLookupStart,
      tcp_connection: entry.connectEnd - entry.connectStart,
      tls_handshake: entry.secureConnectionStart > 0 ?
        entry.connectEnd - entry.secureConnectionStart : 0,
      request_time: entry.responseStart - entry.requestStart,
      response_time: entry.responseEnd - entry.responseStart,
      dom_loading: entry.domContentLoadedEventStart - entry.responseEnd,
      dom_interactive: entry.domInteractive - origin,
      dom_complete: entry.domComplete - origin,
      page_load: entry.loadEventEnd - origin
    };

    Object.entries(metrics).forEach(([name, value]) => {
      this.recordMetric(`navigation.${name}`, value);
    });
  }

  /**
   * Records a paint entry as `paint.<name>` with hyphens turned into
   * underscores (e.g. `paint.first_contentful_paint`).
   *
   * @param {PerformanceEntry} entry
   */
  recordPaintMetrics(entry) {
    // Global regex: a plain string pattern would only replace the first hyphen.
    this.recordMetric(`paint.${entry.name.replace(/-/g, '_')}`, entry.startTime);
  }

  /**
   * Records a long task and warns on the console when it exceeds 500 ms.
   *
   * @param {PerformanceEntry} entry
   */
  recordLongTask(entry) {
    this.recordMetric('long_task', entry.duration);

    // Alert on very long tasks
    if (entry.duration > 500) {
      console.warn('Very long task detected:', {
        duration: entry.duration,
        startTime: entry.startTime
      });
    }
  }

  /**
   * Starts (or restarts) a named timer.
   *
   * @param {string} name
   */
  startTimer(name) {
    this.startTimes.set(name, performance.now());
  }

  /**
   * Stops a named timer and records its duration as a metric.
   *
   * @param {string} name
   * @param {object} [metadata] - Extra fields merged into the metric.
   * @returns {number|null} Elapsed milliseconds, or null if no such timer.
   */
  endTimer(name, metadata = {}) {
    const startTime = this.startTimes.get(name);
    // Compare against undefined: a start time of 0 is a valid timestamp.
    if (startTime === undefined) return null;

    const duration = performance.now() - startTime;
    this.recordMetric(name, duration, metadata);
    this.startTimes.delete(name);
    return duration;
  }

  /**
   * Stores a metric locally and sends it to the monitoring endpoint.
   * Note that `metadata` is spread last and can override the base fields.
   *
   * @param {string} name
   * @param {number} value
   * @param {object} [metadata]
   */
  recordMetric(name, value, metadata = {}) {
    if (!this.isEnabled) return;

    const metric = {
      name,
      value,
      timestamp: Date.now(),
      url: typeof window !== 'undefined' ? window.location.pathname : null,
      userAgent: typeof navigator !== 'undefined' ? navigator.userAgent : null,
      ...metadata
    };

    // Store locally
    if (!this.metrics.has(name)) {
      this.metrics.set(name, []);
    }
    this.metrics.get(name).push(metric);

    // Send to monitoring service
    this.sendMetric(metric);
  }

  /**
   * Records the duration of a finished game and its move rate.
   *
   * @param {string} gameId
   * @param {number} moveCount
   * @param {number} gameTime - Game duration in milliseconds.
   */
  recordChessMetrics(gameId, moveCount, gameTime) {
    this.recordMetric('chess.game_duration', gameTime, { gameId, moveCount });

    // A zero-length game would yield Infinity/NaN, which JSON turns into null.
    if (gameTime > 0) {
      this.recordMetric('chess.moves_per_minute', (moveCount / gameTime) * 60000, { gameId });
    }
  }

  /**
   * Records a single occurrence of a user action as `user.<action>`.
   *
   * @param {string} action
   * @param {object} [metadata]
   */
  recordUserAction(action, metadata = {}) {
    this.recordMetric(`user.${action}`, 1, {
      ...metadata,
      timestamp: Date.now()
    });
  }

  /**
   * POSTs one metric to `/api/metrics`. Failures never propagate; they are
   * only logged when `NODE_ENV` is `development`.
   *
   * @param {object} metric
   * @returns {Promise<void>}
   */
  async sendMetric(metric) {
    try {
      await fetch('/api/metrics', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json'
        },
        body: JSON.stringify(metric)
      });
    } catch (error) {
      // Fail silently in production
      if (process.env.NODE_ENV === 'development') {
        console.error('Failed to send metric:', error);
      }
    }
  }

  /**
   * @returns {Object<string, object[]>} Recorded metrics keyed by name.
   */
  getMetrics() {
    return Object.fromEntries(this.metrics);
  }

  /** Drops every locally stored metric. */
  clearMetrics() {
    this.metrics.clear();
  }

  /** Disconnects all registered performance observers. */
  destroy() {
    this.observers.forEach(observer => observer.disconnect());
    this.observers = [];
  }
}

// Shared tracker instance.
export const performanceTracker = new PerformanceTracker();

Real User Monitoring (RUM)

// shared/monitoring/RumTracker.js
/**
 * Real User Monitoring: tracks page views, DOM and chess interactions,
 * runtime errors and slow resources for one browser session, and reports
 * them to `/api/rum` both immediately (selected events) and in periodic
 * batches.
 */
export class RumTracker {
  constructor() {
    this.sessionId = this.generateSessionId();
    this.userId = null;
    this.pageViews = [];
    this.errors = [];
    this.interactions = [];
    // Handles kept so destroy() can stop background work.
    this.batchTimer = null;
    this.resourceObserver = null;

    this.initializeTracking();
  }

  /** Wires up all trackers; does nothing outside a browser. */
  initializeTracking() {
    if (typeof window === 'undefined') return;

    // Track page views
    this.trackPageView();

    // Track user interactions
    this.trackInteractions();

    // Track errors
    this.trackErrors();

    // Track resource loading
    this.trackResources();

    // Send data periodically
    this.startBatching();
  }

  /**
   * @returns {string} `session_<epoch ms>_<up to 9 base-36 chars>`; not
   *   cryptographically random.
   */
  generateSessionId() {
    // slice(2, 11) is the non-deprecated equivalent of substr(2, 9).
    return `session_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /**
   * Associates subsequently recorded events with a user.
   *
   * @param {string|null} userId
   */
  setUserId(userId) {
    this.userId = userId;
  }

  /** Records the current page view and reports it immediately. */
  trackPageView() {
    const pageView = {
      sessionId: this.sessionId,
      userId: this.userId,
      url: window.location.href,
      title: document.title,
      referrer: document.referrer,
      timestamp: Date.now(),
      viewport: {
        width: window.innerWidth,
        height: window.innerHeight
      },
      screen: {
        width: screen.width,
        height: screen.height,
        pixelRatio: window.devicePixelRatio
      },
      connection: navigator.connection ? {
        effectiveType: navigator.connection.effectiveType,
        downlink: navigator.connection.downlink,
        rtt: navigator.connection.rtt
      } : null
    };

    this.pageViews.push(pageView);
    this.sendEvent('page_view', pageView);
  }

  /** Subscribes to click, submit and custom `chess-*` DOM events. */
  trackInteractions() {
    // Track clicks
    document.addEventListener('click', (event) => {
      this.recordInteraction('click', event);
    });

    // Track form submissions
    document.addEventListener('submit', (event) => {
      this.recordInteraction('form_submit', event);
    });

    // Track chess-specific interactions
    document.addEventListener('chess-move', (event) => {
      this.recordChessInteraction('move', event.detail);
    });

    document.addEventListener('chess-game-start', (event) => {
      this.recordChessInteraction('game_start', event.detail);
    });

    document.addEventListener('chess-game-end', (event) => {
      this.recordChessInteraction('game_end', event.detail);
    });
  }

  /**
   * Queues a DOM interaction for the next batch.
   *
   * @param {string} type - Interaction label, e.g. `click`.
   * @param {Event} event - Originating DOM event.
   */
  recordInteraction(type, event) {
    // Type check instead of truthiness: a click at x === 0 is still a click.
    const hasCoordinates = typeof event.clientX === 'number';

    const interaction = {
      sessionId: this.sessionId,
      userId: this.userId,
      type,
      timestamp: Date.now(),
      element: this.getElementInfo(event.target),
      coordinates: hasCoordinates ? {
        x: event.clientX,
        y: event.clientY
      } : null
    };

    this.interactions.push(interaction);
  }

  /**
   * Queues a chess interaction and also reports it immediately.
   *
   * @param {string} type - Suffix appended to `chess_`.
   * @param {*} details - Payload taken from the custom event's `detail`.
   */
  recordChessInteraction(type, details) {
    const interaction = {
      sessionId: this.sessionId,
      userId: this.userId,
      type: `chess_${type}`,
      timestamp: Date.now(),
      details
    };

    this.interactions.push(interaction);
    this.sendEvent('chess_interaction', interaction);
  }

  /**
   * Summarises the element an event targeted.
   *
   * @param {Element|null|undefined} element
   * @returns {{tagName: *, className: *, id: *, textContent: (string|undefined)}|null}
   *   Null when there is no target.
   */
  getElementInfo(element) {
    if (element == null) return null;

    // SVG elements expose className as an SVGAnimatedString object; use its
    // string value so the payload stays a plain string.
    const rawClass = element.className;
    const className = rawClass !== null && typeof rawClass === 'object'
      ? rawClass.baseVal ?? ''
      : rawClass;

    return {
      tagName: element.tagName,
      className,
      id: element.id,
      textContent: element.textContent?.substring(0, 100)
    };
  }

  /** Subscribes to uncaught errors and unhandled promise rejections. */
  trackErrors() {
    window.addEventListener('error', (event) => {
      this.recordError('javascript', {
        message: event.message,
        filename: event.filename,
        lineno: event.lineno,
        colno: event.colno,
        stack: event.error?.stack
      });
    });

    window.addEventListener('unhandledrejection', (event) => {
      this.recordError('promise', {
        reason: event.reason?.toString(),
        stack: event.reason?.stack
      });
    });
  }

  /**
   * Queues an error and reports it immediately.
   *
   * @param {string} type
   * @param {object} details - Fields merged into the error record.
   */
  recordError(type, details) {
    const error = {
      sessionId: this.sessionId,
      userId: this.userId,
      type,
      timestamp: Date.now(),
      url: window.location.href,
      userAgent: navigator.userAgent,
      ...details
    };

    this.errors.push(error);
    this.sendEvent('error', error);
  }

  /** Observes resource timing entries when PerformanceObserver exists. */
  trackResources() {
    if ('PerformanceObserver' in window) {
      const resourceObserver = new PerformanceObserver((list) => {
        for (const entry of list.getEntries()) {
          this.recordResourceTiming(entry);
        }
      });
      resourceObserver.observe({ entryTypes: ['resource'] });
      this.resourceObserver = resourceObserver;
    }
  }

  /**
   * Reports resources that took longer than one second to load.
   *
   * @param {PerformanceResourceTiming} entry
   */
  recordResourceTiming(entry) {
    // Never report our own reporting endpoint: a slow /api/rum request would
    // otherwise trigger another /api/rum request, and so on.
    const resourceUrl = typeof entry.name === 'string' ? entry.name.split('?')[0] : '';
    if (resourceUrl.endsWith('/api/rum')) return;

    const resource = {
      sessionId: this.sessionId,
      name: entry.name,
      type: entry.initiatorType,
      size: entry.transferSize,
      duration: entry.duration,
      timestamp: Date.now()
    };

    // Only track slow resources
    if (entry.duration > 1000) {
      this.sendEvent('slow_resource', resource);
    }
  }

  /** Flushes the queues every 30 seconds and once more on page unload. */
  startBatching() {
    this.batchTimer = setInterval(() => {
      this.sendBatch();
    }, 30000); // Send every 30 seconds

    // Send on page unload. keepalive lets the request outlive the page; a
    // plain fetch started here may be cancelled by the browser.
    window.addEventListener('beforeunload', () => {
      this.sendBatch({ keepalive: true });
    });
  }

  /**
   * Sends everything queued so far as one `batch` event and empties the
   * queues. The queues are cleared whether or not the request succeeds.
   *
   * @param {{keepalive?: boolean}} [options]
   */
  sendBatch(options = {}) {
    const data = {
      sessionId: this.sessionId,
      userId: this.userId,
      timestamp: Date.now(),
      pageViews: [...this.pageViews],
      interactions: [...this.interactions],
      errors: [...this.errors]
    };

    if (data.pageViews.length || data.interactions.length || data.errors.length) {
      this.sendEvent('batch', data, options);

      // Clear sent data
      this.pageViews = [];
      this.interactions = [];
      this.errors = [];
    }
  }

  /**
   * POSTs one event to `/api/rum`. Never rejects.
   *
   * @param {string} type
   * @param {object} data
   * @param {{keepalive?: boolean}} [options] - `keepalive` is only requested
   *   for unload flushes because browsers limit the size of such bodies.
   * @returns {Promise<void>}
   */
  async sendEvent(type, data, { keepalive = false } = {}) {
    try {
      await fetch('/api/rum', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json'
        },
        body: JSON.stringify({ type, data }),
        keepalive
      });
    } catch (error) {
      // Fail silently
    }
  }

  /**
   * Stops the batch timer and the resource observer. DOM event listeners
   * registered by this tracker stay attached.
   */
  destroy() {
    if (this.batchTimer !== null) {
      clearInterval(this.batchTimer);
      this.batchTimer = null;
    }
    if (this.resourceObserver) {
      this.resourceObserver.disconnect();
      this.resourceObserver = null;
    }
  }
}

// Shared tracker instance.
export const rumTracker = new RumTracker();

Error Tracking and Alerting

Error Monitoring System

// shared/monitoring/ErrorTracker.js
/**
 * Captures uncaught errors, unhandled rejections and failed `fetch` calls,
 * enriches them with page/session context, reports them to `/api/errors`
 * and raises an alert on `/api/alerts` once a per-type threshold is reached.
 */
export class ErrorTracker {
  constructor() {
    this.errors = [];
    this.errorCounts = new Map();
    this.alertThresholds = {
      javascript: 5,
      network: 10,
      chess_engine: 3,
      websocket: 5
    };
    // Un-patched fetch, set by interceptFetch(); used for the tracker's own
    // reporting requests.
    this.nativeFetch = null;

    this.setupErrorHandling();
  }

  /** Installs global handlers; does nothing outside a browser. */
  setupErrorHandling() {
    if (typeof window === 'undefined') return;

    // Global error handler
    window.addEventListener('error', (event) => {
      this.captureError({
        type: 'javascript',
        message: event.message,
        filename: event.filename,
        lineno: event.lineno,
        colno: event.colno,
        stack: event.error?.stack,
        timestamp: Date.now()
      });
    });

    // Unhandled promise rejections
    window.addEventListener('unhandledrejection', (event) => {
      this.captureError({
        type: 'promise',
        message: event.reason?.toString(),
        stack: event.reason?.stack,
        timestamp: Date.now()
      });
    });

    // Network errors
    this.interceptFetch();
  }

  /**
   * Replaces `window.fetch` with a wrapper that captures rejected requests
   * and non-2xx responses as `network` errors. The response or rejection is
   * passed through to the caller unchanged. Calling this twice is a no-op.
   */
  interceptFetch() {
    if (this.nativeFetch || typeof window.fetch !== 'function') return;

    const nativeFetch = window.fetch.bind(window);
    this.nativeFetch = nativeFetch;

    window.fetch = async (...args) => {
      // fetch accepts a string, URL or Request; keep a plain string.
      const target = args[0];
      const requestUrl = typeof target === 'string' ? target : target?.url ?? String(target);

      try {
        const response = await nativeFetch(...args);

        if (!response.ok) {
          this.captureError({
            type: 'network',
            message: `HTTP ${response.status}: ${response.statusText}`,
            // Separate key: `url` is overwritten with the page URL when the
            // error is enriched in captureError().
            requestUrl,
            status: response.status,
            timestamp: Date.now()
          });
        }

        return response;
      } catch (error) {
        this.captureError({
          type: 'network',
          message: error.message,
          requestUrl,
          timestamp: Date.now()
        });
        throw error;
      }
    };
  }

  /**
   * Enriches, stores, counts and reports an error.
   *
   * @param {{type: string, message?: string, timestamp?: number}} error
   */
  captureError(error) {
    // Enrich error with context
    const enrichedError = {
      ...error,
      id: this.generateErrorId(),
      url: window.location.href,
      userAgent: navigator.userAgent,
      userId: this.getCurrentUserId(),
      sessionId: this.getSessionId(),
      context: this.getErrorContext()
    };

    this.errors.push(enrichedError);
    this.updateErrorCounts(error.type);
    this.checkAlertThresholds(error.type);

    // Send to error tracking service
    this.sendError(enrichedError);

    console.error('Error captured:', enrichedError);
  }

  /**
   * @returns {string} `error_<epoch ms>_<up to 9 base-36 chars>`.
   */
  generateErrorId() {
    return `error_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /**
   * @returns {string} Stored `user-id`, or `anonymous` when absent or when
   *   storage access is denied.
   */
  getCurrentUserId() {
    // Get from your auth system
    try {
      return localStorage.getItem('user-id') || 'anonymous';
    } catch {
      // Storage access can throw (e.g. blocked by privacy settings).
      return 'anonymous';
    }
  }

  /**
   * @returns {string} Stored `session-id`, or `unknown` when absent or when
   *   storage access is denied.
   */
  getSessionId() {
    // Get from your session system
    try {
      return sessionStorage.getItem('session-id') || 'unknown';
    } catch {
      return 'unknown';
    }
  }

  /**
   * @returns {object} Route, game state, recent actions and basic
   *   performance figures at the time of the error.
   */
  getErrorContext() {
    return {
      route: window.location.pathname,
      gameState: this.getGameState(),
      userActions: this.getRecentUserActions(),
      performanceMetrics: this.getBasicPerformanceMetrics()
    };
  }

  /**
   * Reads game details from the first element carrying `data-game-id`.
   *
   * @returns {{gameId: string, moveCount: string, currentPlayer: string}|null}
   */
  getGameState() {
    // Get current chess game state if available
    try {
      const gameElement = document.querySelector('[data-game-id]');
      return gameElement ? {
        gameId: gameElement.dataset.gameId,
        moveCount: gameElement.dataset.moveCount,
        currentPlayer: gameElement.dataset.currentPlayer
      } : null;
    } catch {
      return null;
    }
  }

  /**
   * @returns {object[]} Up to the five most recent interactions from the RUM
   *   tracker, or an empty list when no tracker is in scope.
   */
  getRecentUserActions() {
    // Optional chaining does not protect against an undeclared identifier;
    // `rumTracker` is only present if the host module imports it.
    try {
      return rumTracker?.interactions?.slice(-5) || [];
    } catch {
      return [];
    }
  }

  /**
   * @returns {{memory: (object|null), timing: (object|null)}} Heap and load
   *   time figures where the browser exposes them.
   */
  getBasicPerformanceMetrics() {
    return {
      memory: performance.memory ? {
        usedJSHeapSize: performance.memory.usedJSHeapSize,
        totalJSHeapSize: performance.memory.totalJSHeapSize
      } : null,
      timing: performance.timing ? {
        loadTime: performance.timing.loadEventEnd - performance.timing.navigationStart
      } : null
    };
  }

  /**
   * Increments the counter for `errorType` in the current one-minute bucket.
   *
   * @param {string} errorType
   */
  updateErrorCounts(errorType) {
    const now = Date.now();
    const key = `${errorType}:${now - (now % 60000)}`; // Per minute
    this.errorCounts.set(key, (this.errorCounts.get(key) || 0) + 1);

    // Clean old counts
    this.cleanOldCounts();
  }

  /** Drops minute buckets older than ten minutes. */
  cleanOldCounts() {
    const tenMinutesAgo = Date.now() - (10 * 60 * 1000);

    for (const [key] of this.errorCounts) {
      // The bucket timestamp follows the last colon of the key.
      const timestamp = Number.parseInt(key.slice(key.lastIndexOf(':') + 1), 10);
      if (timestamp < tenMinutesAgo) {
        this.errorCounts.delete(key);
      }
    }
  }

  /**
   * Triggers an alert when the retained count for `errorType` has reached
   * its configured threshold. Types without a threshold never alert.
   *
   * @param {string} errorType
   */
  checkAlertThresholds(errorType) {
    const threshold = this.alertThresholds[errorType];
    if (!threshold) return;

    const recentCounts = Array.from(this.errorCounts.entries())
      .filter(([key]) => key.startsWith(`${errorType}:`))
      .reduce((sum, [, count]) => sum + count, 0);

    if (recentCounts >= threshold) {
      this.triggerAlert(errorType, recentCounts, threshold);
    }
  }

  /**
   * Sends a threshold alert; severity is `critical` at twice the threshold,
   * `warning` otherwise.
   *
   * @param {string} errorType
   * @param {number} count
   * @param {number} threshold
   */
  triggerAlert(errorType, count, threshold) {
    const alert = {
      type: 'error_threshold_exceeded',
      errorType,
      count,
      threshold,
      timestamp: Date.now(),
      severity: count >= threshold * 2 ? 'critical' : 'warning'
    };

    this.sendAlert(alert);
    console.warn('Error threshold exceeded:', alert);
  }

  /**
   * POSTs an error to `/api/errors`. Never rejects.
   *
   * Uses the un-patched fetch: going through the intercepted `window.fetch`
   * would capture a failing report as a new error, which would be reported
   * again, without end.
   *
   * @param {object} error
   * @returns {Promise<void>}
   */
  async sendError(error) {
    try {
      const send = this.nativeFetch ?? fetch;
      await send('/api/errors', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json'
        },
        body: JSON.stringify(error)
      });
    } catch (e) {
      // Fail silently
    }
  }

  /**
   * POSTs an alert to `/api/alerts` through the un-patched fetch (see
   * sendError). Never rejects.
   *
   * @param {object} alert
   * @returns {Promise<void>}
   */
  async sendAlert(alert) {
    try {
      const send = this.nativeFetch ?? fetch;
      await send('/api/alerts', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json'
        },
        body: JSON.stringify(alert)
      });
    } catch (e) {
      // Fail silently
    }
  }

  /**
   * @returns {{totalErrors: number, errorsByType: Object<string, number>, recentErrors: object[]}}
   *   Totals per type plus the ten most recent errors.
   */
  getErrorSummary() {
    const summary = {
      totalErrors: this.errors.length,
      errorsByType: {},
      recentErrors: this.errors.slice(-10)
    };

    this.errors.forEach(error => {
      summary.errorsByType[error.type] = (summary.errorsByType[error.type] || 0) + 1;
    });

    return summary;
  }

  /** Forgets all stored errors and counters. */
  clearErrors() {
    this.errors = [];
    this.errorCounts.clear();
  }
}

// Shared tracker instance.
export const errorTracker = new ErrorTracker();

Infrastructure Monitoring

System Metrics Collection

// backend/monitoring/SystemMonitor.js
const os = require('os');
const process = require('process');
const { EventEmitter } = require('events');

/**
 * Periodically samples host, process and application metrics, keeps a
 * rolling window of samples in memory and emits them as events.
 *
 * Events: `metrics` (one sample object) and `alerts` (array of alerts).
 */
class SystemMonitor extends EventEmitter {
  constructor() {
    super();
    this.metrics = new Map();
    this.interval = null;
    this.isRunning = false;
  }

  /**
   * Begins sampling on a fixed interval; a second call while running is ignored.
   *
   * @param {number} [intervalMs=5000] - Milliseconds between samples.
   */
  start(intervalMs = 5000) {
    if (this.isRunning) {
      return;
    }

    this.isRunning = true;
    this.interval = setInterval(() => this.collectMetrics(), intervalMs);

    console.log('System monitoring started');
  }

  /** Cancels the sampling timer, if any, and marks the monitor as stopped. */
  stop() {
    if (this.interval) {
      clearInterval(this.interval);
      this.interval = null;
    }

    this.isRunning = false;
    console.log('System monitoring stopped');
  }

  /**
   * Takes one sample, stores it keyed by its timestamp, emits `metrics`,
   * trims the store and evaluates alert rules.
   */
  collectMetrics() {
    const takenAt = Date.now();

    const sample = {
      timestamp: takenAt,
      system: this.getSystemMetrics(),
      process: this.getProcessMetrics(),
      nodejs: this.getNodeJSMetrics(),
      custom: this.getCustomMetrics()
    };

    this.metrics.set(takenAt, sample);
    this.emit('metrics', sample);

    // Once more than 100 samples are held, evict the first-inserted one.
    if (this.metrics.size > 100) {
      const [firstKey] = this.metrics.keys();
      this.metrics.delete(firstKey);
    }

    this.checkAlerts(sample);
  }

  /**
   * @returns {object} Host identity, uptime, load averages, memory usage
   *   and CPU count as reported by the `os` module.
   */
  getSystemMetrics() {
    const [load1, load5, load15] = os.loadavg();
    const total = os.totalmem();
    const free = os.freemem();
    const used = total - free;

    return {
      hostname: os.hostname(),
      platform: os.platform(),
      arch: os.arch(),
      uptime: os.uptime(),
      loadAverage: {
        '1m': load1,
        '5m': load5,
        '15m': load15
      },
      memory: {
        total,
        free,
        used,
        usagePercent: (used / total) * 100
      },
      cpus: os.cpus().length
    };
  }

  /**
   * @returns {object} PID, uptime, memory usage and CPU times of this process.
   */
  getProcessMetrics() {
    const { rss, heapTotal, heapUsed, external, arrayBuffers } = process.memoryUsage();
    const { user, system } = process.cpuUsage();

    return {
      pid: process.pid,
      uptime: process.uptime(),
      memory: { rss, heapTotal, heapUsed, external, arrayBuffers },
      cpu: { user, system }
    };
  }

  /**
   * @returns {{version: string, versions: object, env: (string|undefined)}}
   *   Runtime version information and the current NODE_ENV.
   */
  getNodeJSMetrics() {
    const { version, versions, env } = process;
    return { version, versions, env: env.NODE_ENV };
  }

  /**
   * @returns {object} Application-level counters read from the optional
   *   global managers.
   */
  getCustomMetrics() {
    const activeGames = this.getActiveGameCount();
    const connectedUsers = this.getConnectedUserCount();
    const websocketConnections = this.getWebSocketConnectionCount();
    const gameEngineRequests = this.getGameEngineRequestCount();

    return { activeGames, connectedUsers, websocketConnections, gameEngineRequests };
  }

  /** @returns {number} Count from `global.gameManager`, or 0 when absent. */
  getActiveGameCount() {
    const reported = global.gameManager?.getActiveGameCount();
    return reported || 0;
  }

  /** @returns {number} Count from `global.userManager`, or 0 when absent. */
  getConnectedUserCount() {
    const reported = global.userManager?.getConnectedUserCount();
    return reported || 0;
  }

  /** @returns {number} Count from `global.wsManager`, or 0 when absent. */
  getWebSocketConnectionCount() {
    const reported = global.wsManager?.getConnectionCount();
    return reported || 0;
  }

  /** @returns {number} Count from `global.chessEngine`, or 0 when absent. */
  getGameEngineRequestCount() {
    const reported = global.chessEngine?.getRequestCount();
    return reported || 0;
  }

  /**
   * Evaluates the alert rules against one sample and emits `alerts` when at
   * least one rule fires.
   *
   * @param {object} metrics - A sample as produced by collectMetrics().
   */
  checkAlerts(metrics) {
    const { memory, loadAverage, cpus } = metrics.system;
    const { heapUsed, heapTotal } = metrics.process.memory;
    const raised = [];
    const raise = (type, severity, value, threshold) => {
      raised.push({ type, severity, value, threshold });
    };

    // Host memory above 90 %.
    if (memory.usagePercent > 90) {
      raise('high_memory_usage', 'critical', memory.usagePercent, 90);
    }

    // One-minute load above twice the CPU count.
    const loadLimit = cpus * 2;
    if (loadAverage['1m'] > loadLimit) {
      raise('high_load_average', 'warning', loadAverage['1m'], loadLimit);
    }

    // Heap used above 85 % of heap total.
    const heapPercent = (heapUsed / heapTotal) * 100;
    if (heapPercent > 85) {
      raise('high_heap_usage', 'warning', heapPercent, 85);
    }

    if (raised.length > 0) {
      this.emit('alerts', raised);
    }
  }

  /**
   * Summarises the samples recorded within the given time range.
   *
   * @param {number} [timeRangeMs=300000] - Look-back window in milliseconds.
   * @returns {{count: number, timeRange: number, averages: object, latest: object}|null}
   *   Null when no sample falls inside the window.
   */
  getMetricsSummary(timeRangeMs = 300000) {
    const oldestAllowed = Date.now() - timeRangeMs;
    const inWindow = [];

    for (const [takenAt, sample] of this.metrics) {
      if (takenAt >= oldestAllowed) {
        inWindow.push(sample);
      }
    }

    if (inWindow.length === 0) {
      return null;
    }

    return {
      count: inWindow.length,
      timeRange: timeRangeMs,
      averages: this.calculateAverages(inWindow),
      latest: inWindow[inWindow.length - 1]
    };
  }

  /**
   * @param {object[]} metrics - Samples to average.
   * @returns {object} Arithmetic means of selected fields.
   */
  calculateAverages(metrics) {
    const mean = (pick) => {
      let total = 0;
      for (const sample of metrics) {
        total += pick(sample);
      }
      return total / metrics.length;
    };

    return {
      memoryUsagePercent: mean((m) => m.system.memory.usagePercent),
      loadAverage1m: mean((m) => m.system.loadAverage['1m']),
      heapUsed: mean((m) => m.process.memory.heapUsed),
      activeGames: mean((m) => m.custom.activeGames),
      connectedUsers: mean((m) => m.custom.connectedUsers)
    };
  }
}

// Export the class itself (not an instance); no monitor is created here.
module.exports = SystemMonitor;

Docker Container Monitoring

# docker-compose.monitoring.yml
# Monitoring stack: Prometheus (metrics store), Grafana (dashboards),
# node-exporter (host metrics) and cAdvisor (container metrics).
version: '3.8'
services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'

  grafana:
    image: grafana/grafana:latest
    ports:
      # Host port 3001 maps to Grafana's port 3000 inside the container.
      - "3001:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards
      - ./monitoring/grafana/datasources:/etc/grafana/provisioning/datasources
    environment:
      # Set GRAFANA_ADMIN_PASSWORD in the environment or an .env file.
      # The fallback exists only so the example starts out of the box;
      # never deploy with it.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin123}

  node-exporter:
    image: prom/node-exporter:latest
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Current name of the former `ignored-mount-points` flag.
      # `$$` escapes `$` so Compose does not treat it as interpolation.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro

volumes:
  prometheus_data:
  grafana_data:

Log Aggregation and Analysis

Structured Logging

// shared/logging/Logger.js
/**
 * Structured logger that fans each entry out to a set of transports.
 *
 * The minimum level comes from `LOG_LEVEL` (default `info`). Entries carry
 * the logger context plus request/user/session ids read from globals.
 */
export class Logger {
  /**
   * @param {string} [context=''] - Label attached to every entry.
   * @param {Array<{log: Function}>|null} [transports=null] - Transports to
   *   use as-is. When omitted, the default transports are created from the
   *   environment.
   */
  constructor(context = '', transports = null) {
    this.context = context;
    this.logLevel = process.env.LOG_LEVEL || 'info';

    if (transports) {
      this.transports = transports;
    } else {
      this.transports = [];
      this.setupTransports();
    }
  }

  /** Creates the default transports based on the environment. */
  setupTransports() {
    // Console transport (always enabled)
    this.transports.push(new ConsoleTransport());

    // File transport (in production)
    if (process.env.NODE_ENV === 'production') {
      this.transports.push(new FileTransport());
    }

    // Remote transport (if configured)
    if (process.env.LOG_ENDPOINT) {
      this.transports.push(new RemoteTransport(process.env.LOG_ENDPOINT));
    }
  }

  /**
   * Builds a log entry and hands it to every transport.
   *
   * @param {string} level - One of `error`, `warn`, `info`, `debug`.
   * @param {string} message
   * @param {object} [metadata]
   */
  log(level, message, metadata = {}) {
    // Check the level first so suppressed entries cost nothing to build.
    if (!this.shouldLog(level)) return;

    const logEntry = {
      timestamp: new Date().toISOString(),
      level,
      message,
      context: this.context,
      metadata,
      requestId: this.getRequestId(),
      userId: this.getUserId(),
      sessionId: this.getSessionId()
    };

    // Send to all transports. A failing transport must neither stop the
    // remaining ones nor throw into the code that is merely logging.
    this.transports.forEach(transport => {
      try {
        transport.log(logEntry);
      } catch (transportError) {
        console.error('Log transport failed:', transportError);
      }
    });
  }

  /**
   * @param {string} level - Level of the entry being logged.
   * @returns {boolean} True when `level` is at or above the configured
   *   minimum. Unknown entry levels are never logged; an unrecognised
   *   configured level is treated as `info`, and its case is ignored.
   */
  shouldLog(level) {
    const levels = { error: 0, warn: 1, info: 2, debug: 3 };
    const configured = levels[String(this.logLevel).toLowerCase()];
    // A typo such as LOG_LEVEL=verbose must not silence the logger.
    const limit = configured === undefined ? levels.info : configured;
    return levels[level] <= limit;
  }

  /** @returns {*} `global.requestId`, or null when unset. */
  getRequestId() {
    // Get from async context or headers
    return global.requestId || null;
  }

  /** @returns {*} `global.userId`, or null when unset. */
  getUserId() {
    // Get from session or token
    return global.userId || null;
  }

  /** @returns {*} `global.sessionId`, or null when unset. */
  getSessionId() {
    // Get from session
    return global.sessionId || null;
  }

  /**
   * Logs at `error` level, attaching name, message and stack of `error`.
   *
   * @param {string} message
   * @param {Error|null} [error]
   * @param {object} [metadata]
   */
  error(message, error = null, metadata = {}) {
    this.log('error', message, {
      ...metadata,
      error: error ? {
        name: error.name,
        message: error.message,
        stack: error.stack
      } : null
    });
  }

  /**
   * @param {string} message
   * @param {object} [metadata]
   */
  warn(message, metadata = {}) {
    this.log('warn', message, metadata);
  }

  /**
   * @param {string} message
   * @param {object} [metadata]
   */
  info(message, metadata = {}) {
    this.log('info', message, metadata);
  }

  /**
   * @param {string} message
   * @param {object} [metadata]
   */
  debug(message, metadata = {}) {
    this.log('debug', message, metadata);
  }

  // Chess-specific logging methods

  /**
   * @param {string} event
   * @param {string} gameId
   * @param {object} [metadata]
   */
  logGameEvent(event, gameId, metadata = {}) {
    this.info(`Chess game event: ${event}`, {
      gameId,
      event,
      ...metadata
    });
  }

  /**
   * @param {string} gameId
   * @param {*} move
   * @param {boolean} isValid
   * @param {object} [metadata]
   */
  logMoveAttempt(gameId, move, isValid, metadata = {}) {
    this.info('Chess move attempt', {
      gameId,
      move,
      isValid,
      ...metadata
    });
  }

  /**
   * @param {string} gameId
   * @param {string} requestType
   * @param {number} duration
   * @param {object} [metadata]
   */
  logAiRequest(gameId, requestType, duration, metadata = {}) {
    this.info('AI request completed', {
      gameId,
      requestType,
      duration,
      ...metadata
    });
  }

  /**
   * Creates a logger for a sub-context that shares this logger's transports
   * and level.
   *
   * @param {string} additionalContext
   * @returns {Logger}
   */
  child(additionalContext) {
    const childContext = this.context
      ? `${this.context}:${additionalContext}`
      : String(additionalContext);

    // Hand the transports to the constructor so it does not build (and then
    // discard) a fresh set: a discarded remote transport would keep its
    // flush timer running forever.
    const childLogger = new Logger(childContext, this.transports);
    childLogger.logLevel = this.logLevel;
    return childLogger;
  }
}

/**
 * Transport that prints each entry through the console method named after
 * the entry's level.
 */
class ConsoleTransport {
  /**
   * Formats the entry as `<timestamp> <LEVEL> <[context]> <message>`,
   * followed by pretty-printed metadata on a new line when metadata has keys.
   *
   * @param {{timestamp: string, level: string, message: string, context: string, metadata: object}} entry
   */
  log(entry) {
    const tag = entry.context ? `[${entry.context}]` : '';

    let details = '';
    if (Object.keys(entry.metadata).length > 0) {
      details = `\n${JSON.stringify(entry.metadata, null, 2)}`;
    }

    const head = [entry.timestamp, entry.level.toUpperCase(), tag, entry.message].join(' ');
    console[entry.level](`${head}${details}`);
  }
}

/**
 * Transport that appends each entry as one JSON line to a per-day file
 * inside `LOG_DIR` (default `./logs`).
 */
class FileTransport {
  constructor() {
    this.fs = require('fs');
    this.path = require('path');
    this.logDir = process.env.LOG_DIR || './logs';
    this.ensureLogDir();
  }

  /** Creates the log directory, including parents, when it is missing. */
  ensureLogDir() {
    const alreadyThere = this.fs.existsSync(this.logDir);
    if (alreadyThere) {
      return;
    }
    this.fs.mkdirSync(this.logDir, { recursive: true });
  }

  /**
   * Synchronously appends the entry to `<logDir>/<YYYY-MM-DD>.log`, where
   * the date is the date part of the current ISO timestamp.
   *
   * @param {object} entry
   */
  log(entry) {
    const [day] = new Date().toISOString().split('T');
    const target = this.path.join(this.logDir, `${day}.log`);

    this.fs.appendFileSync(target, `${JSON.stringify(entry)}\n`);
  }
}

/**
 * Transport that buffers entries in memory and POSTs them to an HTTP
 * endpoint as `{ logs: [...] }` on a fixed interval.
 */
class RemoteTransport {
  /**
   * @param {string} endpoint - URL that receives the batches.
   * @param {object} [options]
   * @param {number} [options.flushInterval=5000] - Milliseconds between flushes.
   * @param {number} [options.maxBufferSize=1000] - Entries kept while the
   *   endpoint is unreachable; the oldest entries are dropped beyond this.
   */
  constructor(endpoint, { flushInterval = 5000, maxBufferSize = 1000 } = {}) {
    this.endpoint = endpoint;
    this.buffer = [];
    this.flushInterval = flushInterval;
    this.maxBufferSize = maxBufferSize;
    this.timer = null;
    this.startFlushing();
  }

  /**
   * Queues an entry for the next flush.
   *
   * @param {object} entry
   */
  log(entry) {
    this.buffer.push(entry);
    this.trimBuffer();
  }

  /** Drops the oldest entries so the buffer never exceeds `maxBufferSize`. */
  trimBuffer() {
    const overflow = this.buffer.length - this.maxBufferSize;
    if (overflow > 0) {
      this.buffer.splice(0, overflow);
    }
  }

  /** Starts the periodic flush; a second call while running is ignored. */
  startFlushing() {
    if (this.timer !== null) return;

    this.timer = setInterval(() => {
      this.flush();
    }, this.flushInterval);

    // In Node, do not let the flush timer alone keep the process alive.
    // Browser timers are plain numbers and have no unref().
    this.timer.unref?.();
  }

  /** Stops the periodic flush. Buffered entries stay in the buffer. */
  stop() {
    if (this.timer !== null) {
      clearInterval(this.timer);
      this.timer = null;
    }
  }

  /**
   * Sends all buffered entries in one request. When the request fails or
   * the endpoint answers with a non-2xx status, the entries are put back at
   * the front of the buffer for the next attempt. Never rejects.
   *
   * @returns {Promise<void>}
   */
  async flush() {
    if (this.buffer.length === 0) return;

    const logs = this.buffer;
    this.buffer = [];

    try {
      const response = await fetch(this.endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json'
        },
        body: JSON.stringify({ logs })
      });

      // fetch only rejects on network failure; an HTTP error status must be
      // treated as a failed delivery too, or the batch is silently lost.
      if (!response.ok) {
        throw new Error(`Log endpoint responded with HTTP ${response.status}`);
      }
    } catch (error) {
      // Put logs back in buffer if failed (ahead of anything logged since).
      this.buffer = logs.concat(this.buffer);
      this.trimBuffer();
      console.error('Failed to send logs to remote endpoint:', error);
    }
  }
}

// Shared logger instance created with the 'chess-app' context.
export const logger = new Logger('chess-app');

Dashboard and Visualization

Grafana Dashboard Configuration

{
  "dashboard": {
    "id": null,
    "title": "Chess Application Monitoring",
    "tags": ["chess", "application"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Active Games",
        "type": "stat",
        "targets": [
          {
            "expr": "chess_active_games",
            "legendFormat": "Active Games"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "color": {
              "mode": "thresholds"
            },
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 50},
                {"color": "red", "value": 100}
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "Response Time",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m])",
            "legendFormat": "Average Response Time"
          }
        ],
        "yAxes": [
          {
            "label": "Response Time (s)",
            "min": 0
          }
        ]
      },
      {
        "id": 3,
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total{status=~\"5..\"}[5m]) / rate(http_requests_total[5m]) * 100",
            "legendFormat": "Error Rate %"
          }
        ],
        "alert": {
          "conditions": [
            {
              "query": {
                "queryType": "",
                "refId": "A"
              },
              "reducer": {
                "type": "last",
                "params": []
              },
              "evaluator": {
                "params": [5],
                "type": "gt"
              }
            }
          ],
          "executionErrorState": "alerting",
          "for": "5m",
          "frequency": "10s",
          "handler": 1,
          "name": "High Error Rate",
          "noDataState": "no_data",
          "notifications": []
        }
      },
      {
        "id": 4,
        "title": "System Resources",
        "type": "graph",
        "targets": [
          {
            "expr": "100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100)",
            "legendFormat": "Memory Usage %"
          },
          {
            "expr": "100 - (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]) * 100)",
            "legendFormat": "CPU Usage %"
          }
        ]
      },
      {
        "id": 5,
        "title": "WebSocket Connections",
        "type": "stat",
        "targets": [
          {
            "expr": "chess_websocket_connections",
            "legendFormat": "Active Connections"
          }
        ]
      },
      {
        "id": 6,
        "title": "Game Engine Performance",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(chess_engine_request_duration_bucket[5m]))",
            "legendFormat": "95th percentile"
          },
          {
            "expr": "histogram_quantile(0.50, rate(chess_engine_request_duration_bucket[5m]))",
            "legendFormat": "50th percentile"
          }
        ]
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "5s"
  }
}

Uptime Monitoring

Health Check System

// backend/monitoring/HealthChecker.js
/**
 * Extracts a printable message from whatever value a check rejected with.
 *
 * @param {*} error - The thrown/rejected value (not necessarily an Error).
 * @returns {string} `error.message` when it is a string, otherwise `String(error)`.
 */
function healthCheckErrorMessage(error) {
  if (error && typeof error.message === 'string') {
    return error.message;
  }
  return String(error);
}

/**
 * Registers named health checks, runs each one against a timeout, and
 * aggregates the outcomes into an overall `healthy` / `degraded` /
 * `unhealthy` status.
 */
class HealthChecker {
  constructor() {
    // name -> { function, timeout, critical, interval }
    this.checks = new Map();
    // name -> outcome object produced by the most recent runAllChecks()
    this.results = new Map();
    // Handle of the periodic timer created by start(); null while stopped.
    this.interval = null;
  }

  /**
   * Registers (or replaces) a health check under `name`.
   *
   * @param {string} name - Key used in `checks` and in the reported results.
   * @param {Function} checkFunction - Called with no arguments; its (awaited)
   *   return value becomes the check's `result`. Throwing/rejecting marks the
   *   check unhealthy.
   * @param {Object} [options]
   * @param {number} [options.timeout=5000] - Milliseconds before the check is
   *   treated as failed. Falsy values (including 0) fall back to the default.
   * @param {boolean} [options.critical=false] - Whether a failure makes the
   *   overall status `unhealthy` rather than `degraded`.
   * @param {number} [options.interval=30000] - Stored with the check; the
   *   scheduler in start() does not read it.
   */
  addCheck(name, checkFunction, options = {}) {
    this.checks.set(name, {
      function: checkFunction,
      timeout: options.timeout || 5000,
      critical: options.critical || false,
      interval: options.interval || 30000
    });
  }

  /**
   * Runs every registered check concurrently, stores the outcomes in
   * `this.results`, and returns the aggregated status.
   *
   * @returns {Promise<Object>} The object built by getHealthStatus().
   */
  async runAllChecks() {
    // Snapshot once so each outcome is paired with its name by position
    // without rebuilding the key list on every iteration.
    const entries = Array.from(this.checks);

    const settled = await Promise.allSettled(
      entries.map(([name, check]) => this.runSingleCheck(name, check))
    );

    const results = new Map();
    settled.forEach((outcome, index) => {
      const name = entries[index][0];
      if (outcome.status === 'fulfilled') {
        results.set(name, outcome.value);
      } else {
        results.set(name, {
          status: 'error',
          error: healthCheckErrorMessage(outcome.reason),
          timestamp: Date.now()
        });
      }
    });

    this.results = results;
    return this.getHealthStatus();
  }

  /**
   * Runs one check, racing it against its timeout.
   *
   * @param {string} name - Check name (not used by the implementation).
   * @param {Object} check - Entry from `this.checks`; `function` and `timeout` are read.
   * @returns {Promise<Object>} `{ status: 'healthy', duration, result, timestamp }`
   *   on success, or `{ status: 'unhealthy', duration, error, timestamp }` when
   *   the check throws, rejects, or times out.
   */
  async runSingleCheck(name, check) {
    const startTime = Date.now();
    let timer = null;

    try {
      const timeoutPromise = new Promise((_, reject) => {
        timer = setTimeout(() => reject(new Error('Health check timeout')), check.timeout);
      });

      const result = await Promise.race([
        check.function(),
        timeoutPromise
      ]);

      return {
        status: 'healthy',
        duration: Date.now() - startTime,
        result,
        timestamp: Date.now()
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        duration: Date.now() - startTime,
        error: healthCheckErrorMessage(error),
        timestamp: Date.now()
      };
    } finally {
      // Without this the timer stays pending after the check has settled and
      // keeps the event loop alive for up to `check.timeout` ms per check.
      clearTimeout(timer);
    }
  }

  /**
   * Aggregates `this.results` into an overall status.
   *
   * @returns {{status: string, timestamp: number, checks: Object}} `status` is
   *   `'unhealthy'` if any critical check is not healthy, `'degraded'` if only
   *   non-critical checks are not healthy, otherwise `'healthy'`.
   */
  getHealthStatus() {
    const overall = {
      status: 'healthy',
      timestamp: Date.now(),
      checks: Object.fromEntries(this.results)
    };

    for (const [name, result] of this.results) {
      if (result.status === 'healthy') {
        continue;
      }

      const check = this.checks.get(name);
      if (check && check.critical) {
        // A critical failure is the worst outcome; nothing can override it.
        overall.status = 'unhealthy';
        break;
      }

      overall.status = 'degraded';
    }

    return overall;
  }

  /**
   * Registers the built-in checks: `database`, `chess_engine` and
   * `filesystem` as critical; `redis` and `external_api` as non-critical.
   * A probe that resolves to a falsy value is reported as a failure instead
   * of a healthy check carrying `false`.
   */
  setupDefaultChecks() {
    const register = (name, field, probe, critical, failureMessage) => {
      this.addCheck(name, async () => {
        const result = await probe();
        if (!result) {
          throw new Error(failureMessage);
        }
        return { [field]: result };
      }, { critical });
    };

    // Arrow wrappers look the probe up at call time, so overriding a
    // check* method on the instance or a subclass takes effect.
    register('database', 'connected', () => this.checkDatabase(), true,
      'Database is not connected');
    register('redis', 'connected', () => this.checkRedis(), false,
      'Redis is not connected');
    register('chess_engine', 'responding', () => this.checkChessEngine(), true,
      'Chess engine is not responding');
    register('external_api', 'available', () => this.checkExternalAPI(), false,
      'External API is not available');
    register('filesystem', 'writable', () => this.checkFileSystem(), true,
      'File system is not writable');
  }

  /**
   * Database connectivity probe (placeholder: always resolves to true).
   *
   * @returns {Promise<boolean>}
   * @throws {Error} Wrapping any error raised by the probe.
   */
  async checkDatabase() {
    // Implement database connectivity check
    try {
      // Example: await db.query('SELECT 1');
      return true;
    } catch (error) {
      throw new Error(`Database check failed: ${healthCheckErrorMessage(error)}`);
    }
  }

  /**
   * Redis connectivity probe (placeholder: always resolves to true).
   *
   * @returns {Promise<boolean>}
   * @throws {Error} Wrapping any error raised by the probe.
   */
  async checkRedis() {
    // Implement Redis connectivity check
    try {
      // Example: await redis.ping();
      return true;
    } catch (error) {
      throw new Error(`Redis check failed: ${healthCheckErrorMessage(error)}`);
    }
  }

  /**
   * Chess engine probe (placeholder: always resolves to true).
   *
   * @returns {Promise<boolean>}
   * @throws {Error} Wrapping any error raised by the probe.
   */
  async checkChessEngine() {
    // Implement chess engine check
    try {
      // Example: await chessEngine.validateMove('e2e4');
      return true;
    } catch (error) {
      throw new Error(`Chess engine check failed: ${healthCheckErrorMessage(error)}`);
    }
  }

  /**
   * Requests the external API's health endpoint.
   *
   * @returns {Promise<boolean>} `response.ok` of the health request.
   * @throws {Error} When the request itself fails.
   */
  async checkExternalAPI() {
    // Implement external API check
    try {
      const response = await fetch('https://api.example.com/health');
      return response.ok;
    } catch (error) {
      throw new Error(`External API check failed: ${healthCheckErrorMessage(error)}`);
    }
  }

  /**
   * Verifies the temp directory is writable by creating and removing a file.
   *
   * @returns {Promise<boolean>} true when the write and delete both succeed.
   * @throws {Error} When writing or deleting the probe file fails.
   */
  async checkFileSystem() {
    // Implement file system check
    try {
      const fs = require('fs').promises;
      const os = require('os');
      const path = require('path');
      // Per-process, per-run name in the platform temp dir: a fixed path can
      // be deleted by another process between this writeFile and unlink.
      const testFile = path.join(
        os.tmpdir(),
        `health-check-${process.pid}-${Date.now()}`
      );
      await fs.writeFile(testFile, 'test');
      await fs.unlink(testFile);
      return true;
    } catch (error) {
      throw new Error(`File system check failed: ${healthCheckErrorMessage(error)}`);
    }
  }

  /**
   * Registers the default checks, runs them once immediately, then re-runs
   * them periodically. Calling start() while already running is a no-op, so
   * an earlier interval is never orphaned.
   *
   * @param {number} [intervalMs=30000] - Milliseconds between runs.
   */
  start(intervalMs = 30000) {
    if (this.interval) {
      return;
    }

    this.setupDefaultChecks();

    // Attach a rejection handler so a failed run is logged instead of
    // surfacing as an unhandled rejection.
    const runSafely = () => {
      this.runAllChecks().catch((error) => {
        console.error('Health check run failed:', healthCheckErrorMessage(error));
      });
    };

    // Run checks immediately
    runSafely();

    // Set up periodic checks
    this.interval = setInterval(runSafely, intervalMs);
  }

  /**
   * Stops the periodic runs started by start(); safe to call when stopped.
   */
  stop() {
    if (this.interval) {
      clearInterval(this.interval);
      this.interval = null;
    }
  }
}

module.exports = HealthChecker;

Next Steps

  • Troubleshooting - Monitoring-based troubleshooting guide
  • Security - Security monitoring and incident response
  • Performance - Performance optimization based on monitoring data