# Production Deployment
This guide provides comprehensive instructions for deploying RUV-FANN (a Rust implementation of the FANN, Fast Artificial Neural Network, library) in production environments, focusing on enterprise-grade scalability, reliability, and performance.
## Table of Contents

- Deployment Architecture
- Container Orchestration
- Load Balancing
- High Availability
- Disaster Recovery
- Performance Tuning
- Security Considerations
- Monitoring and Observability
- Scaling Strategies
- Troubleshooting
## Deployment Architecture

```
┌─────────────────────────────────────────────────────────────────┐
│                       Load Balancer Layer                       │
│  ┌─────────────────┐  ┌─────────────────┐  ┌─────────────────┐  │
│  │     ALB/NLB     │  │   CloudFlare    │  │     HAProxy     │  │
│  └─────────────────┘  └─────────────────┘  └─────────────────┘  │
└─────────────────────────────────────────────────────────────────┘
                                │
┌─────────────────────────────────────────────────────────────────┐
│                        Application Layer                        │
│  ┌─────────────────┐  ┌─────────────────┐  ┌─────────────────┐  │
│  │  RUV-FANN API   │  │  RUV-FANN API   │  │  RUV-FANN API   │  │
│  │   Instance 1    │  │   Instance 2    │  │   Instance N    │  │
│  └─────────────────┘  └─────────────────┘  └─────────────────┘  │
└─────────────────────────────────────────────────────────────────┘
                                │
┌─────────────────────────────────────────────────────────────────┐
│                           Data Layer                            │
│  ┌─────────────────┐  ┌─────────────────┐  ┌─────────────────┐  │
│  │   PostgreSQL    │  │      Redis      │  │    S3/MinIO     │  │
│  │     Cluster     │  │      Cache      │  │  Object Store   │  │
│  └─────────────────┘  └─────────────────┘  └─────────────────┘  │
└─────────────────────────────────────────────────────────────────┘
```
### Core Components

- **API Gateway**: request routing, rate limiting, authentication
- **Application Servers**: RUV-FANN compute instances
- **Database Cluster**: primary/replica PostgreSQL setup
- **Cache Layer**: Redis cluster for session and computational caching
- **Object Storage**: model artifacts, training data, backups
- **Message Queue**: RabbitMQ/Apache Kafka for async processing
- **Monitoring Stack**: Prometheus, Grafana, Jaeger
- **Log Aggregation**: ELK Stack (Elasticsearch, Logstash, Kibana)
- **Secret Management**: HashiCorp Vault or AWS Secrets Manager
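As a rough sketch of how an application instance might wire up these components at startup, consider the following; `OBJECT_STORE_URL` and `BROKER_URL` are hypothetical variable names (only `DATABASE_URL` and `REDIS_URL` appear in the manifests below):

```rust
// Hypothetical sketch: loading the component endpoints above from the
// environment at startup. Field names and variables are illustrative,
// not part of the RUV-FANN API.
use std::env;

#[derive(Debug)]
pub struct InfraConfig {
    pub database_url: String,     // PostgreSQL cluster (primary endpoint)
    pub redis_url: String,        // Redis cache layer
    pub object_store_url: String, // S3/MinIO for model artifacts
    pub broker_url: String,       // RabbitMQ/Kafka for async processing
}

impl InfraConfig {
    pub fn from_env() -> Result<Self, env::VarError> {
        Ok(Self {
            database_url: env::var("DATABASE_URL")?,
            redis_url: env::var("REDIS_URL")?,
            object_store_url: env::var("OBJECT_STORE_URL")?,
            broker_url: env::var("BROKER_URL")?,
        })
    }
}
```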
## Container Orchestration

### Kubernetes Manifests

```yaml
apiVersion: v1
kind: Namespace
metadata:
  name: ruv-fann-prod
  labels:
    env: production
    app: ruv-fann
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ruv-fann-api
  namespace: ruv-fann-prod
spec:
  replicas: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  selector:
    matchLabels:
      app: ruv-fann-api
  template:
    metadata:
      labels:
        app: ruv-fann-api
        version: "1.0.0"
    spec:
      containers:
        - name: ruv-fann
          image: ruv-fann:latest
          ports:
            - containerPort: 8080
              name: http
            - containerPort: 9090
              name: metrics
          env:
            - name: DATABASE_URL
              valueFrom:
                secretKeyRef:
                  name: ruv-fann-secrets
                  key: database-url
            - name: REDIS_URL
              valueFrom:
                secretKeyRef:
                  name: ruv-fann-secrets
                  key: redis-url
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
            limits:
              memory: "4Gi"
              cpu: "2000m"
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: ruv-fann-service
  namespace: ruv-fann-prod
spec:
  selector:
    app: ruv-fann-api
  ports:
    - name: http
      port: 80
      targetPort: 8080
    - name: metrics
      port: 9090
      targetPort: 9090
  type: ClusterIP
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ruv-fann-hpa
  namespace: ruv-fann-prod
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ruv-fann-api
  minReplicas: 3
  maxReplicas: 50
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
```
### Docker Image

```dockerfile
# Build stage
FROM rust:1.75 AS builder
WORKDIR /app
COPY . .
RUN cargo build --release --bin ruv-fann

# Runtime stage
FROM debian:bookworm-slim
RUN apt-get update && apt-get install -y \
    ca-certificates \
    libssl3 \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY --from=builder /app/target/release/ruv-fann /usr/local/bin/
COPY --from=builder /app/config /app/config
EXPOSE 8080 9090
USER 1000:1000
CMD ["ruv-fann"]
```
### Docker Compose (Local Development)

```yaml
version: '3.8'

services:
  ruv-fann:
    build: .
    ports:
      - "8080:8080"
      - "9090:9090"
    environment:
      # Hardcoded credentials are for local development only
      - DATABASE_URL=postgresql://user:pass@postgres:5432/ruv_fann
      - REDIS_URL=redis://redis:6379
    depends_on:
      - postgres
      - redis

  postgres:
    image: postgres:15
    environment:
      POSTGRES_DB: ruv_fann
      POSTGRES_USER: user
      POSTGRES_PASSWORD: pass
    volumes:
      - postgres_data:/var/lib/postgresql/data

  redis:
    image: redis:7-alpine
    volumes:
      - redis_data:/data

volumes:
  postgres_data:
  redis_data:
```
## Load Balancing

### AWS Network Load Balancer

```yaml
apiVersion: v1
kind: Service
metadata:
  name: ruv-fann-alb
  namespace: ruv-fann-prod
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
    service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp"
    service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true"
spec:
  type: LoadBalancer
  selector:
    app: ruv-fann-api
  ports:
    - port: 80
      targetPort: 8080
      protocol: TCP
```
### HAProxy Configuration

```
global
    daemon
    maxconn 4096
    log stdout local0

defaults
    mode http
    timeout connect 5000ms
    timeout client 50000ms
    timeout server 50000ms
    option httplog
    option dontlognull

frontend ruv_fann_frontend
    bind *:80
    bind *:443 ssl crt /etc/ssl/certs/ruv-fann.pem
    redirect scheme https if !{ ssl_fc }

    # Health check endpoint
    acl health_check path_beg /health
    use_backend health_backend if health_check
    default_backend ruv_fann_backend

backend ruv_fann_backend
    balance roundrobin
    option httpchk GET /health
    http-check expect status 200
    server app1 ruv-fann-1:8080 check
    server app2 ruv-fann-2:8080 check
    server app3 ruv-fann-3:8080 check

backend health_backend
    server health 127.0.0.1:8080 check
```
### NGINX Configuration

```nginx
upstream ruv_fann_backend {
    least_conn;
    server ruv-fann-1:8080 weight=1 max_fails=3 fail_timeout=30s;
    server ruv-fann-2:8080 weight=1 max_fails=3 fail_timeout=30s;
    server ruv-fann-3:8080 weight=1 max_fails=3 fail_timeout=30s;
}

server {
    listen 80;
    listen 443 ssl http2;
    server_name ruv-fann.example.com;

    ssl_certificate /etc/ssl/certs/ruv-fann.crt;
    ssl_certificate_key /etc/ssl/private/ruv-fann.key;

    # Security headers
    add_header X-Frame-Options DENY;
    add_header X-Content-Type-Options nosniff;
    add_header X-XSS-Protection "1; mode=block";

    location / {
        proxy_pass http://ruv_fann_backend;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Timeouts
        proxy_connect_timeout 30s;
        proxy_send_timeout 30s;
        proxy_read_timeout 30s;
    }

    location /health {
        access_log off;
        proxy_pass http://ruv_fann_backend;
    }
}
```
## High Availability

### Primary Region

- 3 Availability Zones
- Application instances across all AZs
- Database primary in AZ-1, replicas in AZ-2 and AZ-3
- Redis cluster with sharding

### Secondary Region (Standby)

- 2 Availability Zones
- Read replicas for the database
- Standby application instances (auto-scaling disabled)
- Cross-region backup replication
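Region failover is normally handled at the DNS or load-balancer layer; purely as an illustration of the standby pattern described above, a client-side read path might retry against the secondary region when the primary is unreachable (endpoints here are hypothetical):

```rust
// Illustrative only: a naive read-path fallback from the primary region
// to the standby region. Real failover typically happens at DNS/LB level.
pub async fn query_with_region_fallback(
    primary: &str,
    standby: &str,
    path: &str,
) -> Result<String, reqwest::Error> {
    let client = reqwest::Client::new();
    match client.get(format!("{primary}{path}")).send().await {
        Ok(resp) => resp.text().await,
        // Primary unreachable: retry once against the standby region.
        Err(_) => client.get(format!("{standby}{path}")).send().await?.text().await,
    }
}
```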
### Highly Available PostgreSQL (CloudNativePG)

```yaml
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
  name: ruv-fann-postgres
  namespace: ruv-fann-prod
spec:
  instances: 3
  postgresql:
    parameters:
      max_connections: "200"
      shared_buffers: "256MB"
      effective_cache_size: "1GB"
      maintenance_work_mem: "64MB"
      checkpoint_completion_target: "0.9"
      wal_buffers: "16MB"
      default_statistics_target: "100"
      random_page_cost: "1.1"
      effective_io_concurrency: "200"
  bootstrap:
    initdb:
      database: ruv_fann
      owner: ruv_fann_user
      secret:
        name: ruv-fann-db-credentials
  storage:
    size: 100Gi
    storageClass: fast-ssd
  monitoring:
    enabled: true
    prometheusRule:
      enabled: true
```
### Redis Cluster

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: redis-cluster-config
  namespace: ruv-fann-prod
data:
  redis.conf: |
    cluster-enabled yes
    cluster-config-file nodes.conf
    cluster-node-timeout 5000
    appendonly yes
    appendfsync everysec
    save 900 1
    save 300 10
    save 60 10000
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: redis-cluster
  namespace: ruv-fann-prod
spec:
  serviceName: redis-cluster
  replicas: 6
  selector:
    matchLabels:
      app: redis-cluster
  template:
    metadata:
      labels:
        app: redis-cluster
    spec:
      containers:
        - name: redis
          image: redis:7-alpine
          command:
            - redis-server
            - /conf/redis.conf
          ports:
            - containerPort: 6379
              name: client
            - containerPort: 16379
              name: gossip
          volumeMounts:
            - name: conf
              mountPath: /conf
            - name: data
              mountPath: /data
      volumes:
        - name: conf
          configMap:
            name: redis-cluster-config
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 10Gi
```
### Application-Level Resilience (Circuit Breaker)

```rust
use circuit_breaker::CircuitBreaker;
use std::time::Duration;

// DatabasePool, QueryResult, and DatabaseError are the application's own
// database abstraction types.
pub struct ResilientDatabase {
    pool: DatabasePool,
    circuit_breaker: CircuitBreaker,
}

impl ResilientDatabase {
    pub fn new(pool: DatabasePool) -> Self {
        let circuit_breaker = CircuitBreaker::builder()
            .failure_threshold(5)                   // open after 5 consecutive failures
            .timeout(Duration::from_secs(10))       // per-call timeout
            .reset_timeout(Duration::from_secs(30)) // half-open retry window
            .build();
        Self { pool, circuit_breaker }
    }

    pub async fn execute_query(&self, query: &str) -> Result<QueryResult, DatabaseError> {
        self.circuit_breaker.call(|| async {
            self.pool.execute(query).await
        }).await
    }
}
```
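A sketch of a call site: when the breaker is open, `execute_query` fails fast, so callers can fall back to cached data instead of piling load onto a struggling database (names below are illustrative):

```rust
// Hypothetical call site for ResilientDatabase; the function name and
// query are illustrative.
async fn list_networks(db: &ResilientDatabase) -> Result<QueryResult, DatabaseError> {
    match db.execute_query("SELECT id, name FROM neural_networks LIMIT 10").await {
        Ok(rows) => Ok(rows),
        Err(e) => {
            // An open breaker returns immediately; log and let the caller
            // fall back to cached data rather than retrying in a loop.
            tracing::warn!("query rejected or failed: {e}");
            Err(e)
        }
    }
}
```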
## Disaster Recovery

### Automated Database Backups

```yaml
apiVersion: batch/v1
kind: CronJob
metadata:
  name: database-backup
  namespace: ruv-fann-prod
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: postgres-backup
              # Note: the image must also provide the aws CLI used below
              image: postgres:15
              command:
                - /bin/bash
                - -c
                - |
                  TIMESTAMP=$(date +%Y%m%d_%H%M%S)
                  pg_dump $DATABASE_URL | gzip > /backup/ruv_fann_backup_$TIMESTAMP.sql.gz
                  aws s3 cp /backup/ruv_fann_backup_$TIMESTAMP.sql.gz s3://ruv-fann-backups/
                  find /backup -name "*.sql.gz" -mtime +7 -delete
              env:
                - name: DATABASE_URL
                  valueFrom:
                    secretKeyRef:
                      name: ruv-fann-secrets
                      key: database-url
              volumeMounts:
                - name: backup-storage
                  mountPath: /backup
          volumes:
            - name: backup-storage
              emptyDir: {}
          restartPolicy: OnFailure
```
### Cross-Region Backup Replication

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: backup-replication-config
data:
  backup-script.sh: |
    #!/bin/bash
    set -e

    # Primary region backup
    BACKUP_FILE="ruv_fann_$(date +%Y%m%d_%H%M%S).sql.gz"
    pg_dump $DATABASE_URL | gzip > /tmp/$BACKUP_FILE

    # Upload to primary region
    aws s3 cp /tmp/$BACKUP_FILE s3://ruv-fann-backups-primary/

    # Replicate to secondary region
    aws s3 cp s3://ruv-fann-backups-primary/$BACKUP_FILE s3://ruv-fann-backups-secondary/

    # Verify backup integrity
    aws s3api head-object --bucket ruv-fann-backups-secondary --key $BACKUP_FILE

    # Cleanup local file
    rm /tmp/$BACKUP_FILE
    echo "Backup $BACKUP_FILE replicated successfully"
```
### Database Recovery Procedure

```bash
#!/bin/bash
# Database recovery procedure

# 1. Stop application instances
kubectl scale deployment ruv-fann-api --replicas=0 -n ruv-fann-prod

# 2. Create recovery database
kubectl exec -it postgres-primary-0 -n ruv-fann-prod -- createdb ruv_fann_recovery

# 3. Download latest backup
LATEST_BACKUP=$(aws s3 ls s3://ruv-fann-backups/ --recursive | sort | tail -n 1 | awk '{print $4}')
aws s3 cp s3://ruv-fann-backups/$LATEST_BACKUP /tmp/

# 4. Restore database
gunzip -c /tmp/$LATEST_BACKUP | kubectl exec -i postgres-primary-0 -n ruv-fann-prod -- psql ruv_fann_recovery

# 5. Validate data integrity
kubectl exec -it postgres-primary-0 -n ruv-fann-prod -- psql ruv_fann_recovery -c "SELECT COUNT(*) FROM neural_networks;"

# 6. Switch to the recovered database (echo -n avoids a trailing newline in the encoded value)
kubectl patch secret ruv-fann-secrets -n ruv-fann-prod -p '{"data":{"database-url":"'$(echo -n "postgresql://user:pass@postgres:5432/ruv_fann_recovery" | base64)'"}}'

# 7. Restart application
kubectl scale deployment ruv-fann-api --replicas=3 -n ruv-fann-prod
```
### Failover Rollout (Argo Rollouts)

```yaml
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: disaster-recovery-rollout
spec:
  strategy:
    canary:
      steps:
        - setWeight: 0
        - pause: {}  # Manual approval required
        - setWeight: 100
      trafficRouting:
        nginx:
          stableIngress: ruv-fann-primary
          annotationPrefix: nginx.ingress.kubernetes.io
          additionalIngressAnnotations:
            canary-by-header: X-Disaster-Recovery
```
### Recovery Objectives

| Component | RTO | RPO | Strategy |
|---|---|---|---|
| Application | 5 minutes | 0 | Rolling deployment |
| Database | 15 minutes | 5 minutes | Point-in-time recovery |
| Cache | 2 minutes | 15 minutes | Cluster failover |
| File Storage | 1 minute | 1 hour | Cross-region replication |
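These targets only help if they are checked continuously. A minimal sketch of an RPO check for the database tier, assuming the newest backup's timestamp is read from S3 object metadata elsewhere:

```rust
// Sketch: flag an RPO breach for the database tier (target: 5 minutes
// per the table above). `last_backup_at` would come from S3 object
// metadata in practice.
use std::time::{Duration, SystemTime};

pub fn rpo_breached(last_backup_at: SystemTime, rpo: Duration) -> bool {
    SystemTime::now()
        .duration_since(last_backup_at)
        .map(|age| age > rpo)
        .unwrap_or(false) // clock skew: treat a future timestamp as healthy
}
```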
## Performance Tuning

### Database Connection Pooling

```rust
use sqlx::{Pool, Postgres};
use std::time::Duration;

pub async fn create_database_pool(database_url: &str) -> Pool<Postgres> {
    sqlx::postgres::PgPoolOptions::new()
        .max_connections(20)
        .min_connections(5)
        .acquire_timeout(Duration::from_secs(30))
        .idle_timeout(Duration::from_secs(600))
        .max_lifetime(Duration::from_secs(1800))
        .connect(database_url)
        .await
        .expect("Failed to create database pool")
}
```
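A typical pattern is to build the pool once at startup and share the handle:

```rust
// Usage sketch: construct the pool once; sqlx pool handles are internally
// reference-counted and cheap to clone into request handlers.
async fn init_db() -> Pool<Postgres> {
    let url = std::env::var("DATABASE_URL").expect("DATABASE_URL must be set");
    create_database_pool(&url).await
}
```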
### Caching Strategy

```rust
use std::future::Future;
use std::time::Duration;

use redis::aio::ConnectionManager;
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
pub struct CacheConfig {
    pub ttl: Duration,
    pub max_size: usize,
}

// CacheError and the get/set helpers are defined elsewhere in the codebase.
pub struct CacheManager {
    redis: ConnectionManager,
    config: CacheConfig,
}

impl CacheManager {
    /// Cache-aside: return the cached value if present, otherwise compute,
    /// store with the configured TTL, and return the fresh value.
    pub async fn get_or_compute<T, F, Fut>(&self, key: &str, compute: F) -> Result<T, CacheError>
    where
        T: Serialize + for<'de> Deserialize<'de>,
        F: FnOnce() -> Fut,
        Fut: Future<Output = Result<T, CacheError>>,
    {
        // Try cache first
        if let Ok(cached) = self.get(key).await {
            return Ok(cached);
        }

        // Compute and cache
        let result = compute().await?;
        self.set(key, &result, self.config.ttl).await?;
        Ok(result)
    }
}
```
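Call sites wrap the database fetch in the closure so it only runs on a miss; `fetch_networks_from_db` below is a hypothetical stand-in:

```rust
// Cache-aside lookup sketch: the closure only runs on a cache miss.
async fn cached_networks(cache: &CacheManager, user_id: u64) -> Result<Vec<String>, CacheError> {
    cache
        .get_or_compute(&format!("user:{user_id}:networks"), || async move {
            fetch_networks_from_db(user_id).await // hypothetical query helper
        })
        .await
}
```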
### Database Indexing and Query Optimization

```sql
-- Neural network lookup indexes
CREATE INDEX CONCURRENTLY idx_neural_networks_user_id ON neural_networks(user_id);
CREATE INDEX CONCURRENTLY idx_neural_networks_created_at ON neural_networks(created_at DESC);
CREATE INDEX CONCURRENTLY idx_neural_networks_status ON neural_networks(status) WHERE status IN ('training', 'ready');

-- Training data indexes
CREATE INDEX CONCURRENTLY idx_training_data_network_id ON training_data(network_id);
CREATE INDEX CONCURRENTLY idx_training_data_timestamp ON training_data(timestamp DESC);

-- Composite indexes for common queries
CREATE INDEX CONCURRENTLY idx_nn_user_status ON neural_networks(user_id, status) INCLUDE (name, created_at);

-- Optimized query with proper joins and filtering
EXPLAIN (ANALYZE, BUFFERS)
SELECT
    nn.id,
    nn.name,
    nn.status,
    COUNT(td.id) AS training_samples
FROM neural_networks nn
LEFT JOIN training_data td ON nn.id = td.network_id
WHERE nn.user_id = $1
  AND nn.status = 'ready'
  AND nn.created_at >= $2
GROUP BY nn.id, nn.name, nn.status
ORDER BY nn.created_at DESC
LIMIT 10;
```
### PostgreSQL Server Parameters

```toml
# PostgreSQL configuration
[database]
max_connections = 200
shared_buffers = "512MB"
effective_cache_size = "2GB"
maintenance_work_mem = "128MB"
checkpoint_completion_target = 0.9
wal_buffers = "32MB"
default_statistics_target = 100
random_page_cost = 1.1
effective_io_concurrency = 300
work_mem = "8MB"
huge_pages = "try"
```
### Memory and Cache Optimization

```rust
use std::num::NonZeroUsize;
use std::sync::Arc;

use lru::LruCache;
use tokio::sync::RwLock;

pub struct OptimizedNeuralNetwork {
    // Arc for cheap shared ownership across tasks
    weights: Arc<Vec<f32>>,
    // RwLock allows many concurrent readers on the hot path
    cache: Arc<RwLock<LruCache<String, Vec<f32>>>>,
    // Shared connection pool
    pool: Arc<DatabasePool>,
}

impl OptimizedNeuralNetwork {
    pub fn new(capacity: usize, pool: Arc<DatabasePool>) -> Self {
        Self {
            weights: Arc::new(Vec::with_capacity(capacity)),
            // lru >= 0.8 requires a NonZeroUsize capacity
            cache: Arc::new(RwLock::new(LruCache::new(
                NonZeroUsize::new(1000).expect("capacity must be non-zero"),
            ))),
            pool,
        }
    }

    pub async fn predict(&self, input: &[f32]) -> Result<Vec<f32>, PredictionError> {
        let cache_key = format!("prediction_{}", hash_input(input));

        // Check cache first (read lock only; peek avoids reordering the LRU)
        {
            let cache = self.cache.read().await;
            if let Some(result) = cache.peek(&cache_key) {
                return Ok(result.clone());
            }
        }

        // Compute prediction
        let result = self.compute_prediction(input).await?;

        // Update cache under a short-lived write lock
        {
            let mut cache = self.cache.write().await;
            cache.put(cache_key, result.clone());
        }
        Ok(result)
    }
}
```
### Parallel Batch Processing

```rust
use rayon::prelude::*;

pub struct ParallelProcessor {
    thread_pool: rayon::ThreadPool,
}

impl ParallelProcessor {
    pub fn new(threads: usize) -> Self {
        let thread_pool = rayon::ThreadPoolBuilder::new()
            .num_threads(threads)
            .thread_name(|i| format!("ruv-fann-worker-{}", i))
            .build()
            .expect("Failed to create thread pool");
        Self { thread_pool }
    }

    pub fn process_batch(&self, data: Vec<InputData>) -> Vec<OutputData> {
        // par_iter fans the batch out across the dedicated pool;
        // process_single (defined elsewhere) handles one input.
        self.thread_pool.install(|| {
            data.par_iter()
                .map(|input| self.process_single(input))
                .collect()
        })
    }
}
```
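A caller might size the pool from the host's available parallelism; `run_batch` below is an illustrative wrapper, not part of the API:

```rust
// Usage sketch: size the pool to the host and keep batches large enough
// to amortize scheduling overhead.
fn run_batch(inputs: Vec<InputData>) -> Vec<OutputData> {
    let threads = std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(4); // fall back if the count is unavailable
    let processor = ParallelProcessor::new(threads);
    processor.process_batch(inputs)
}
```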
## Security Considerations

### Network Policies

```yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: ruv-fann-network-policy
  namespace: ruv-fann-prod
spec:
  podSelector:
    matchLabels:
      app: ruv-fann-api
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              name: ingress-nginx
      ports:
        - protocol: TCP
          port: 8080
  egress:
    - to:
        - namespaceSelector:
            matchLabels:
              name: postgres
      ports:
        - protocol: TCP
          port: 5432
    - to:
        - namespaceSelector:
            matchLabels:
              name: redis
      ports:
        - protocol: TCP
          port: 6379
```
### Secret Management (Sealed Secrets)

```yaml
apiVersion: bitnami.com/v1alpha1
kind: SealedSecret
metadata:
  name: ruv-fann-secrets
  namespace: ruv-fann-prod
spec:
  encryptedData:
    database-url: AgBy3i4OJSWK+PiTySYZZA9rO43cGDEQAx...
    redis-url: AgBy3i4OJSWK+PiTySYZZA9rO43cGDEQAx...
    jwt-secret: AgBy3i4OJSWK+PiTySYZZA9rO43cGDEQAx...
    api-key: AgBy3i4OJSWK+PiTySYZZA9rO43cGDEQAx...
  template:
    metadata:
      name: ruv-fann-secrets
      namespace: ruv-fann-prod
```
### Container Image Scanning

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: security-scan
spec:
  containers:
    - name: trivy-scanner
      image: aquasec/trivy:latest
      command:
        - trivy
        - image
        - --exit-code
        - "1"
        - --severity
        - HIGH,CRITICAL
        - ruv-fann:latest
```
## Monitoring and Observability

### Prometheus Configuration

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s

    rule_files:
      - "ruv_fann_rules.yml"

    scrape_configs:
      - job_name: 'ruv-fann'
        static_configs:
          - targets: ['ruv-fann-service:9090']
        metrics_path: /metrics
        scrape_interval: 10s

      - job_name: 'postgres-exporter'
        static_configs:
          - targets: ['postgres-exporter:9187']

      - job_name: 'redis-exporter'
        static_configs:
          - targets: ['redis-exporter:9121']
```
### Application Metrics

```rust
use prometheus::{Counter, Histogram, IntGauge, Registry};
use std::sync::Arc;

pub struct Metrics {
    pub requests_total: Counter,
    pub request_duration: Histogram,
    pub active_connections: IntGauge,
    pub neural_networks_total: IntGauge,
}

impl Metrics {
    pub fn new(registry: &Registry) -> Arc<Self> {
        let metrics = Arc::new(Self {
            requests_total: Counter::new(
                "ruv_fann_requests_total",
                "Total number of requests",
            ).unwrap(),
            request_duration: Histogram::with_opts(
                prometheus::HistogramOpts::new(
                    "ruv_fann_request_duration_seconds",
                    "Request duration in seconds",
                ).buckets(vec![0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]),
            ).unwrap(),
            active_connections: IntGauge::new(
                "ruv_fann_active_connections",
                "Number of active database connections",
            ).unwrap(),
            neural_networks_total: IntGauge::new(
                "ruv_fann_neural_networks_total",
                "Total number of neural networks",
            ).unwrap(),
        });

        registry.register(Box::new(metrics.requests_total.clone())).unwrap();
        registry.register(Box::new(metrics.request_duration.clone())).unwrap();
        registry.register(Box::new(metrics.active_connections.clone())).unwrap();
        registry.register(Box::new(metrics.neural_networks_total.clone())).unwrap();

        metrics
    }
}
```
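In handlers, the counter is incremented per request and the histogram is driven by the prometheus crate's drop-based timer:

```rust
// Sketch of instrumenting a handler with the Metrics struct above.
pub async fn handle_predict(metrics: &Metrics) {
    metrics.requests_total.inc();                         // count every request
    let _timer = metrics.request_duration.start_timer();  // observes on drop
    // ... perform the prediction work here ...
}
```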
### Structured Logging and Tracing

```rust
use tracing::{error, info, instrument};
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};

pub fn init_logging() {
    tracing_subscriber::registry()
        .with(tracing_subscriber::EnvFilter::new(
            std::env::var("RUST_LOG")
                .unwrap_or_else(|_| "ruv_fann=info".into()),
        ))
        .with(tracing_subscriber::fmt::layer().json())
        .init();
}

#[instrument(fields(user_id = %user_id, model_id = %model_id))]
pub async fn train_model(user_id: u64, model_id: u64) -> Result<(), TrainingError> {
    info!("Starting model training");

    match perform_training(user_id, model_id).await {
        Ok(()) => {
            info!("Model training completed successfully");
            Ok(())
        }
        Err(e) => {
            error!("Model training failed: {}", e);
            Err(e)
        }
    }
}
```
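The subscriber must be installed before any spans or events are emitted; a minimal sketch:

```rust
// Sketch: install the logging subscriber first thing in main.
// RUST_LOG=ruv_fann=debug raises verbosity via the EnvFilter above.
#[tokio::main]
async fn main() {
    init_logging();
    tracing::info!("ruv-fann starting");
    // ... build pools, router, and serve ...
}
```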
### Alerting Rules

```yaml
groups:
  - name: ruv-fann.rules
    rules:
      # Assumes requests_total carries a status label with the HTTP code
      - alert: HighErrorRate
        expr: (rate(ruv_fann_requests_total{status=~"5.."}[5m]) / rate(ruv_fann_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} for the last 5 minutes"

      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(ruv_fann_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "95th percentile latency is {{ $value }}s"

      - alert: DatabaseConnectionsHigh
        expr: ruv_fann_active_connections > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High number of database connections"
          description: "{{ $value }} active connections"
```
## Scaling Strategies

### Horizontal Pod Autoscaling

```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ruv-fann-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ruv-fann-api
  minReplicas: 3
  maxReplicas: 100
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
    - type: Pods
      pods:
        metric:
          name: requests_per_second
        target:
          type: AverageValue
          averageValue: "1000"
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Percent
          value: 50
          periodSeconds: 60
```
### Vertical Pod Autoscaling

```yaml
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: ruv-fann-vpa
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ruv-fann-api
  updatePolicy:
    updateMode: "Auto"
  resourcePolicy:
    containerPolicies:
      - containerName: ruv-fann
        maxAllowed:
          cpu: 4
          memory: 8Gi
        minAllowed:
          cpu: 100m
          memory: 512Mi
        controlledResources: ["cpu", "memory"]
```
## Troubleshooting

### Database Connection Issues

```bash
# Check database connectivity from inside a pod
kubectl exec -it ruv-fann-api-xxx -- sh
nc -zv postgres-service 5432

# Check connection pool status
curl http://localhost:8080/debug/pool-status

# Reset connection pool
curl -X POST http://localhost:8080/debug/reset-pool
```

### Memory Issues

```bash
# Check memory usage
kubectl top pods -n ruv-fann-prod

# Generate heap dump
curl -X POST http://localhost:8080/debug/heap-dump > heap.dump

# Analyze with profiling tools
cargo flamegraph --bin ruv-fann
```

### Performance Issues

```bash
# Check CPU usage
kubectl top pods -n ruv-fann-prod --sort-by=cpu

# Check database slow queries (requires pg_stat_statements;
# the column is mean_exec_time on PostgreSQL 13+)
kubectl exec -it postgres-primary-0 -- psql -c "
SELECT query, mean_exec_time, calls
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;"

# Check Redis performance
kubectl exec -it redis-0 -- redis-cli --latency-history
```
### Comprehensive Health Checks

```rust
use axum::{extract::State, http::StatusCode, Json};
use serde_json::{json, Value};

pub async fn health_check(
    State(app_state): State<AppState>,
) -> Result<Json<Value>, (StatusCode, Json<Value>)> {
    let mut health_status = json!({
        "status": "healthy",
        "timestamp": chrono::Utc::now(),
        "version": env!("CARGO_PKG_VERSION")
    });

    // Check database
    match app_state.db.execute("SELECT 1").await {
        Ok(_) => {
            health_status["database"] = json!("healthy");
        }
        Err(e) => {
            health_status["database"] = json!(format!("unhealthy: {}", e));
            health_status["status"] = json!("unhealthy");
        }
    }

    // Check Redis
    match app_state.redis.ping().await {
        Ok(_) => {
            health_status["redis"] = json!("healthy");
        }
        Err(e) => {
            health_status["redis"] = json!(format!("unhealthy: {}", e));
            health_status["status"] = json!("unhealthy");
        }
    }

    // Check external services
    health_status["external_services"] = check_external_services().await;

    if health_status["status"] == "healthy" {
        Ok(Json(health_status))
    } else {
        Err((StatusCode::SERVICE_UNAVAILABLE, Json(health_status)))
    }
}
```
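Mounting the handler so its path matches the Deployment's probes might look like this (sketch; `AppState` construction elided):

```rust
use axum::{routing::get, Router};

// Sketch: mount the health endpoint; /ready would be wired the same way.
fn app(state: AppState) -> Router {
    Router::new()
        .route("/health", get(health_check))
        .with_state(state)
}
```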
### Emergency Procedures

**Graceful degradation:**

```bash
# Disable a dependency via a circuit-breaker annotation
kubectl patch deployment ruv-fann-api -p '{"spec":{"template":{"metadata":{"annotations":{"circuit-breaker/database":"open"}}}}}'

# Enable maintenance mode
kubectl patch configmap ruv-fann-config -p '{"data":{"maintenance_mode":"true"}}'

# Scale down to minimum
kubectl scale deployment ruv-fann-api --replicas=1
```

**Database emergencies:**

```bash
# Switch the database to read-only mode
kubectl exec -it postgres-primary-0 -- psql -c "ALTER SYSTEM SET default_transaction_read_only = on;"
kubectl exec -it postgres-primary-0 -- psql -c "SELECT pg_reload_conf();"

# Promote read replica
kubectl patch postgresql postgres-cluster --type merge -p '{"spec":{"instances":1}}'

# Emergency backup
kubectl exec postgres-primary-0 -- pg_dump ruv_fann | gzip > emergency_backup_$(date +%Y%m%d_%H%M%S).sql.gz
```
This guide provides enterprise-grade strategies for deploying, scaling, and maintaining RUV-FANN in production. Regularly reviewing and testing these procedures is essential to maintaining system reliability and performance.