How To Debug - ruvnet/ruv-FANN GitHub Wiki
A comprehensive troubleshooting and debugging guide for ruv-FANN neural networks, swarm coordination, and system integration issues.
ruv-FANN debugging involves multiple layers:
- Neural Network Debugging: Training convergence, gradient issues, architecture problems
- Swarm Coordination Debugging: Agent communication, task distribution, topology issues
- WASM Integration Debugging: WebAssembly compilation and runtime issues
- System Integration Debugging: Performance bottlenecks, memory leaks, CPU issues
- Production Debugging: Deployment issues, scaling problems, monitoring failures
```rust
use ruv_fann::prelude::*;
use micro_core::{RootVector, RootSpace};
pub struct GradientDiagnostics {
gradient_history: Vec<Vec<f32>>,
gradient_norms: Vec<f32>,
layer_activations: Vec<Vec<f32>>,
}
impl GradientDiagnostics {
pub fn new() -> Self {
Self {
gradient_history: Vec::new(),
gradient_norms: Vec::new(),
layer_activations: Vec::new(),
}
}
pub fn diagnose_gradient_flow(&mut self, network: &NeuralNetwork) -> GradientIssues {
let mut issues = GradientIssues::new();
// Check gradient magnitudes
for (layer_idx, gradients) in network.layer_gradients().iter().enumerate() {
let grad_norm: f32 = gradients.iter().map(|g| g * g).sum::<f32>().sqrt();
self.gradient_norms.push(grad_norm);
// Detect vanishing gradients
if grad_norm < 1e-7 {
issues.vanishing_gradients.push(VanishingGradient {
layer: layer_idx,
norm: grad_norm,
severity: if grad_norm < 1e-10 { Severity::Critical } else { Severity::High },
});
println!("๐ด Vanishing gradient detected in layer {}: norm = {:.2e}", layer_idx, grad_norm);
}
// Detect exploding gradients
if grad_norm > 100.0 {
issues.exploding_gradients.push(ExplodingGradient {
layer: layer_idx,
norm: grad_norm,
severity: if grad_norm > 1000.0 { Severity::Critical } else { Severity::High },
});
println!("๐ด Exploding gradient detected in layer {}: norm = {:.2e}", layer_idx, grad_norm);
}
}
// Check activation distributions
for (layer_idx, activations) in network.layer_activations().iter().enumerate() {
let mean: f32 = activations.iter().sum::<f32>() / activations.len() as f32;
let std_dev: f32 = {
let variance: f32 = activations.iter()
.map(|a| (a - mean).powi(2))
.sum::<f32>() / activations.len() as f32;
variance.sqrt()
};
// Detect dead neurons (all zeros)
let dead_neurons = activations.iter().filter(|&&a| a.abs() < 1e-6).count();
let dead_ratio = dead_neurons as f32 / activations.len() as f32;
if dead_ratio > 0.5 {
issues.dead_neurons.push(DeadNeurons {
layer: layer_idx,
dead_ratio,
total_neurons: activations.len(),
});
println!("๐ด Dead neurons detected in layer {}: {:.1}% inactive", layer_idx, dead_ratio * 100.0);
}
// Detect saturated activations
let saturated = activations.iter().filter(|&&a| a.abs() > 0.99).count();
let saturation_ratio = saturated as f32 / activations.len() as f32;
if saturation_ratio > 0.3 {
issues.saturated_activations.push(SaturatedActivations {
layer: layer_idx,
saturation_ratio,
mean_activation: mean,
std_dev,
});
println!("๐ด Saturated activations in layer {}: {:.1}% saturated", layer_idx, saturation_ratio * 100.0);
}
}
self.suggest_fixes(&issues);
issues
}
fn suggest_fixes(&self, issues: &GradientIssues) {
println!("\n๐ง Suggested Fixes:");
if !issues.vanishing_gradients.is_empty() {
println!("For vanishing gradients:");
println!(" - Use ReLU or LeakyReLU activation functions");
println!(" - Implement residual connections (skip connections)");
println!(" - Use batch normalization");
println!(" - Reduce network depth or increase learning rate");
println!(" - Consider LSTM/GRU for sequential data");
}
if !issues.exploding_gradients.is_empty() {
println!("For exploding gradients:");
println!(" - Implement gradient clipping (clip to [-1, 1] or [-5, 5])");
println!(" - Reduce learning rate");
println!(" - Use batch normalization");
println!(" - Check weight initialization (Xavier/He initialization)");
println!(" - Add L2 regularization");
}
if !issues.dead_neurons.is_empty() {
println!("For dead neurons:");
println!(" - Use LeakyReLU instead of ReLU");
println!(" - Reduce learning rate");
println!(" - Check weight initialization");
println!(" - Add batch normalization");
println!(" - Implement dropout for regularization");
}
}
}
```
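A minimal sketch of wiring these diagnostics into a training loop. The training step itself is elided, and `NeuralNetwork` is the same type assumed by the diagnostics above:

```rust
// Sketch: run gradient diagnostics once per epoch and surface any findings.
fn diagnose_each_epoch(network: &mut NeuralNetwork, epochs: usize) {
    let mut diagnostics = GradientDiagnostics::new();
    for epoch in 0..epochs {
        // ... perform one training epoch on `network` here ...
        let issues = diagnostics.diagnose_gradient_flow(network);
        if !issues.vanishing_gradients.is_empty() || !issues.exploding_gradients.is_empty() {
            println!("epoch {}: gradient problems found, see suggested fixes above", epoch);
        }
    }
}
```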
```rust
// Automated gradient clipping
pub struct GradientClipper {
max_norm: f32,
clip_method: ClipMethod,
}
#[derive(Debug, Clone)]
pub enum ClipMethod {
GlobalNorm,
LayerWise,
Value,
}
impl GradientClipper {
pub fn clip_gradients(&self, network: &mut NeuralNetwork) -> ClipResult {
match self.clip_method {
ClipMethod::GlobalNorm => self.clip_by_global_norm(network),
ClipMethod::LayerWise => self.clip_by_layer(network),
ClipMethod::Value => self.clip_by_value(network),
}
}
fn clip_by_global_norm(&self, network: &mut NeuralNetwork) -> ClipResult {
// Calculate global gradient norm
let mut global_norm = 0.0f32;
for gradients in network.layer_gradients() {
global_norm += gradients.iter().map(|g| g * g).sum::<f32>();
}
global_norm = global_norm.sqrt();
if global_norm > self.max_norm {
let scale_factor = self.max_norm / global_norm;
// Scale all gradients
for gradients in network.layer_gradients_mut() {
for gradient in gradients.iter_mut() {
*gradient *= scale_factor;
}
}
println!("๐ง Global gradient clipping applied: {:.2e} -> {:.2e}", global_norm, self.max_norm);
ClipResult::Clipped { original_norm: global_norm, clipped_norm: self.max_norm }
} else {
ClipResult::NoClipping { norm: global_norm }
}
}
}
```
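The `ClipMethod` enum also lists `LayerWise` and `Value`, which the excerpt above does not implement. A possible sketch of those two methods, in a separate `impl` block (not the project's actual implementation), could look like this:

```rust
impl GradientClipper {
    // Sketch: clamp every gradient element to [-max_norm, max_norm].
    fn clip_by_value(&self, network: &mut NeuralNetwork) -> ClipResult {
        let mut max_abs = 0.0f32;
        for gradients in network.layer_gradients() {
            for g in gradients.iter() {
                max_abs = max_abs.max(g.abs());
            }
        }
        if max_abs <= self.max_norm {
            return ClipResult::NoClipping { norm: max_abs };
        }
        for gradients in network.layer_gradients_mut() {
            for gradient in gradients.iter_mut() {
                *gradient = gradient.clamp(-self.max_norm, self.max_norm);
            }
        }
        ClipResult::Clipped { original_norm: max_abs, clipped_norm: self.max_norm }
    }

    // Sketch: apply the norm limit to each layer independently.
    fn clip_by_layer(&self, network: &mut NeuralNetwork) -> ClipResult {
        let mut worst_norm = 0.0f32;
        for gradients in network.layer_gradients_mut() {
            let norm: f32 = gradients.iter().map(|g| g * g).sum::<f32>().sqrt();
            worst_norm = worst_norm.max(norm);
            if norm > self.max_norm {
                let scale = self.max_norm / norm;
                for gradient in gradients.iter_mut() {
                    *gradient *= scale;
                }
            }
        }
        if worst_norm > self.max_norm {
            ClipResult::Clipped { original_norm: worst_norm, clipped_norm: self.max_norm }
        } else {
            ClipResult::NoClipping { norm: worst_norm }
        }
    }
}
```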
```rust
pub struct LossAnalyzer {
loss_history: Vec<f32>,
learning_rate_history: Vec<f32>,
batch_losses: Vec<Vec<f32>>,
}
impl LossAnalyzer {
pub fn analyze_training_progress(&self) -> TrainingDiagnosis {
let mut diagnosis = TrainingDiagnosis::new();
if self.loss_history.len() < 10 {
return diagnosis; // Not enough data
}
// Check for convergence
let recent_losses = &self.loss_history[self.loss_history.len()-10..];
let loss_variance: f32 = {
let mean: f32 = recent_losses.iter().sum::<f32>() / recent_losses.len() as f32;
recent_losses.iter().map(|l| (l - mean).powi(2)).sum::<f32>() / recent_losses.len() as f32
};
if loss_variance < 1e-6 {
diagnosis.issues.push(TrainingIssue::Converged);
println!("โ
Training appears to have converged (low loss variance)");
}
// Check for oscillations
let mut direction_changes = 0;
for window in self.loss_history.windows(3) {
if (window[1] > window[0] && window[1] > window[2]) ||
(window[1] < window[0] && window[1] < window[2]) {
direction_changes += 1;
}
}
let oscillation_ratio = direction_changes as f32 / (self.loss_history.len() - 2) as f32;
if oscillation_ratio > 0.3 {
diagnosis.issues.push(TrainingIssue::Oscillating { ratio: oscillation_ratio });
println!("๐ด Loss oscillation detected: {:.1}% of steps", oscillation_ratio * 100.0);
println!(" Suggested fix: Reduce learning rate by 2-5x");
}
// Check for plateau
let last_50_losses = if self.loss_history.len() > 50 {
&self.loss_history[self.loss_history.len()-50..]
} else {
&self.loss_history
};
let improvement = last_50_losses.first().unwrap() - last_50_losses.last().unwrap();
let relative_improvement = improvement / last_50_losses.first().unwrap();
if relative_improvement < 0.01 && self.loss_history.len() > 50 {
diagnosis.issues.push(TrainingIssue::Plateau { improvement: relative_improvement });
println!("๐ด Training plateau detected: {:.3}% improvement in last 50 epochs", relative_improvement * 100.0);
println!(" Suggested fixes:");
println!(" - Reduce learning rate");
println!(" - Add learning rate scheduling");
println!(" - Increase model capacity");
println!(" - Add data augmentation");
println!(" - Check for overfitting");
}
// Check loss magnitude
let current_loss = *self.loss_history.last().unwrap();
if current_loss > 10.0 {
diagnosis.issues.push(TrainingIssue::HighLoss { loss: current_loss });
println!("๐ด High loss detected: {:.2}", current_loss);
println!(" Possible causes:");
println!(" - Learning rate too high");
println!(" - Poor weight initialization");
println!(" - Data preprocessing issues");
println!(" - Wrong loss function for task");
}
diagnosis
}
pub fn plot_loss_history(&self) -> String {
if self.loss_history.is_empty() {
return "No loss history available".to_string();
}
let max_loss = *self.loss_history.iter().max_by(|a, b| a.partial_cmp(b).unwrap()).unwrap();
let min_loss = *self.loss_history.iter().min_by(|a, b| a.partial_cmp(b).unwrap()).unwrap();
let mut plot = String::new();
plot.push_str(&format!("Loss History (Range: {:.4} - {:.4})\n", min_loss, max_loss));
plot.push_str("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n");
let height = 20;
for row in 0..height {
let threshold = max_loss - (max_loss - min_loss) * row as f32 / height as f32;
plot.push('│');
for (i, &loss) in self.loss_history.iter().enumerate() {
if i % (self.loss_history.len() / 60).max(1) == 0 { // Sample points
if (loss - threshold).abs() < (max_loss - min_loss) / height as f32 {
plot.push('*');
} else if loss > threshold {
plot.push(' ');
} else {
plot.push('.');
}
}
}
plot.push_str(&format!("โ {:.4}\n", threshold));
}
plot.push_str("โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n");
plot.push_str(&format!(" 0{:>58}{}", "", self.loss_history.len()));
plot
}
}
```
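Several of the plateau and oscillation fixes above mention learning-rate scheduling. ruv-FANN's trainer API is not shown here, so the following is only a self-contained sketch of a reduce-on-plateau schedule you could adapt:

```rust
// Illustrative reduce-on-plateau scheduler; not a ruv-FANN API.
pub struct ReduceLrOnPlateau {
    factor: f32,      // e.g. 0.5 halves the learning rate
    patience: usize,  // epochs without improvement before reducing
    best_loss: f32,
    stale_epochs: usize,
}

impl ReduceLrOnPlateau {
    pub fn new(factor: f32, patience: usize) -> Self {
        Self { factor, patience, best_loss: f32::INFINITY, stale_epochs: 0 }
    }

    pub fn step(&mut self, current_loss: f32, learning_rate: &mut f32) {
        if current_loss < self.best_loss {
            self.best_loss = current_loss;
            self.stale_epochs = 0;
        } else {
            self.stale_epochs += 1;
            if self.stale_epochs >= self.patience {
                *learning_rate *= self.factor;
                self.stale_epochs = 0;
                println!("🔧 Plateau detected, learning rate reduced to {:.2e}", *learning_rate);
            }
        }
    }
}
```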
```rust
pub struct ArchitectureAnalyzer {
network: NeuralNetwork,
analysis_results: Vec<LayerAnalysis>,
}
#[derive(Debug)]
pub struct LayerAnalysis {
layer_index: usize,
layer_type: String,
input_shape: Vec<usize>,
output_shape: Vec<usize>,
parameter_count: usize,
activation_stats: ActivationStats,
weight_stats: WeightStats,
computational_cost: f64,
}
impl ArchitectureAnalyzer {
pub fn analyze_architecture(&mut self) -> ArchitectureReport {
let mut report = ArchitectureReport::new();
println!("๐ Analyzing neural network architecture...");
// Analyze each layer
for (i, layer) in self.network.layers().iter().enumerate() {
let analysis = self.analyze_layer(i, layer);
println!("Layer {}: {} ({} -> {})",
i,
analysis.layer_type,
analysis.input_shape.iter().map(|s| s.to_string()).collect::<Vec<_>>().join("×"),
analysis.output_shape.iter().map(|s| s.to_string()).collect::<Vec<_>>().join("×")
);
println!(" Parameters: {}, FLOPS: {:.0}",
analysis.parameter_count,
analysis.computational_cost
);
// Check for potential issues
if analysis.activation_stats.dead_ratio > 0.5 {
report.warnings.push(format!("Layer {}: {:.1}% dead neurons", i, analysis.activation_stats.dead_ratio * 100.0));
}
if analysis.weight_stats.gradient_norm < 1e-6 {
report.warnings.push(format!("Layer {}: Very small gradients (vanishing gradient problem)", i));
}
if analysis.parameter_count == 0 {
report.warnings.push(format!("Layer {}: No trainable parameters", i));
}
self.analysis_results.push(analysis);
}
// Overall architecture analysis
let total_params: usize = self.analysis_results.iter().map(|a| a.parameter_count).sum();
let total_flops: f64 = self.analysis_results.iter().map(|a| a.computational_cost).sum();
report.total_parameters = total_params;
report.total_flops = total_flops;
println!("\n๐ Architecture Summary:");
println!(" Total parameters: {}", format_number(total_params));
println!(" Total FLOPS: {}", format_number(total_flops as usize));
println!(" Memory usage (fp32): {:.1} MB", (total_params * 4) as f64 / 1024.0 / 1024.0);
// Detect common architecture problems
self.detect_architecture_issues(&mut report);
report
}
fn detect_architecture_issues(&self, report: &mut ArchitectureReport) {
// Check for bottlenecks
for window in self.analysis_results.windows(2) {
let prev_output = window[0].output_shape.iter().product::<usize>();
let curr_input = window[1].input_shape.iter().product::<usize>();
if curr_input < prev_output / 10 { // Dramatic reduction
report.warnings.push(format!(
"Potential bottleneck between layers {} and {}: {} -> {}",
window[0].layer_index,
window[1].layer_index,
prev_output,
curr_input
));
}
}
// Check for parameter imbalance
let param_counts: Vec<usize> = self.analysis_results.iter().map(|a| a.parameter_count).collect();
if let (Some(&max_params), Some(&min_params)) = (param_counts.iter().max(), param_counts.iter().min()) {
if max_params > min_params * 100 && min_params > 0 {
report.warnings.push(format!(
"Parameter imbalance: max layer has {}x more parameters than min layer",
max_params / min_params
));
}
}
// Check network depth
if self.analysis_results.len() > 50 {
report.warnings.push("Very deep network (>50 layers) may suffer from vanishing gradients".to_string());
}
// Check network width
let max_width = self.analysis_results.iter()
.map(|a| a.output_shape.iter().product::<usize>())
.max()
.unwrap_or(0);
if max_width > 10000 {
report.warnings.push(format!("Very wide layer ({} neurons) may cause memory issues", max_width));
}
}
fn analyze_layer(&self, index: usize, layer: &Layer) -> LayerAnalysis {
// This would be implemented based on the actual Layer structure
LayerAnalysis {
layer_index: index,
layer_type: format!("{:?}", layer.layer_type()),
input_shape: layer.input_shape().to_vec(),
output_shape: layer.output_shape().to_vec(),
parameter_count: layer.parameter_count(),
activation_stats: self.compute_activation_stats(layer),
weight_stats: self.compute_weight_stats(layer),
computational_cost: self.estimate_flops(layer),
}
}
}
fn format_number(n: usize) -> String {
if n >= 1_000_000_000 {
format!("{:.1}B", n as f64 / 1_000_000_000.0)
} else if n >= 1_000_000 {
format!("{:.1}M", n as f64 / 1_000_000.0)
} else if n >= 1_000 {
format!("{:.1}K", n as f64 / 1_000.0)
} else {
n.to_string()
}
}
```
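If `format_number` lives in the same module, a quick unit test documents the expected scaling of the summary output:

```rust
#[cfg(test)]
mod format_number_tests {
    use super::format_number;

    #[test]
    fn scales_counts_to_k_m_b() {
        assert_eq!(format_number(950), "950");
        assert_eq!(format_number(1_500), "1.5K");
        assert_eq!(format_number(2_300_000), "2.3M");
        assert_eq!(format_number(7_100_000_000), "7.1B");
    }
}
```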
```rust
use micro_swarm::{AgentId, Message, MessageType};
use std::collections::{HashMap, VecDeque};
use std::time::{Duration, Instant}; // needed for the timing fields and checks below
pub struct MessageTracer {
message_log: VecDeque<TracedMessage>,
agent_states: HashMap<AgentId, AgentState>,
communication_graph: HashMap<AgentId, Vec<AgentId>>,
max_log_size: usize,
}
#[derive(Debug, Clone)]
pub struct TracedMessage {
timestamp: std::time::Instant,
sender: AgentId,
recipient: AgentId,
message_type: MessageType,
payload_size: usize,
processing_time: Option<Duration>,
success: bool,
error: Option<String>,
}
impl MessageTracer {
pub fn new(max_log_size: usize) -> Self {
Self {
message_log: VecDeque::with_capacity(max_log_size),
agent_states: HashMap::new(),
communication_graph: HashMap::new(),
max_log_size,
}
}
pub fn trace_message(&mut self, message: &Message, processing_result: Result<(), String>) {
let traced_msg = TracedMessage {
timestamp: std::time::Instant::now(),
sender: message.sender,
recipient: message.recipient,
message_type: message.message_type.clone(),
payload_size: message.payload.len(),
processing_time: message.processing_time,
success: processing_result.is_ok(),
error: processing_result.err(),
};
// Update communication graph
self.communication_graph
.entry(message.sender)
.or_insert_with(Vec::new)
.push(message.recipient);
// Add to log (maintain size limit)
if self.message_log.len() >= self.max_log_size {
self.message_log.pop_front();
}
self.message_log.push_back(traced_msg);
}
pub fn analyze_communication_patterns(&self) -> CommunicationAnalysis {
let mut analysis = CommunicationAnalysis::new();
// Analyze message failure rates
let total_messages = self.message_log.len();
let failed_messages = self.message_log.iter().filter(|m| !m.success).count();
analysis.overall_failure_rate = if total_messages > 0 { failed_messages as f64 / total_messages as f64 } else { 0.0 }; // avoid NaN on an empty log
// Analyze per-agent failure rates
let mut agent_failures: HashMap<AgentId, (usize, usize)> = HashMap::new();
for msg in &self.message_log {
let entry = agent_failures.entry(msg.sender).or_insert((0, 0));
entry.1 += 1; // total messages
if !msg.success {
entry.0 += 1; // failed messages
}
}
for (agent_id, (failures, total)) in agent_failures {
let failure_rate = failures as f64 / total as f64;
if failure_rate > 0.1 { // More than 10% failure rate
analysis.problematic_agents.push(ProblematicAgent {
agent_id,
failure_rate,
total_messages: total,
common_errors: self.get_common_errors_for_agent(agent_id),
});
}
}
// Analyze communication delays
let processing_times: Vec<Duration> = self.message_log.iter()
.filter_map(|m| m.processing_time)
.collect();
if !processing_times.is_empty() {
let total_time: Duration = processing_times.iter().sum();
analysis.average_processing_time = total_time / processing_times.len() as u32;
let mut sorted_times = processing_times;
sorted_times.sort();
analysis.median_processing_time = sorted_times[sorted_times.len() / 2];
analysis.p95_processing_time = sorted_times[(sorted_times.len() as f64 * 0.95) as usize];
}
// Detect communication bottlenecks
self.detect_bottlenecks(&mut analysis);
println!("๐ Communication Analysis:");
println!(" Overall failure rate: {:.2}%", analysis.overall_failure_rate * 100.0);
println!(" Average processing time: {:?}", analysis.average_processing_time);
println!(" 95th percentile time: {:?}", analysis.p95_processing_time);
println!(" Problematic agents: {}", analysis.problematic_agents.len());
for agent in &analysis.problematic_agents {
println!(" Agent {:?}: {:.1}% failure rate", agent.agent_id, agent.failure_rate * 100.0);
for error in &agent.common_errors {
println!(" Common error: {}", error);
}
}
analysis
}
fn detect_bottlenecks(&self, analysis: &mut CommunicationAnalysis) {
// Find agents that receive many messages but respond slowly
let mut message_counts: HashMap<AgentId, usize> = HashMap::new();
let mut slow_responders: HashMap<AgentId, Vec<Duration>> = HashMap::new();
for msg in &self.message_log {
*message_counts.entry(msg.recipient).or_insert(0) += 1;
if let Some(processing_time) = msg.processing_time {
if processing_time > Duration::from_millis(100) { // Slow threshold
slow_responders.entry(msg.recipient)
.or_insert_with(Vec::new)
.push(processing_time);
}
}
}
for (agent_id, slow_times) in slow_responders {
let message_count = message_counts.get(&agent_id).unwrap_or(&0);
let slow_ratio = slow_times.len() as f64 / *message_count as f64;
if slow_ratio > 0.3 { // More than 30% of messages are slow
analysis.bottlenecks.push(CommunicationBottleneck {
agent_id,
slow_message_ratio: slow_ratio,
average_slow_time: slow_times.iter().sum::<Duration>() / slow_times.len() as u32,
total_messages: *message_count,
});
}
}
}
pub fn generate_communication_graph(&self) -> String {
let mut graph = String::new();
graph.push_str("digraph communication {\n");
graph.push_str(" rankdir=LR;\n");
graph.push_str(" node [shape=circle];\n");
// Add agents as nodes
for agent_id in self.agent_states.keys() {
let color = if self.is_problematic_agent(*agent_id) {
"red"
} else {
"lightblue"
};
graph.push_str(&format!(" {:?} [fillcolor={}, style=filled];\n", agent_id, color));
}
// Add communication edges
let mut edge_weights: HashMap<(AgentId, AgentId), usize> = HashMap::new();
for msg in &self.message_log {
*edge_weights.entry((msg.sender, msg.recipient)).or_insert(0) += 1;
}
for ((sender, recipient), weight) in edge_weights {
let thickness = (weight as f64 / 10.0).max(1.0).min(5.0);
graph.push_str(&format!(
" {:?} -> {:?} [label=\"{}\", penwidth={}];\n",
sender, recipient, weight, thickness
));
}
graph.push_str("}\n");
graph
}
fn is_problematic_agent(&self, agent_id: AgentId) -> bool {
let failures = self.message_log.iter()
.filter(|m| m.sender == agent_id && !m.success)
.count();
let total = self.message_log.iter()
.filter(|m| m.sender == agent_id)
.count();
if total == 0 { return false; }
failures as f64 / total as f64 > 0.1
}
}
```
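A minimal usage sketch: after messages have been traced with `trace_message`, summarize the traffic and write the DOT graph to disk so it can be rendered with Graphviz (for example `dot -Tsvg communication.dot -o communication.svg`):

```rust
// Sketch: summarize traced traffic and export the communication graph.
fn dump_communication_report(tracer: &MessageTracer) -> std::io::Result<()> {
    let _analysis = tracer.analyze_communication_patterns();
    std::fs::write("communication.dot", tracer.generate_communication_graph())
}
```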
```rust
pub struct AgentHealthMonitor {
agent_metrics: HashMap<AgentId, AgentHealthMetrics>,
health_thresholds: HealthThresholds,
monitoring_interval: Duration,
}
#[derive(Debug, Clone)]
pub struct AgentHealthMetrics {
cpu_usage: f64,
memory_usage: usize,
task_completion_rate: f64,
error_rate: f64,
response_time: Duration,
last_heartbeat: Instant,
status: AgentStatus,
}
#[derive(Debug, Clone)]
pub enum AgentStatus {
Healthy,
Degraded,
Unhealthy,
Unresponsive,
}
impl AgentHealthMonitor {
pub async fn monitor_agents(&mut self, orchestrator: &SwarmOrchestrator) {
let mut interval = tokio::time::interval(self.monitoring_interval);
loop {
interval.tick().await;
for agent_id in orchestrator.active_agents() {
let metrics = self.collect_agent_metrics(agent_id, orchestrator).await;
let health_status = self.assess_agent_health(&metrics);
// Update metrics
if let Some(existing_metrics) = self.agent_metrics.get_mut(&agent_id) {
*existing_metrics = metrics.clone();
} else {
self.agent_metrics.insert(agent_id, metrics.clone());
}
// Take action based on health status
match health_status {
AgentStatus::Unhealthy => {
println!("๐ด Agent {:?} is unhealthy", agent_id);
self.handle_unhealthy_agent(agent_id, orchestrator).await;
}
AgentStatus::Unresponsive => {
println!("๐ Agent {:?} is unresponsive", agent_id);
self.handle_unresponsive_agent(agent_id, orchestrator).await;
}
AgentStatus::Degraded => {
println!("โ ๏ธ Agent {:?} performance is degraded", agent_id);
self.handle_degraded_agent(agent_id, orchestrator).await;
}
AgentStatus::Healthy => {
// Agent is fine, no action needed
}
}
}
}
}
async fn handle_unhealthy_agent(&self, agent_id: AgentId, orchestrator: &SwarmOrchestrator) {
// Try to restart the agent
if let Err(e) = orchestrator.restart_agent(agent_id).await {
println!("Failed to restart agent {:?}: {}", agent_id, e);
// If restart fails, spawn a replacement
if let Some(agent_type) = orchestrator.get_agent_type(agent_id) {
if let Err(e) = orchestrator.spawn_agent(agent_type, "replacement").await {
println!("Failed to spawn replacement agent: {}", e);
}
}
// Remove the unhealthy agent
orchestrator.remove_agent(agent_id).await.ok();
}
}
async fn handle_unresponsive_agent(&self, agent_id: AgentId, orchestrator: &SwarmOrchestrator) {
println!("Forcefully terminating unresponsive agent {:?}", agent_id);
// Force terminate and spawn replacement
orchestrator.force_terminate_agent(agent_id).await.ok();
if let Some(agent_type) = orchestrator.get_agent_type(agent_id) {
orchestrator.spawn_agent(agent_type, "unresponsive-replacement").await.ok();
}
}
async fn handle_degraded_agent(&self, agent_id: AgentId, orchestrator: &SwarmOrchestrator) {
// Reduce task assignment to degraded agent
orchestrator.reduce_agent_load(agent_id, 0.5).await.ok();
// Monitor more frequently
println!("Monitoring degraded agent {:?} more closely", agent_id);
}
fn assess_agent_health(&self, metrics: &AgentHealthMetrics) -> AgentStatus {
let thresholds = &self.health_thresholds;
// Check if unresponsive
if metrics.last_heartbeat.elapsed() > Duration::from_secs(30) {
return AgentStatus::Unresponsive;
}
// Check if unhealthy
if metrics.error_rate > thresholds.max_error_rate ||
metrics.cpu_usage > thresholds.max_cpu_usage ||
metrics.memory_usage > thresholds.max_memory_usage ||
metrics.response_time > thresholds.max_response_time {
return AgentStatus::Unhealthy;
}
// Check if degraded
if metrics.error_rate > thresholds.degraded_error_rate ||
metrics.cpu_usage > thresholds.degraded_cpu_usage ||
metrics.task_completion_rate < thresholds.min_completion_rate {
return AgentStatus::Degraded;
}
AgentStatus::Healthy
}
pub fn print_health_report(&self) {
println!("\n๐ฅ Agent Health Report");
println!("โโโโโโโโโโโโโโโโโโโโโโโโ");
let mut healthy = 0;
let mut degraded = 0;
let mut unhealthy = 0;
let mut unresponsive = 0;
for (agent_id, metrics) in &self.agent_metrics {
match metrics.status {
AgentStatus::Healthy => healthy += 1,
AgentStatus::Degraded => degraded += 1,
AgentStatus::Unhealthy => unhealthy += 1,
AgentStatus::Unresponsive => unresponsive += 1,
}
println!("Agent {:?}: {:?}", agent_id, metrics.status);
println!(" CPU: {:.1}%, Memory: {} MB",
metrics.cpu_usage,
metrics.memory_usage / 1024 / 1024
);
println!(" Completion rate: {:.1}%, Error rate: {:.1}%",
metrics.task_completion_rate * 100.0,
metrics.error_rate * 100.0
);
println!(" Response time: {:?}", metrics.response_time);
println!();
}
println!("Summary:");
println!(" โ
Healthy: {}", healthy);
println!(" โ ๏ธ Degraded: {}", degraded);
println!(" ๐ด Unhealthy: {}", unhealthy);
println!(" ๐ Unresponsive: {}", unresponsive);
}
}
```
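`HealthThresholds` is referenced above but not shown in this excerpt. Judging from the fields that `assess_agent_health` reads, a plausible definition looks like the following; the default values are illustrative only:

```rust
use std::time::Duration;

// Hypothetical thresholds matching the fields used by assess_agent_health.
#[derive(Debug, Clone)]
pub struct HealthThresholds {
    pub max_error_rate: f64,          // e.g. 0.25 = 25% of tasks failing
    pub max_cpu_usage: f64,           // percent
    pub max_memory_usage: usize,      // bytes
    pub max_response_time: Duration,
    pub degraded_error_rate: f64,
    pub degraded_cpu_usage: f64,
    pub min_completion_rate: f64,
}

impl Default for HealthThresholds {
    fn default() -> Self {
        Self {
            max_error_rate: 0.25,
            max_cpu_usage: 95.0,
            max_memory_usage: 2 * 1024 * 1024 * 1024, // 2 GB
            max_response_time: Duration::from_secs(5),
            degraded_error_rate: 0.10,
            degraded_cpu_usage: 80.0,
            min_completion_rate: 0.75,
        }
    }
}
```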
```rust
pub struct CoordinationAnalyzer {
latency_measurements: Vec<LatencyMeasurement>,
topology_metrics: TopologyMetrics,
}
#[derive(Debug, Clone)]
pub struct LatencyMeasurement {
timestamp: Instant,
operation_type: CoordinationOperation,
latency: Duration,
success: bool,
participating_agents: Vec<AgentId>,
}
impl CoordinationAnalyzer {
pub fn analyze_coordination_performance(&self) -> CoordinationReport {
let mut report = CoordinationReport::new();
// Analyze latency patterns
let latencies: Vec<Duration> = self.latency_measurements
.iter()
.filter(|m| m.success)
.map(|m| m.latency)
.collect();
if !latencies.is_empty() {
report.average_latency = latencies.iter().sum::<Duration>() / latencies.len() as u32;
let mut sorted_latencies = latencies.clone();
sorted_latencies.sort();
report.median_latency = sorted_latencies[sorted_latencies.len() / 2];
report.p95_latency = sorted_latencies[(sorted_latencies.len() as f64 * 0.95) as usize];
report.p99_latency = sorted_latencies[(sorted_latencies.len() as f64 * 0.99) as usize];
}
// Analyze by operation type
let mut operation_stats: HashMap<CoordinationOperation, Vec<Duration>> = HashMap::new();
for measurement in &self.latency_measurements {
if measurement.success {
operation_stats.entry(measurement.operation_type.clone())
.or_insert_with(Vec::new)
.push(measurement.latency);
}
}
for (operation, latencies) in operation_stats {
let avg_latency = latencies.iter().sum::<Duration>() / latencies.len() as u32;
report.operation_latencies.insert(operation.clone(), avg_latency);
// Identify problematic operations
if avg_latency > Duration::from_millis(500) {
report.slow_operations.push(SlowOperation {
operation,
average_latency: avg_latency,
measurement_count: latencies.len(),
});
}
}
// Analyze coordination failures
let total_operations = self.latency_measurements.len();
let failed_operations = self.latency_measurements.iter().filter(|m| !m.success).count();
report.failure_rate = if total_operations > 0 { failed_operations as f64 / total_operations as f64 } else { 0.0 }; // avoid NaN when nothing was measured
// Detect coordination bottlenecks
self.detect_coordination_bottlenecks(&mut report);
println!("๐ Coordination Performance Analysis:");
println!(" Average latency: {:?}", report.average_latency);
println!(" 95th percentile: {:?}", report.p95_latency);
println!(" Failure rate: {:.2}%", report.failure_rate * 100.0);
if !report.slow_operations.is_empty() {
println!(" Slow operations:");
for slow_op in &report.slow_operations {
println!(" {:?}: {:?} ({}x measured)",
slow_op.operation,
slow_op.average_latency,
slow_op.measurement_count
);
}
}
report
}
fn detect_coordination_bottlenecks(&self, report: &mut CoordinationReport) {
// Group measurements by number of participating agents
let mut agent_count_latencies: HashMap<usize, Vec<Duration>> = HashMap::new();
for measurement in &self.latency_measurements {
if measurement.success {
agent_count_latencies
.entry(measurement.participating_agents.len())
.or_insert_with(Vec::new)
.push(measurement.latency);
}
}
// Check if latency increases dramatically with agent count
let mut sorted_by_count: Vec<_> = agent_count_latencies.iter().collect();
sorted_by_count.sort_by_key(|(count, _)| *count);
for window in sorted_by_count.windows(2) {
let (smaller_count, smaller_latencies) = window[0];
let (larger_count, larger_latencies) = window[1];
let smaller_avg = smaller_latencies.iter().sum::<Duration>() / smaller_latencies.len() as u32;
let larger_avg = larger_latencies.iter().sum::<Duration>() / larger_latencies.len() as u32;
let latency_ratio = larger_avg.as_millis() as f64 / smaller_avg.as_millis() as f64;
let count_ratio = *larger_count as f64 / *smaller_count as f64;
// If latency increases much faster than agent count, there's a bottleneck
if latency_ratio > count_ratio * 2.0 {
report.bottlenecks.push(CoordinationBottleneck {
description: format!(
"Coordination latency increases {:.1}x when going from {} to {} agents",
latency_ratio, smaller_count, larger_count
),
severity: if latency_ratio > count_ratio * 5.0 { Severity::High } else { Severity::Medium },
});
}
}
// Check for topology-specific issues
self.analyze_topology_efficiency(report);
}
fn analyze_topology_efficiency(&self, report: &mut CoordinationReport) {
// This would analyze the current topology's efficiency
// For example, in mesh topology, check if all-to-all communication is causing delays
// In hierarchical topology, check if the coordinator is becoming a bottleneck
match self.topology_metrics.topology_type {
TopologyType::Mesh => {
let agent_count = self.topology_metrics.agent_count;
let theoretical_messages = agent_count * (agent_count - 1); // O(n²) for mesh
if theoretical_messages > 100 {
report.recommendations.push(
"Consider switching to hierarchical topology to reduce O(nยฒ) communication overhead".to_string()
);
}
}
TopologyType::Hierarchical => {
// Check if coordinator is overloaded
let coordinator_messages = self.latency_measurements.iter()
.filter(|m| m.participating_agents.contains(&self.topology_metrics.coordinator_id.unwrap()))
.count();
let total_messages = self.latency_measurements.len();
let coordinator_ratio = coordinator_messages as f64 / total_messages as f64;
if coordinator_ratio > 0.7 {
report.recommendations.push(
"Coordinator is handling too many messages - consider multiple coordinators or switch topology".to_string()
);
}
}
_ => {}
}
}
pub fn suggest_optimizations(&self, report: &CoordinationReport) -> Vec<OptimizationSuggestion> {
let mut suggestions = Vec::new();
// High latency suggestions
if report.average_latency > Duration::from_millis(200) {
suggestions.push(OptimizationSuggestion {
category: OptimizationCategory::Latency,
description: "High coordination latency detected".to_string(),
actions: vec![
"Reduce message size through compression".to_string(),
"Implement message batching".to_string(),
"Optimize network buffer sizes".to_string(),
"Consider changing topology".to_string(),
],
expected_improvement: "20-50% latency reduction".to_string(),
});
}
// High failure rate suggestions
if report.failure_rate > 0.05 {
suggestions.push(OptimizationSuggestion {
category: OptimizationCategory::Reliability,
description: format!("High coordination failure rate: {:.1}%", report.failure_rate * 100.0),
actions: vec![
"Implement retry logic with exponential backoff".to_string(),
"Add circuit breaker pattern".to_string(),
"Increase timeout values".to_string(),
"Improve error handling".to_string(),
],
expected_improvement: "Reduce failure rate to <2%".to_string(),
});
}
// Bottleneck-specific suggestions
for bottleneck in &report.bottlenecks {
suggestions.push(OptimizationSuggestion {
category: OptimizationCategory::Bottleneck,
description: bottleneck.description.clone(),
actions: vec![
"Profile the coordination code path".to_string(),
"Implement asynchronous coordination".to_string(),
"Use more efficient data structures".to_string(),
"Consider topology changes".to_string(),
],
expected_improvement: "Eliminate coordination bottleneck".to_string(),
});
}
suggestions
}
}
```
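The reliability suggestions above call for retry with exponential backoff. A generic sketch using tokio (the coordination call to wrap is up to you):

```rust
// Sketch: retry an async operation, doubling the delay after each failure.
// `max_attempts` must be at least 1.
async fn retry_with_backoff<T, E, F, Fut>(mut operation: F, max_attempts: u32) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    assert!(max_attempts >= 1, "need at least one attempt");
    let mut delay = std::time::Duration::from_millis(50);
    let mut attempt = 1;
    loop {
        match operation().await {
            Ok(value) => return Ok(value),
            Err(e) if attempt >= max_attempts => return Err(e),
            Err(_) => {
                tokio::time::sleep(delay).await;
                delay *= 2;
                attempt += 1;
            }
        }
    }
}
```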
```bash
#!/bin/bash
# wasm-debug.sh - WASM debugging helper script
echo "๐ WASM Build Diagnostics"
echo "========================="
# Check Rust toolchain
echo "Checking Rust toolchain..."
rustc --version
cargo --version
# Check WASM target
echo "Checking WASM target..."
rustup target list | grep wasm32-unknown-unknown
if ! rustup target list --installed | grep -q wasm32-unknown-unknown; then
echo "โ WASM target not installed"
echo "Run: rustup target add wasm32-unknown-unknown"
exit 1
fi
# Check wasm-pack
echo "Checking wasm-pack..."
if ! command -v wasm-pack &> /dev/null; then
echo "โ wasm-pack not found"
echo "Install with: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh"
exit 1
fi
wasm-pack --version
# Check wasm-opt (for optimization)
echo "Checking wasm-opt..."
if command -v wasm-opt &> /dev/null; then
wasm-opt --version
else
echo "โ ๏ธ wasm-opt not found (optional but recommended)"
echo "Install with: npm install -g binaryen"
fi
echo "โ
WASM toolchain check complete"
# Build with verbose output
echo "Building WASM with diagnostics..."
RUST_LOG=debug wasm-pack build --target web --dev --verbose
# Check output size
if [ -f "pkg/semantic_cartan_matrix_bg.wasm" ]; then
WASM_SIZE=$(wc -c < pkg/semantic_cartan_matrix_bg.wasm)
echo "๐ WASM binary size: $WASM_SIZE bytes ($(($WASM_SIZE / 1024)) KB)"
if [ $WASM_SIZE -gt 1048576 ]; then # > 1MB
echo "โ ๏ธ Large WASM binary detected. Consider optimization:"
echo " - Use --release flag for smaller builds"
echo " - Enable wee_alloc for smaller allocator"
echo " - Remove unused dependencies"
echo " - Use wasm-opt for post-build optimization"
fi
else
echo "โ WASM build failed - binary not found"
fi
# Analyze WASM binary
if command -v wasm-objdump &> /dev/null && [ -f "pkg/semantic_cartan_matrix_bg.wasm" ]; then
echo "๐ WASM binary analysis:"
wasm-objdump -h pkg/semantic_cartan_matrix_bg.wasm | head -20
fi
```
```javascript
// wasm-debug.js - JavaScript WASM debugging utilities
class WasmDebugger {
constructor(wasmModule) {
this.wasmModule = wasmModule;
this.memory = wasmModule.memory;
this.performance_log = [];
this.error_log = [];
}
// Monitor memory usage
monitorMemory() {
const memoryPages = this.memory.buffer.byteLength / 65536; // 64KB per page
const memoryMB = this.memory.buffer.byteLength / 1024 / 1024;
console.log(`📊 WASM Memory: ${memoryPages} pages (${memoryMB.toFixed(2)} MB)`);
// Check for memory growth
if (memoryMB > 100) {
console.warn("โ ๏ธ High WASM memory usage detected");
console.warn("Consider:");
console.warn(" - Reducing batch size");
console.warn(" - Implementing memory pooling");
console.warn(" - Using streaming processing");
}
return { pages: memoryPages, megabytes: memoryMB };
}
// Wrap WASM function calls with error handling and performance monitoring
wrapFunction(functionName, wasmFunction) {
return (...args) => {
const startTime = performance.now();
try {
console.log(`🔧 Calling WASM function: ${functionName}`, args);
const result = wasmFunction.apply(this.wasmModule, args);
const endTime = performance.now();
const duration = endTime - startTime;
this.performance_log.push({
function: functionName,
duration,
timestamp: Date.now(),
success: true
});
console.log(`✅ ${functionName} completed in ${duration.toFixed(2)}ms`);
// Monitor for performance issues
if (duration > 100) {
console.warn(`⚠️ Slow WASM function: ${functionName} took ${duration.toFixed(2)}ms`);
}
return result;
} catch (error) {
const endTime = performance.now();
const duration = endTime - startTime;
this.error_log.push({
function: functionName,
error: error.message,
duration,
timestamp: Date.now(),
args: JSON.stringify(args)
});
console.error(`❌ WASM function ${functionName} failed:`, error);
console.error(`Arguments:`, args);
console.error(`Duration before failure: ${duration.toFixed(2)}ms`);
// Try to provide helpful error context
this.analyzeError(functionName, error, args);
throw error;
}
};
}
analyzeError(functionName, error, args) {
console.group(`🔍 Error Analysis for ${functionName}`);
// Check for common WASM errors
if (error.message.includes('unreachable')) {
console.error("Rust panic detected in WASM code");
console.error("Common causes:");
console.error(" - Array bounds violation");
console.error(" - Integer overflow");
console.error(" - Assertion failure");
console.error(" - Division by zero");
} else if (error.message.includes('out of bounds')) {
console.error("Memory access violation");
console.error("Check array indices and memory allocation");
} else if (error.message.includes('RuntimeError')) {
console.error("WASM runtime error");
console.error("Possible stack overflow or memory corruption");
}
// Analyze function arguments
console.log("Argument analysis:");
args.forEach((arg, index) => {
if (arg === null || arg === undefined) {
console.warn(` Arg ${index}: null/undefined - potential issue`);
} else if (Array.isArray(arg)) {
console.log(` Arg ${index}: Array of length ${arg.length}`);
if (arg.length === 0) {
console.warn(` Empty array may cause issues`);
}
if (arg.some(x => !isFinite(x))) {
console.warn(` Contains non-finite values (NaN/Infinity)`);
}
} else if (typeof arg === 'number') {
if (!isFinite(arg)) {
console.warn(` Arg ${index}: Non-finite number (${arg})`);
}
}
});
console.groupEnd();
}
// Generate performance report
generatePerformanceReport() {
if (this.performance_log.length === 0) {
console.log("No performance data available");
return;
}
console.group("๐ WASM Performance Report");
// Group by function
const functionStats = {};
for (const entry of this.performance_log) {
if (!functionStats[entry.function]) {
functionStats[entry.function] = {
calls: 0,
totalTime: 0,
minTime: Infinity,
maxTime: 0,
errors: 0
};
}
const stats = functionStats[entry.function];
stats.calls++;
stats.totalTime += entry.duration;
stats.minTime = Math.min(stats.minTime, entry.duration);
stats.maxTime = Math.max(stats.maxTime, entry.duration);
}
// Add error counts
for (const error of this.error_log) {
if (functionStats[error.function]) {
functionStats[error.function].errors++;
}
}
// Display stats
console.table(Object.entries(functionStats).map(([func, stats]) => ({
Function: func,
Calls: stats.calls,
'Avg Time (ms)': (stats.totalTime / stats.calls).toFixed(2),
'Min Time (ms)': stats.minTime.toFixed(2),
'Max Time (ms)': stats.maxTime.toFixed(2),
'Total Time (ms)': stats.totalTime.toFixed(2),
Errors: stats.errors,
'Error Rate (%)': ((stats.errors / stats.calls) * 100).toFixed(1)
})));
console.groupEnd();
}
// Test WASM module integrity
testModuleIntegrity() {
console.group("๐งช WASM Module Integrity Test");
const tests = [
{
name: "Memory allocation",
test: () => {
const initialPages = this.memory.buffer.byteLength / 65536;
// Try to allocate some memory
const testArray = new Float32Array(1000);
testArray.fill(1.0);
return testArray.every(x => x === 1.0);
}
},
{
name: "Basic arithmetic",
test: () => {
// Test if WASM can handle basic operations
if (this.wasmModule.test_add) {
return this.wasmModule.test_add(2, 3) === 5;
}
return true; // Skip if function not available
}
},
{
name: "Array processing",
test: () => {
// Test array handling
if (this.wasmModule.process_vector) {
const input = new Float32Array([1, 2, 3, 4]);
const result = this.wasmModule.process_vector(input);
return result && result.length > 0;
}
return true; // Skip if function not available
}
}
];
let passed = 0;
for (const test of tests) {
try {
if (test.test()) {
console.log(`✅ ${test.name}: PASSED`);
passed++;
} else {
console.error(`❌ ${test.name}: FAILED`);
}
} catch (error) {
console.error(`❌ ${test.name}: ERROR - ${error.message}`);
}
}
console.log(`Test Results: ${passed}/${tests.length} passed`);
console.groupEnd();
return passed === tests.length;
}
}
// Usage example
async function debugWasmIntegration() {
try {
console.log("๐ Loading WASM module...");
const wasmModule = await import('./pkg/semantic_cartan_matrix.js');
await wasmModule.default(); // Initialize WASM
const wasmDebugger = new WasmDebugger(wasmModule); // "debugger" is a reserved word in JavaScript, so use a different name
// Test module integrity
const integrityOk = wasmDebugger.testModuleIntegrity();
if (!integrityOk) {
console.error("โ WASM module integrity test failed");
return;
}
// Wrap functions for debugging
const processor = new wasmModule.WasmNeuralProcessor();
const wrappedProcess = wasmDebugger.wrapFunction('process_vector', processor.process_vector.bind(processor));
// Monitor memory
const memoryInfo = wasmDebugger.monitorMemory();
// Test with sample data
const testData = new Float32Array(32);
for (let i = 0; i < 32; i++) {
testData[i] = Math.random();
}
console.log("๐งช Testing WASM processing...");
const result = wrappedProcess(testData);
console.log("Result:", result);
// Generate performance report
setTimeout(() => {
wasmDebugger.generatePerformanceReport();
}, 1000);
} catch (error) {
console.error("โ WASM debugging failed:", error);
// Provide debugging suggestions
console.group("๐ง Debugging Suggestions");
if (error.message.includes('Module not found')) {
console.error("WASM module not found. Check:");
console.error(" - Build completed successfully");
console.error(" - File paths are correct");
console.error(" - Module is accessible from current location");
} else if (error.message.includes('WebAssembly')) {
console.error("WASM runtime error. Check:");
console.error(" - Browser supports WebAssembly");
console.error(" - WASM binary is not corrupted");
console.error(" - Memory limits not exceeded");
}
console.groupEnd();
}
}
// Auto-run debugging when script loads
if (typeof window !== 'undefined') {
window.debugWasmIntegration = debugWasmIntegration;
console.log("๐ง WASM debugger loaded. Run debugWasmIntegration() to test.");
}
```
This comprehensive debugging guide provides systematic approaches to identifying and resolving issues across all layers of the ruv-FANN system, from neural network training problems to distributed coordination failures.