Hardware Development Guide - antimetal/system-agent GitHub Wiki

Hardware Development Guide

COMPLETE: This guide covers the fully implemented hardware graph feature with comprehensive collector integration and all relationship types.

Quick Start

Prerequisites

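  1. Fork and clone the antimetal/apis repository for protobuf definitions (the paths in this guide assume the clone is checked out next to this repository as ../jra3-apis; substitute your own directory name)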
  2. Update buf.gen.yaml to point to your local fork:
inputs:
  - directory: ../jra3-apis/api

Build and Test

# Generate protobuf code
make proto

# Run tests
go test ./internal/hardware/... -v

# Format code
make fmt

# Add license headers
make gen-license-headers

Development Workflow

1. Adding New Hardware Node Types

Step 1: Define the protobuf message in jra3-apis/api/hardware/v1/hardware.proto:

// GPUDeviceNode describes a single GPU device in the hardware graph.
message GPUDeviceNode {
  // Device identifier (e.g. "gpu0").
  string device = 1;
  // Vendor name (e.g. "NVIDIA").
  string vendor = 2;
  // Model name (e.g. "RTX 3090").
  string model = 3;
  // Device memory size in bytes.
  uint64 memory_bytes = 4;
  // NOTE(review): how the compute capability is encoded in a single integer
  // (e.g. major/minor packing) is not specified here — confirm.
  uint32 compute_capability = 5;
}

Step 2: Regenerate protobuf code:

make proto

Step 3: Add builder method in internal/hardware/graph/builder.go:

// createGPUDeviceNode converts a collected GPU record into a graph resource
// plus a reference that points at it.
//
// It returns the resource, a ResourceRef naming the same device, and an error
// when the GPU spec cannot be packed into an Any.
func (b *Builder) createGPUDeviceNode(gpu *performance.GPUInfo) (*resourcev1.Resource, *resourcev1.ResourceRef, error) {
    spec := &hardwarev1.GPUDeviceNode{
        Device:            gpu.Device,
        Vendor:            gpu.Vendor,
        Model:             gpu.Model,
        MemoryBytes:       gpu.MemoryBytes,
        ComputeCapability: gpu.ComputeCapability,
    }

    packed, err := anypb.New(spec)
    if err != nil {
        return nil, nil, fmt.Errorf("failed to marshal GPU spec: %w", err)
    }

    // The fully-qualified protobuf message name is used both as the resource
    // type and as the reference's type URL, so compute it once.
    typeName := string(spec.ProtoReflect().Descriptor().FullName())

    // The device identifier serves as provider ID and as display name.
    meta := &resourcev1.ResourceMeta{
        Provider:   resourcev1.Provider_PROVIDER_ANTIMETAL,
        ProviderId: gpu.Device,
        Name:       gpu.Device,
    }

    node := &resourcev1.Resource{
        Type: &resourcev1.TypeDescriptor{
            Kind: "GPUDeviceNode",
            Type: typeName,
        },
        Metadata: meta,
        Spec:     packed,
    }

    nodeRef := &resourcev1.ResourceRef{
        TypeUrl: typeName,
        Name:    gpu.Device,
    }

    return node, nodeRef, nil
}

Step 4: Integrate into BuildFromSnapshot:

// In BuildFromSnapshot method: build the GPU topology only when the snapshot
// actually carries GPU info, and wrap any failure with context for the caller.
if len(snapshot.Metrics.GPUInfo) > 0 {
    if err := b.buildGPUTopology(ctx, snapshot.Metrics.GPUInfo, systemRef); err != nil {
        return fmt.Errorf("failed to build GPU topology: %w", err)
    }
}

2. Adding New Relationship Types

Step 1: Define the predicate in jra3-apis/api/hardware/v1/relationships.proto:

// PCIeBusPredicate is the predicate payload attached to a device -> bus
// relationship.
message PCIeBusPredicate {
  // Address of the device on the bus.
  // NOTE(review): expected format (e.g. domain:bus:device.function) is not specified here — confirm.
  string bus_address = 1;
  // NOTE(review): presumably the PCIe lane count (x1/x4/x16) — confirm.
  uint32 lane_width = 2;
  uint32 link_speed_gts = 3;  // Gigatransfers per second
}

Step 2: Create relationship builder:

// createPCIeRelationship stores a relationship with device as subject and bus
// as object, using a PCIeBusPredicate that carries the bus address.
//
// Only BusAddress is populated; the predicate's other fields keep their zero
// values. It returns an error if the predicate cannot be packed into an Any or
// if the store rejects the relationship.
func (b *Builder) createPCIeRelationship(device, bus *resourcev1.ResourceRef, address string) error {
    packed, err := anypb.New(&hardwarev1.PCIeBusPredicate{BusAddress: address})
    if err != nil {
        return fmt.Errorf("failed to marshal PCIe predicate: %w", err)
    }

    return b.store.AddRelationships(&resourcev1.Relationship{
        Subject:   device,
        Predicate: packed,
        Object:    bus,
    })
}

3. Adding Performance Collectors

Step 1: Create collector in pkg/performance/collectors/:

// GPUInfoCollector collects GPU hardware information. It embeds
// performance.BasePointCollector, which NewGPUInfoCollector configures.
type GPUInfoCollector struct {
    performance.BasePointCollector
}

// NewGPUInfoCollector builds a GPUInfoCollector registered under
// performance.MetricTypeGPUInfo with the name "gpu_info".
//
// The collector advertises one-shot collection only. As written, the returned
// error is always nil.
func NewGPUInfoCollector(logger logr.Logger, config performance.CollectionConfig) (*GPUInfoCollector, error) {
    // GPU info is collected on demand, not continuously.
    caps := performance.CollectorCapabilities{
        SupportsOneShot:    true,
        SupportsContinuous: false,
    }

    base := performance.NewBasePointCollector(
        performance.MetricTypeGPUInfo,
        "gpu_info",
        logger,
        config,
        caps,
    )

    return &GPUInfoCollector{BasePointCollector: base}, nil
}

// Collect gathers GPU information and returns it as a []performance.GPUInfo.
//
// The parsing logic is elided in this example, so as written it always returns
// an empty (non-nil) slice and a nil error. ctx is accepted but not used here.
func (c *GPUInfoCollector) Collect(ctx context.Context) (interface{}, error) {
    // Read from /sys/class/drm or nvidia-smi
    gpus := []performance.GPUInfo{}
    
    // Parse GPU information
    // ...
    
    return gpus, nil
}

Step 2: Register collector in pkg/performance/registry.go:

// init registers a factory for the GPU info collector under MetricTypeGPUInfo.
//
// NOTE(review): the collector above is described as living in
// pkg/performance/collectors and refers to this package as "performance",
// while this snippet calls NewGPUInfoCollector unqualified from
// pkg/performance/registry.go. Confirm which package the registration belongs
// in; importing collectors from performance would create an import cycle.
func init() {
    RegisterCollector(MetricTypeGPUInfo, func(logger logr.Logger, config CollectionConfig) (Collector, error) {
        return NewGPUInfoCollector(logger, config)
    })
}

Testing

Unit Tests

// TestBuilder_BuildGPUTopology feeds one GPU record through buildGPUTopology
// and checks that the mock store ends up with one resource and one
// relationship.
//
// NOTE(review): systemRef is not defined in this snippet — presumably a
// ResourceRef for the system node created in shared test setup; confirm.
// NOTE(review): buildGPUTopology is unexported, so this test has to live in
// the same package as Builder, yet the constructor is called as
// graph.NewBuilder — confirm which package the test belongs to.
func TestBuilder_BuildGPUTopology(t *testing.T) {
    ctx := context.Background()
    mockStore := newMockStore()
    builder := graph.NewBuilder(logr.Discard(), mockStore)
    
    gpuInfo := []performance.GPUInfo{
        {
            Device:      "gpu0",
            Vendor:      "NVIDIA",
            Model:       "RTX 3090",
            MemoryBytes: 24 * 1024 * 1024 * 1024, // 24 GiB
        },
    }
    
    err := builder.buildGPUTopology(ctx, gpuInfo, systemRef)
    require.NoError(t, err)
    
    // Verify nodes were created: one resource for the single GPU
    assert.Len(t, mockStore.resources, 1)
    
    // Verify relationships: one relationship for the single GPU
    assert.Len(t, mockStore.relationships, 1)
}

Integration Tests

//go:build integration

// TestHardwareManager_GPUDiscovery starts a real hardware manager and is
// skipped when hasGPU reports no GPU. The store assertions are elided in this
// example.
func TestHardwareManager_GPUDiscovery(t *testing.T) {
    // Requires actual GPU hardware
    if !hasGPU() {
        t.Skip("No GPU detected")
    }
    
    manager := setupTestManager(t)
    err := manager.Start()
    require.NoError(t, err)
    // Stop the manager when the test returns, including on assertion failure.
    defer manager.Stop()
    
    // Wait for collection
    // NOTE(review): a fixed 100ms sleep assumes the first collection always
    // finishes in time; polling the store with a deadline would be less flaky — confirm.
    time.Sleep(100 * time.Millisecond)
    
    // Verify GPU nodes in store
    // ...
}

Architecture Decisions

Why Protobuf?

  • Type safety: Strongly typed hardware specifications
  • Versioning: Forward/backward compatibility
  • Performance: Efficient serialization
  • Language agnostic: Can generate clients for any language

Why RDF Triplets?

  • Flexibility: Add new relationship types without schema changes
  • Graph queries: Natural representation for hardware topology
  • Standards-based: Uses W3C RDF concepts (each relationship is a subject–predicate–object triple)

Why Separate Info vs Stats?

  • Info: Hardware configuration (immutable, collected rarely)
  • Stats: Runtime metrics (changes frequently)
  • Optimization: Different collection intervals for different data

Common Patterns

Collector Initialization

// NewXCollector is a template constructor for a collector of subsystem X.
//
// It fails fast with an error when /sys/class/X is absent, so an unsupported
// host is reported at construction time instead of at collection time.
// The NewBasePointCollector arguments are elided ("...") in this template.
func NewXCollector(logger logr.Logger, config CollectionConfig) (*XCollector, error) {
    // Validate environment
    if !fileExists("/sys/class/X") {
        return nil, fmt.Errorf("X subsystem not available")
    }
    
    // Initialize base
    return &XCollector{
        BasePointCollector: NewBasePointCollector(...),
    }, nil
}

Error Handling

// Graceful degradation for optional features: the error is logged (at
// verbosity level 1) and deliberately not returned.
if err := readOptionalField(); err != nil {
    c.logger.V(1).Info("Optional field not available", "error", err)
    // Continue without the field
}

// Fatal errors for required data: abort and wrap the cause with %w so callers
// can still inspect it with errors.Is / errors.As.
if err := readRequiredField(); err != nil {
    return nil, fmt.Errorf("failed to read required field: %w", err)
}

Testing with Mock Filesystems

// TestCollector_WithMockFS shows how to test a collector against an in-memory
// filesystem instead of the host's /proc.
func TestCollector_WithMockFS(t *testing.T) {
    fs := afero.NewMemMapFs()
    // Fail fast if the fixture cannot be written; otherwise the collector
    // would fail later with a misleading "file not found".
    require.NoError(t, afero.WriteFile(fs, "/proc/cpuinfo", []byte(testCPUInfo), 0644))

    collector := NewCollectorWithFS(fs)
    data, err := collector.Collect(context.Background())
    require.NoError(t, err)
    // Using data here also keeps the example compilable: Go rejects a
    // declared-but-unused local.
    require.NotNil(t, data)
    // Further assertions on data...
}

Troubleshooting

Common Issues

Protobuf generation fails:

# Ensure buf is installed
go install github.com/bufbuild/buf/cmd/buf@latest

# Check buf.yaml configuration
buf mod update

Tests fail with "file not found":

# Run tests on Linux or use mock filesystems
go test -tags=unit  # Exclude integration tests

Hardware not detected:

# Check kernel modules
lsmod | grep <driver>

# Check sysfs availability
ls -la /sys/class/

Performance Optimization

Caching Strategies

// CachedCollector wraps a collector with a time-based cache: Collect returns
// the stored value while it is younger than cacheTTL.
//
// NOTE(review): the fields are read and written without synchronization, so
// this looks intended for use from a single goroutine — confirm.
type CachedCollector struct {
    BaseCollector
    cache     interface{}   // last successfully collected value
    cacheTime time.Time     // when cache was last refreshed
    cacheTTL  time.Duration // how long a cached value stays valid
}

// Collect returns the cached value while it is younger than cacheTTL;
// otherwise it collects fresh data, stores it with the current time, and
// returns it. A failed refresh returns the error and leaves the cache as is.
func (c *CachedCollector) Collect(ctx context.Context) (interface{}, error) {
    if age := time.Since(c.cacheTime); age >= c.cacheTTL {
        // Cache expired (or never filled): refresh it.
        fresh, err := c.collectFresh(ctx)
        if err != nil {
            return nil, err
        }

        c.cache, c.cacheTime = fresh, time.Now()
        return fresh, nil
    }

    return c.cache, nil
}

Batch Reading

// readBatch reads every file in paths and returns the contents keyed by path.
//
// Each path is read with its own os.ReadFile call, so this is a convenience
// helper and not a syscall-level batching mechanism. Reading is best-effort:
// paths that cannot be read are left out of the result instead of failing the
// whole batch. The returned map is never nil. The returned error is currently
// always nil; it stays in the signature so callers are unaffected.
func readBatch(paths []string) (map[string][]byte, error) {
    // At most one entry per path, so size the map up front.
    results := make(map[string][]byte, len(paths))

    for _, path := range paths {
        data, err := os.ReadFile(path)
        if err != nil {
            // Best-effort: skip unreadable files and keep going.
            continue
        }
        results[path] = data
    }

    return results, nil
}

Contributing

Code Review Checklist

  • Protobuf definitions follow naming conventions
  • Builder methods handle errors gracefully
  • Unit tests cover happy path and error cases
  • Integration tests marked with build tag
  • Documentation updated for new node types
  • License headers added to new files

Submitting Changes

  1. Create feature branch from main
  2. Make changes following patterns above
  3. Run tests: make test
  4. Format code: make fmt
  5. Add license headers: make gen-license-headers
  6. Submit PR with clear description

References

Internal Documentation

External Resources


This document was migrated from the repository docs. Last updated: 2025-01-19

⚠️ **GitHub.com Fallback** ⚠️