Hardware Development Guide - antimetal/system-agent GitHub Wiki
✅ COMPLETE: This guide covers the fully implemented hardware graph feature with comprehensive collector integration and all relationship types.
- Fork and clone the antimetal/apis repository for protobuf definitions
- Update buf.gen.yaml to point to your local fork:
inputs:
- directory: ../jra3-apis/api
# Generate protobuf code
make proto
# Run tests
go test ./internal/hardware/... -v
# Format code
make fmt
# Add license headers
make gen-license-headers
Step 1: Define the protobuf message in jra3-apis/api/hardware/v1/hardware.proto:
// GPUDeviceNode is the hardware-graph node spec describing a single GPU device.
message GPUDeviceNode {
// Device identifier; the builder example in this guide also uses it as the
// resource name and provider ID.
string device = 1;
// GPU vendor name (for example "NVIDIA").
string vendor = 2;
// GPU model name (for example "RTX 3090").
string model = 3;
// GPU memory size in bytes.
uint64 memory_bytes = 4;
// NOTE(review): how the compute capability is encoded in a uint32 is not
// specified here — confirm before relying on it.
uint32 compute_capability = 5;
}
Step 2: Regenerate protobuf code:
make proto
Step 3: Add builder method in internal/hardware/graph/builder.go:
// createGPUDeviceNode converts collected GPU information into a graph
// resource plus a reference to that resource.
//
// The GPU fields are copied into a hardwarev1.GPUDeviceNode spec, which is
// packed into an Any and attached to the resource. The device name is used as
// the provider ID, the resource name, and the reference name. An error is
// returned only when the spec cannot be packed.
func (b *Builder) createGPUDeviceNode(gpu *performance.GPUInfo) (*resourcev1.Resource, *resourcev1.ResourceRef, error) {
	node := &hardwarev1.GPUDeviceNode{
		Device:            gpu.Device,
		Vendor:            gpu.Vendor,
		Model:             gpu.Model,
		MemoryBytes:       gpu.MemoryBytes,
		ComputeCapability: gpu.ComputeCapability,
	}

	packed, err := anypb.New(node)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to marshal GPU spec: %w", err)
	}

	// The fully qualified protobuf message name identifies the node type in
	// both the resource's type descriptor and the reference's type URL.
	typeName := string(node.ProtoReflect().Descriptor().FullName())

	descriptor := &resourcev1.TypeDescriptor{
		Kind: "GPUDeviceNode",
		Type: typeName,
	}
	meta := &resourcev1.ResourceMeta{
		Provider:   resourcev1.Provider_PROVIDER_ANTIMETAL,
		ProviderId: gpu.Device,
		Name:       gpu.Device,
	}

	return &resourcev1.Resource{Type: descriptor, Metadata: meta, Spec: packed},
		&resourcev1.ResourceRef{TypeUrl: typeName, Name: gpu.Device},
		nil
}
Step 4: Integrate into BuildFromSnapshot:
// In BuildFromSnapshot method: the GPU topology is built only when the
// snapshot carries GPU information; a build failure aborts the method.
if gpus := snapshot.Metrics.GPUInfo; len(gpus) > 0 {
	if err := b.buildGPUTopology(ctx, gpus, systemRef); err != nil {
		return fmt.Errorf("failed to build GPU topology: %w", err)
	}
}
Step 1: Define the predicate in jra3-apis/api/hardware/v1/relationships.proto:
// PCIeBusPredicate is the predicate attached to a relationship whose subject
// is a device and whose object is a PCIe bus.
message PCIeBusPredicate {
// Address of the device on the bus.
// NOTE(review): the address format is not shown in this guide — confirm.
string bus_address = 1;
// NOTE(review): presumably the number of PCIe lanes in the link — confirm.
uint32 lane_width = 2;
uint32 link_speed_gts = 3; // Gigatransfers per second
}
Step 2: Create relationship builder:
// createPCIeRelationship stores a relationship whose subject is device and
// whose object is bus, annotated with a PCIeBusPredicate.
//
// Only the predicate's bus address is populated (from address); its other
// fields keep their zero values. It returns an error if the predicate cannot
// be packed, or whatever error the store reports when adding the relationship.
func (b *Builder) createPCIeRelationship(device, bus *resourcev1.ResourceRef, address string) error {
	packed, err := anypb.New(&hardwarev1.PCIeBusPredicate{BusAddress: address})
	if err != nil {
		return fmt.Errorf("failed to marshal PCIe predicate: %w", err)
	}

	edge := &resourcev1.Relationship{
		Subject:   device,
		Predicate: packed,
		Object:    bus,
	}
	return b.store.AddRelationships(edge)
}
Step 1: Create collector in pkg/performance/collectors/:
// GPUInfoCollector collects GPU information. It embeds
// performance.BasePointCollector and adds no fields of its own.
type GPUInfoCollector struct {
performance.BasePointCollector
}
// NewGPUInfoCollector builds a GPUInfoCollector registered under the
// MetricTypeGPUInfo metric type with the name "gpu_info".
//
// The collector advertises one-shot collection only; continuous collection
// is not supported. The returned error is always nil in this implementation.
func NewGPUInfoCollector(logger logr.Logger, config performance.CollectionConfig) (*GPUInfoCollector, error) {
	capabilities := performance.CollectorCapabilities{
		SupportsOneShot:    true,
		SupportsContinuous: false,
	}

	base := performance.NewBasePointCollector(
		performance.MetricTypeGPUInfo,
		"gpu_info",
		logger,
		config,
		capabilities,
	)

	return &GPUInfoCollector{BasePointCollector: base}, nil
}
// Collect gathers GPU information and returns it as a []performance.GPUInfo.
//
// This is a skeleton: the parsing step is left as a placeholder, so the
// result is currently an empty (non-nil) slice and the error is always nil.
func (c *GPUInfoCollector) Collect(ctx context.Context) (interface{}, error) {
	// Read from /sys/class/drm or nvidia-smi
	infos := make([]performance.GPUInfo, 0)

	// Parse GPU information
	// ...

	return infos, nil
}
Step 2: Register collector in pkg/performance/registry.go:
func init() {
RegisterCollector(MetricTypeGPUInfo, func(logger logr.Logger, config CollectionConfig) (Collector, error) {
return NewGPUInfoCollector(logger, config)
})
}
// TestBuilder_BuildGPUTopology checks that building the topology for a single
// GPU stores exactly one resource and one relationship in the mock store.
//
// NOTE(review): systemRef is not defined in this snippet; it has to be
// provided by the surrounding test file — confirm before copying.
func TestBuilder_BuildGPUTopology(t *testing.T) {
	store := newMockStore()
	b := graph.NewBuilder(logr.Discard(), store)

	gpus := []performance.GPUInfo{{
		Device:      "gpu0",
		Vendor:      "NVIDIA",
		Model:       "RTX 3090",
		MemoryBytes: 24 * 1024 * 1024 * 1024, // 24GB
	}}

	require.NoError(t, b.buildGPUTopology(context.Background(), gpus, systemRef))

	// One node for the GPU...
	assert.Len(t, store.resources, 1)
	// ...and one relationship involving it.
	assert.Len(t, store.relationships, 1)
}
//go:build integration

// TestHardwareManager_GPUDiscovery starts a hardware manager on a machine
// with a real GPU and gives it a short window to run a collection. The test
// is skipped when no GPU is detected.
func TestHardwareManager_GPUDiscovery(t *testing.T) {
	// Requires actual GPU hardware
	if gpuPresent := hasGPU(); !gpuPresent {
		t.Skip("No GPU detected")
	}

	mgr := setupTestManager(t)
	require.NoError(t, mgr.Start())
	defer mgr.Stop()

	// Wait for collection
	const collectionWait = 100 * time.Millisecond
	time.Sleep(collectionWait)

	// Verify GPU nodes in store
	// ...
}
- Type safety: Strongly typed hardware specifications
- Versioning: Forward/backward compatibility
- Performance: Efficient serialization
- Language agnostic: Can generate clients for any language
- Flexibility: Add new relationship types without schema changes
- Graph queries: Natural representation for hardware topology
- Standards-based: Uses W3C RDF concepts
- Info: Hardware configuration (immutable, collected rarely)
- Stats: Runtime metrics (changes frequently)
- Optimization: Different collection intervals for different data
// NewXCollector is the constructor template for a collector: it first checks
// that the required sysfs path exists and only then builds the collector.
// It returns an error when /sys/class/X is not present.
func NewXCollector(logger logr.Logger, config CollectionConfig) (*XCollector, error) {
	// Validate environment
	const sysfsPath = "/sys/class/X"
	if available := fileExists(sysfsPath); !available {
		return nil, fmt.Errorf("X subsystem not available")
	}

	// Initialize base
	collector := &XCollector{
		BasePointCollector: NewBasePointCollector(...),
	}
	return collector, nil
}
// Graceful degradation for optional features: a failed read is logged at
// verbosity level 1 and collection carries on.
if err := readOptionalField(); err != nil {
c.logger.V(1).Info("Optional field not available", "error", err)
// Continue without the field
}
// Fatal errors for required data: the failure is wrapped with %w, so the
// underlying error stays inspectable, and returned to the caller.
if err := readRequiredField(); err != nil {
return nil, fmt.Errorf("failed to read required field: %w", err)
}
// TestCollector_WithMockFS demonstrates testing a collector against an
// in-memory filesystem seeded with a fake /proc/cpuinfo, so the test does not
// depend on the host's real /proc.
func TestCollector_WithMockFS(t *testing.T) {
	fs := afero.NewMemMapFs()
	// Fail fast if the fixture cannot be written; otherwise the failure would
	// surface later as a confusing collection error.
	require.NoError(t, afero.WriteFile(fs, "/proc/cpuinfo", []byte(testCPUInfo), 0644))

	collector := NewCollectorWithFS(fs)
	data, err := collector.Collect(context.Background())
	require.NoError(t, err)
	// data must be used: Go rejects a declared-but-unused local variable.
	require.NotNil(t, data)
	// Assertions...
}
Protobuf generation fails:
# Ensure buf is installed
go install github.com/bufbuild/buf/cmd/buf@latest
# Check buf.yaml configuration
buf mod update
Tests fail with "file not found":
# Run tests on Linux or use mock filesystems
go test -tags=unit # Exclude integration tests
Hardware not detected:
# Check kernel modules
lsmod | grep <driver>
# Check sysfs availability
ls -la /sys/class/
// CachedCollector embeds BaseCollector and adds a time-based cache of the
// most recently collected value.
type CachedCollector struct {
BaseCollector
cache interface{} // value produced by the last fresh collection
cacheTime time.Time // when cache was last refreshed
cacheTTL time.Duration // how long a cached value is served before refreshing
}
// Collect returns the cached value while it is younger than cacheTTL;
// otherwise it performs a fresh collection, stores the result together with
// the current time, and returns it.
//
// When the fresh collection fails the error is returned as-is and the cache
// is left untouched.
func (c *CachedCollector) Collect(ctx context.Context) (interface{}, error) {
	stillFresh := time.Since(c.cacheTime) < c.cacheTTL
	if stillFresh {
		return c.cache, nil
	}

	value, err := c.collectFresh(ctx)
	if err != nil {
		return nil, err
	}

	c.cache, c.cacheTime = value, time.Now()
	return value, nil
}
// readBatch reads every file in paths and returns the contents keyed by path.
//
// Each file is read with its own os.ReadFile call; this is a convenience
// loop, not a single batched syscall. Reads are best-effort: a path that
// cannot be read is skipped and is simply absent from the result, so callers
// detect a missing file by looking up its key. The returned map is never nil
// and the returned error is currently always nil.
func readBatch(paths []string) (map[string][]byte, error) {
	// The number of paths is an upper bound on the number of entries.
	results := make(map[string][]byte, len(paths))
	for _, path := range paths {
		data, err := os.ReadFile(path)
		if err != nil {
			// Best-effort: skip unreadable files instead of failing the batch.
			continue
		}
		results[path] = data
	}
	return results, nil
}
- Protobuf definitions follow naming conventions
- Builder methods handle errors gracefully
- Unit tests cover happy path and error cases
- Integration tests marked with build tag
- Documentation updated for new node types
- License headers added to new files
- Create feature branch from main
- Make changes following patterns above
- Run tests: make test
- Format code: make fmt
- Add license headers: make gen-license-headers
- Submit PR with clear description
This document was migrated from the repository docs. Last updated: 2025-01-19