Custom Collectors - antimetal/system-agent GitHub Wiki
Custom Collectors
This guide explains how to build custom performance collectors for the Antimetal System Agent. Custom collectors allow you to extend the agent's monitoring capabilities for your specific needs.
Overview
The collector system provides two interfaces:
- PointCollector: One-shot data collection
- ContinuousCollector: Streaming data collection with lifecycle management
Most collectors implement PointCollector and are wrapped at registration time (see PartialNewContinuousPointCollector in Step 2) so that they can also be run continuously.
Basic Collector Structure
Step 1: Define Your Data Type
// pkg/performance/types.go
package performance
// MetricTypeCustom is the MetricType value that identifies the custom
// collector. The same value is passed to NewBaseCollector and to
// performance.Register in the collector implementation.
const MetricTypeCustom MetricType = "custom"
// CustomStats is the result produced by one collection pass of the custom
// collector. It is serialized to JSON using the field tags below.
type CustomStats struct {
// Timestamp is set by the collector after the data has been parsed.
Timestamp time.Time `json:"timestamp"`
// Values maps a metric key to its parsed floating-point value.
Values map[string]float64 `json:"values"`
// Metadata holds optional key/value pairs read from the config file.
Metadata map[string]string `json:"metadata"`
// SampleCount is the number of values that were parsed successfully.
SampleCount int `json:"sample_count"`
}
Step 2: Implement the Collector
// pkg/performance/collectors/custom_collector.go
package collectors
import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/go-logr/logr"

	"github.com/antimetal/system-agent/pkg/performance"
)
// Compile-time interface check: the build fails if *CustomCollector ever
// stops satisfying performance.PointCollector.
var _ performance.PointCollector = (*CustomCollector)(nil)
// CustomCollector is a one-shot collector that reads metrics from a data
// file and optional metadata from a config file.
type CustomCollector struct {
// BaseCollector supplies the shared collector plumbing (e.g. Logger).
performance.BaseCollector
// dataPath is the required metrics file, built from HostProcPath.
dataPath string
// configPath is the optional metadata file, built from HostSysPath.
configPath string
}
// NewCustomCollector builds a CustomCollector that reads metrics from
// <HostProcPath>/custom_data and optional metadata from
// <HostSysPath>/custom/config.
//
// Both host paths are joined into file locations the collector reads, so
// both must be absolute; a relative path would silently resolve against the
// agent's working directory. An error naming the offending field is returned
// when either check fails.
func NewCustomCollector(logger logr.Logger, config performance.CollectionConfig) (*CustomCollector, error) {
	// Validate every path the collector derives file locations from.
	if !filepath.IsAbs(config.HostProcPath) {
		return nil, fmt.Errorf("HostProcPath must be absolute: %q", config.HostProcPath)
	}
	if !filepath.IsAbs(config.HostSysPath) {
		return nil, fmt.Errorf("HostSysPath must be absolute: %q", config.HostSysPath)
	}

	// Define collector capabilities.
	capabilities := performance.CollectorCapabilities{
		SupportsOneShot:    true,
		SupportsContinuous: false,
		RequiresRoot:       false,
		RequiresEBPF:       false,
		MinKernelVersion:   "3.10.0",
	}

	// Create collector instance.
	return &CustomCollector{
		BaseCollector: performance.NewBaseCollector(
			performance.MetricTypeCustom,
			"custom",
			logger,
			config,
			capabilities,
		),
		dataPath:   filepath.Join(config.HostProcPath, "custom_data"),
		configPath: filepath.Join(config.HostSysPath, "custom/config"),
	}, nil
}
// Collect performs one collection pass. It returns the context's error if the
// context is already done; otherwise it reads the data file, parses it, stamps
// the result with the current time and returns it as a
// *performance.CustomStats. Read and parse failures are wrapped and returned.
func (c *CustomCollector) Collect(ctx context.Context) (any, error) {
	c.Logger().V(1).Info("Collecting custom metrics")

	// Bail out before touching the filesystem if the caller already gave up.
	if ctxErr := ctx.Err(); ctxErr != nil {
		return nil, ctxErr
	}

	raw, readErr := c.readData()
	if readErr != nil {
		return nil, fmt.Errorf("failed to read data: %w", readErr)
	}

	result, parseErr := c.parseData(raw)
	if parseErr != nil {
		return nil, fmt.Errorf("failed to parse data: %w", parseErr)
	}

	result.Timestamp = time.Now()
	c.Logger().V(2).Info("Custom metrics collected", "sample_count", result.SampleCount)
	return result, nil
}
// readData returns the raw contents of the data file. The file is required:
// any read failure is returned to the caller, wrapped with the file path.
func (c *CustomCollector) readData() ([]byte, error) {
	path := c.dataPath
	contents, err := os.ReadFile(path)
	if err == nil {
		return contents, nil
	}
	return nil, fmt.Errorf("failed to read %s: %w", path, err)
}
// parseData converts the raw data file contents into a CustomStats value.
//
// Each non-blank line that does not start with '#' is split on whitespace;
// the first field is the key and the second is parsed as a float64. Lines
// with fewer than two fields are ignored, and lines whose value does not
// parse are logged and skipped. Metadata is attached when readMetadata
// succeeds; otherwise the failure is logged and the empty map is kept.
func (c *CustomCollector) parseData(data []byte) (*performance.CustomStats, error) {
	result := &performance.CustomStats{
		Values:   map[string]float64{},
		Metadata: map[string]string{},
	}

	for _, raw := range strings.Split(string(data), "\n") {
		entry := strings.TrimSpace(raw)
		if len(entry) == 0 || strings.HasPrefix(entry, "#") {
			continue
		}

		fields := strings.Fields(entry)
		if len(fields) < 2 {
			continue
		}

		name := fields[0]
		parsed, convErr := strconv.ParseFloat(fields[1], 64)
		if convErr != nil {
			c.Logger().V(2).Info("Failed to parse value",
				"key", name, "error", convErr)
			continue
		}

		result.Values[name] = parsed
		result.SampleCount++
	}

	// Metadata is optional: a failure is logged, never returned.
	meta, metaErr := c.readMetadata()
	if metaErr != nil {
		c.Logger().V(2).Info("Metadata unavailable", "error", metaErr)
		return result, nil
	}
	result.Metadata = meta
	return result, nil
}
// readMetadata parses the optional config file into a key/value map.
//
// The file is expected to contain one "key=value" pair per line. Surrounding
// whitespace (including a trailing carriage return) is trimmed from keys and
// values; blank lines, lines starting with '#', lines without '=' and lines
// with an empty key are skipped.
//
// A read failure is returned to the caller instead of being swallowed, so the
// caller can tell "no metadata file" apart from "empty metadata file" and log
// it. The returned map is nil when an error is returned.
func (c *CustomCollector) readMetadata() (map[string]string, error) {
	data, err := os.ReadFile(c.configPath)
	if err != nil {
		return nil, fmt.Errorf("failed to read %s: %w", c.configPath, err)
	}

	metadata := make(map[string]string)
	for _, line := range strings.Split(string(data), "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		// Split once so that values may themselves contain '='.
		key, value, found := strings.Cut(line, "=")
		if !found {
			continue
		}
		key = strings.TrimSpace(key)
		if key == "" {
			continue
		}
		metadata[key] = strings.TrimSpace(value)
	}
	return metadata, nil
}
// Register the collector
func init() {
performance.Register(
performance.MetricTypeCustom,
performance.PartialNewContinuousPointCollector(
func(logger logr.Logger, config performance.CollectionConfig) (performance.PointCollector, error) {
return NewCustomCollector(logger, config)
},
),
)
}
Step 3: Write Tests
// pkg/performance/collectors/custom_collector_test.go
package collectors_test
import (
"context"
"os"
"path/filepath"
"testing"
"github.com/go-logr/logr"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/antimetal/system-agent/pkg/performance"
"github.com/antimetal/system-agent/pkg/performance/collectors"
)
// TestCustomCollector_Constructor checks that NewCustomCollector accepts
// absolute host paths and rejects a relative HostProcPath with a descriptive
// error message.
func TestCustomCollector_Constructor(t *testing.T) {
	tests := []struct {
		name      string
		config    performance.CollectionConfig
		wantError bool
		errorMsg  string
	}{
		{
			name: "valid absolute paths",
			config: performance.CollectionConfig{
				HostProcPath: "/proc",
				HostSysPath:  "/sys",
			},
			wantError: false,
		},
		{
			name: "relative proc path",
			config: performance.CollectionConfig{
				HostProcPath: "proc",
				HostSysPath:  "/sys",
			},
			wantError: true,
			errorMsg:  "HostProcPath must be absolute",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			_, err := collectors.NewCustomCollector(logr.Discard(), tt.config)
			if !tt.wantError {
				assert.NoError(t, err)
				return
			}

			// require stops the subtest here, so err.Error() below can never
			// be reached with a nil error.
			require.Error(t, err)
			assert.Contains(t, err.Error(), tt.errorMsg)
		})
	}
}
// TestCustomCollector_Collect runs Collect against temporary proc/sys trees
// and checks the parsed result for valid, empty and missing data files.
func TestCustomCollector_Collect(t *testing.T) {
	tests := []struct {
		name        string
		procContent string
		sysContent  string
		missingFile bool // leave the data file absent instead of creating it
		wantError   bool
		validate    func(t *testing.T, result any)
	}{
		{
			name:        "valid data",
			procContent: "metric1 100.5\nmetric2 200.0\nmetric3 300.75",
			sysContent:  "version=1.0.0\nmode=production",
			validate: func(t *testing.T, result any) {
				stats, ok := result.(*performance.CustomStats)
				require.True(t, ok, "result should be CustomStats")
				assert.Equal(t, 3, stats.SampleCount)
				assert.Equal(t, 100.5, stats.Values["metric1"])
				assert.Equal(t, 200.0, stats.Values["metric2"])
				assert.Equal(t, 300.75, stats.Values["metric3"])
				assert.Equal(t, "1.0.0", stats.Metadata["version"])
				assert.Equal(t, "production", stats.Metadata["mode"])
			},
		},
		{
			name:        "empty data",
			procContent: "",
			validate: func(t *testing.T, result any) {
				stats, ok := result.(*performance.CustomStats)
				require.True(t, ok)
				assert.Equal(t, 0, stats.SampleCount)
				assert.Empty(t, stats.Values)
			},
		},
		{
			name:        "missing file",
			missingFile: true,
			wantError:   true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			collector, procPath, _ := createTestCustomCollector(t, tt.procContent, tt.sysContent)

			// The helper does not create the data file when the content is
			// empty, which would make "empty data" indistinguishable from
			// "missing file". Create the empty file explicitly unless the
			// case asks for it to be absent.
			if !tt.missingFile && tt.procContent == "" {
				dataPath := filepath.Join(procPath, "custom_data")
				require.NoError(t, os.WriteFile(dataPath, nil, 0644))
			}

			result, err := collector.Collect(context.Background())
			if tt.wantError {
				assert.Error(t, err)
				return
			}
			require.NoError(t, err)
			if tt.validate != nil {
				tt.validate(t, result)
			}
		})
	}
}
// createTestCustomCollector builds a CustomCollector rooted in a fresh
// temporary directory and returns it together with the proc and sys paths it
// was configured with.
//
// procContent is written to <proc>/custom_data and sysContent to
// <sys>/custom/config. An empty string means the corresponding file is NOT
// created, so callers that need an empty file must write it themselves.
func createTestCustomCollector(t *testing.T, procContent, sysContent string) (*collectors.CustomCollector, string, string) {
	// Report failures at the caller's line rather than inside this helper.
	t.Helper()

	tmpDir := t.TempDir()
	procPath := filepath.Join(tmpDir, "proc")
	sysPath := filepath.Join(tmpDir, "sys")

	// Create directory structure.
	require.NoError(t, os.MkdirAll(procPath, 0755))
	require.NoError(t, os.MkdirAll(filepath.Join(sysPath, "custom"), 0755))

	// Write test data if provided.
	if procContent != "" {
		dataPath := filepath.Join(procPath, "custom_data")
		require.NoError(t, os.WriteFile(dataPath, []byte(procContent), 0644))
	}
	if sysContent != "" {
		configPath := filepath.Join(sysPath, "custom/config")
		require.NoError(t, os.WriteFile(configPath, []byte(sysContent), 0644))
	}

	config := performance.CollectionConfig{
		HostProcPath: procPath,
		HostSysPath:  sysPath,
	}
	collector, err := collectors.NewCustomCollector(logr.Discard(), config)
	require.NoError(t, err)
	return collector, procPath, sysPath
}
// TestCustomCollector_ContextCancellation verifies that Collect reports
// context.Canceled when it is called with an already-cancelled context.
func TestCustomCollector_ContextCancellation(t *testing.T) {
	collector, _, _ := createTestCustomCollector(t, "test data", "")

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // Cancel immediately

	_, err := collector.Collect(ctx)

	// ErrorIs fails on a nil error and still passes if Collect later wraps
	// the context error with additional detail.
	assert.ErrorIs(t, err, context.Canceled)
}
Advanced Collector Patterns
Continuous Collector
For collectors that need lifecycle management:
// AdvancedCollector is a continuous collector that polls a CustomClient on a
// fixed interval and streams each fetched value to the channel returned by
// Start.
type AdvancedCollector struct {
	performance.BaseContinuousCollector

	// Collector state. ch is closed by the collection goroutine (the only
	// sender); stopped asks that goroutine to exit; done is closed by the
	// goroutine once it has fully exited.
	client   *CustomClient
	interval time.Duration
	ch       chan any
	stopped  chan struct{}
	done     chan struct{}

	// Configuration
	endpoint   string
	bufferSize int
}

// NewAdvancedCollector returns a collector that polls config.CustomEndpoint
// every 30 seconds and buffers up to 100 undelivered values.
func NewAdvancedCollector(logger logr.Logger, config performance.CollectionConfig) (*AdvancedCollector, error) {
	return &AdvancedCollector{
		BaseContinuousCollector: performance.NewBaseContinuousCollector(
			performance.MetricTypeAdvanced,
			"advanced",
			logger,
			config,
			performance.CollectorCapabilities{
				SupportsOneShot:    false,
				SupportsContinuous: true,
				RequiresRoot:       true,
				RequiresEBPF:       false,
			},
		),
		interval:   30 * time.Second,
		bufferSize: 100,
		endpoint:   config.CustomEndpoint,
	}, nil
}

// Start creates the client, launches the collection goroutine and returns the
// channel on which collected values are delivered. The channel is closed when
// the goroutine exits, either because ctx is done or because Stop was called.
// Start returns an error if the collector is not in the disabled state or the
// client cannot be created.
func (c *AdvancedCollector) Start(ctx context.Context) (<-chan any, error) {
	if c.Status() != performance.CollectorStatusDisabled {
		return nil, fmt.Errorf("collector already running")
	}

	// Initialize client
	client, err := NewCustomClient(c.endpoint)
	if err != nil {
		return nil, fmt.Errorf("failed to create client: %w", err)
	}

	// Create channels. The goroutine receives them as arguments so that it
	// never reads struct fields that Stop resets.
	ch := make(chan any, c.bufferSize)
	stopped := make(chan struct{})
	done := make(chan struct{})

	c.client = client
	c.ch = ch
	c.stopped = stopped
	c.done = done

	// Start collection goroutine
	c.SetStatus(performance.CollectorStatusActive)
	go c.runCollection(ctx, client, ch, stopped, done)
	return ch, nil
}

// Stop signals the collection goroutine to exit, waits until it has done so,
// releases the client and marks the collector disabled. It is a no-op when
// the collector is already disabled.
//
// Stop blocks until an in-flight Fetch returns, because the client is only
// closed once the goroutine can no longer use it.
func (c *AdvancedCollector) Stop() error {
	if c.Status() == performance.CollectorStatusDisabled {
		return nil
	}

	// Signal stop
	if c.stopped != nil {
		close(c.stopped)
		c.stopped = nil
	}

	// Wait for the goroutine instead of sleeping: after done is closed the
	// goroutine has closed the output channel and will not touch the client.
	if c.done != nil {
		<-c.done
		c.done = nil
	}

	// Cleanup client
	if c.client != nil {
		c.client.Close()
		c.client = nil
	}

	// The output channel was closed by its sender; just drop the reference.
	c.ch = nil

	c.SetStatus(performance.CollectorStatusDisabled)
	return nil
}

// runCollection is the collection loop. On every tick it fetches one value
// from client and offers it to ch without blocking; when ch is full the value
// is dropped. The loop exits when ctx is done or stopped is closed, then
// closes ch followed by done.
func (c *AdvancedCollector) runCollection(
	ctx context.Context,
	client *CustomClient,
	ch chan<- any,
	stopped <-chan struct{},
	done chan<- struct{},
) {
	// Deferred calls run last-in first-out: ticker stops, ch closes, then
	// done closes so Stop only resumes after ch is closed.
	defer close(done)
	defer close(ch)

	ticker := time.NewTicker(c.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-stopped:
			return
		case <-ticker.C:
			data, err := client.Fetch()
			if err != nil {
				c.Logger().Error(err, "Failed to fetch data")
				c.SetError(err)
				continue
			}

			select {
			case ch <- data:
				c.ClearError()
			case <-ctx.Done():
				return
			case <-stopped:
				return
			default:
				c.Logger().V(1).Info("Channel full, dropping data")
			}
		}
	}
}
eBPF-Based Collector
For kernel-level monitoring:
//go:build linux
package collectors
import (
_ "embed"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/link"
"github.com/cilium/ebpf/perf"
)
// syscallMonitorProgram holds the bytes of syscall_monitor.bpf.o, embedded
// into the binary at build time by the go:embed directive below.
//go:embed syscall_monitor.bpf.o
var syscallMonitorProgram []byte
// SyscallCollector is a continuous collector that streams events produced by
// an embedded eBPF program attached to a syscall tracepoint.
type SyscallCollector struct {
performance.BaseContinuousCollector
// eBPF objects
// collection is the loaded eBPF collection (programs and maps).
collection *ebpf.Collection
// perfReader reads records from the collection's "events" map.
perfReader *perf.Reader
// links are the attached tracepoint links.
links []link.Link
}
// Start loads the embedded eBPF object, attaches its "trace_open" program to
// the syscalls/sys_enter_open tracepoint, opens a perf reader on the "events"
// map and starts a goroutine that forwards parsed events to the returned
// channel.
//
// Every kernel resource acquired before a failure is released again, and the
// collector's fields are only assigned once all steps have succeeded, so a
// failed Start leaves nothing loaded or attached.
func (c *SyscallCollector) Start(ctx context.Context) (<-chan any, error) {
	// Load eBPF program
	spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(syscallMonitorProgram))
	if err != nil {
		return nil, fmt.Errorf("failed to load eBPF spec: %w", err)
	}
	coll, err := ebpf.NewCollection(spec)
	if err != nil {
		return nil, fmt.Errorf("failed to create eBPF collection: %w", err)
	}

	// Map lookups return nil for a missing name; fail with a clear message
	// instead of passing nil further down.
	prog, ok := coll.Programs["trace_open"]
	if !ok || prog == nil {
		coll.Close()
		return nil, fmt.Errorf("eBPF program %q not found in collection", "trace_open")
	}
	events, ok := coll.Maps["events"]
	if !ok || events == nil {
		coll.Close()
		return nil, fmt.Errorf("eBPF map %q not found in collection", "events")
	}

	// Attach to tracepoints. The final argument is the optional
	// *link.TracepointOptions; nil selects the defaults.
	tp, err := link.Tracepoint("syscalls", "sys_enter_open", prog, nil)
	if err != nil {
		coll.Close()
		return nil, fmt.Errorf("failed to attach tracepoint: %w", err)
	}

	// Create perf event reader
	reader, err := perf.NewReader(events, 4096)
	if err != nil {
		_ = tp.Close() // best-effort cleanup; the NewReader error is what matters
		coll.Close()
		return nil, fmt.Errorf("failed to create perf reader: %w", err)
	}

	c.collection = coll
	c.links = append(c.links, tp)
	c.perfReader = reader

	// Start reading events
	ch := make(chan any)
	go c.readEvents(ctx, ch)
	return ch, nil
}
// readEvents forwards perf records to ch until ctx is cancelled or the perf
// reader is closed, and closes ch on exit.
//
// Read blocks until a record arrives, so a watcher goroutine closes the
// reader when ctx is done; that interrupts the blocked Read with
// perf.ErrClosed and lets the loop return. Records that only report lost
// samples carry no event payload and are logged instead of parsed.
func (c *SyscallCollector) readEvents(ctx context.Context, ch chan<- any) {
	defer close(ch)

	reader := c.perfReader

	// finished stops the watcher when the loop exits for another reason, so
	// the watcher never outlives this function.
	finished := make(chan struct{})
	defer close(finished)
	go func() {
		select {
		case <-ctx.Done():
			if err := reader.Close(); err != nil {
				c.Logger().Error(err, "Failed to close perf reader")
			}
		case <-finished:
		}
	}()

	for {
		select {
		case <-ctx.Done():
			return
		default:
		}

		record, err := reader.Read()
		if err != nil {
			if errors.Is(err, perf.ErrClosed) {
				return
			}
			c.Logger().Error(err, "Failed to read perf event")
			continue
		}

		// The kernel dropped samples because the ring buffer was full; there
		// is nothing to parse in such a record.
		if record.LostSamples > 0 {
			c.Logger().V(1).Info("Perf buffer full, samples lost",
				"lost_samples", record.LostSamples)
			continue
		}

		// Parse event data
		event := parseSyscallEvent(record.RawSample)
		select {
		case ch <- event:
		case <-ctx.Done():
			return
		}
	}
}
Aggregating Collector
For collectors that aggregate data:
// AggregatingCollector accumulates samples between collections and reports
// per-key summaries when Collect is called.
type AggregatingCollector struct {
performance.BaseCollector
// Aggregation state
// mu guards accumulator and lastFlush; AddSample and Collect both take it.
mu sync.Mutex
// accumulator holds the running aggregate for each metric key.
accumulator map[string]*Accumulator
// lastFlush is when Collect last produced a result.
lastFlush time.Time
// flushPeriod is the minimum time between two results from Collect.
flushPeriod time.Duration
}
// Accumulator is the running aggregate for a single metric key.
type Accumulator struct {
// Count is the number of samples added.
Count int64
// Sum is the total of all sample values.
Sum float64
// Min is the smallest sample value seen.
Min float64
// Max is the largest sample value seen.
Max float64
// Samples is a bounded set of retained values used for percentiles.
Samples []float64
}
// AddSample records value under key, updating the key's count, sum, minimum
// and maximum, and retaining the value for percentile calculation.
//
// At most maxSamples values are retained per key. Once that many have been
// stored, each new value overwrites the oldest one, so the retained set is
// always the most recent maxSamples values. The accumulator map is created
// on first use, so a collector built without a constructor does not panic.
func (c *AggregatingCollector) AddSample(key string, value float64) {
	const maxSamples = 100

	c.mu.Lock()
	defer c.mu.Unlock()

	if c.accumulator == nil {
		c.accumulator = make(map[string]*Accumulator)
	}

	acc, exists := c.accumulator[key]
	if !exists {
		acc = &Accumulator{
			Min: value,
			Max: value,
		}
		c.accumulator[key] = acc
	}

	acc.Count++
	acc.Sum += value
	if value < acc.Min {
		acc.Min = value
	}
	if value > acc.Max {
		acc.Max = value
	}

	// Keep the last maxSamples values for percentiles. Count was already
	// incremented, so (Count-1) % maxSamples walks the slots in insertion
	// order and always lands on the oldest retained value.
	if len(acc.Samples) < maxSamples {
		acc.Samples = append(acc.Samples, value)
	} else {
		acc.Samples[(acc.Count-1)%maxSamples] = value
	}
}
// Collect returns a summary of every accumulated key and then resets the
// accumulator. It returns performance.ErrNotReady when less than flushPeriod
// has passed since the previous flush. Each summary carries count, sum,
// average, min and max, plus P50/P95/P99 when samples were retained.
func (c *AggregatingCollector) Collect(ctx context.Context) (any, error) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Not enough time has passed since the last flush.
	if elapsed := time.Since(c.lastFlush); elapsed < c.flushPeriod {
		return nil, performance.ErrNotReady
	}

	result := &AggregatedStats{
		Timestamp: time.Now(),
		Period:    c.flushPeriod,
		Metrics:   make(map[string]*MetricSummary),
	}

	for name, acc := range c.accumulator {
		entry := &MetricSummary{
			Count:   acc.Count,
			Sum:     acc.Sum,
			Average: acc.Sum / float64(acc.Count),
			Min:     acc.Min,
			Max:     acc.Max,
		}

		// Percentiles need the retained samples in ascending order.
		if samples := acc.Samples; len(samples) != 0 {
			sort.Float64s(samples)
			entry.P50 = percentile(samples, 0.5)
			entry.P95 = percentile(samples, 0.95)
			entry.P99 = percentile(samples, 0.99)
		}

		result.Metrics[name] = entry
	}

	// Start a fresh aggregation window.
	c.accumulator = make(map[string]*Accumulator)
	c.lastFlush = time.Now()
	return result, nil
}
Best Practices
1. Error Handling
// Collect shows how to treat critical and optional inputs differently:
// failing to read critical data aborts the collection with a wrapped error,
// while failing to read optional data is only logged and the stats built
// from the critical data are still returned.
func (c *CustomCollector) Collect(ctx context.Context) (any, error) {
	// Critical data - fail if unavailable.
	required, err := c.readCriticalData()
	if err != nil {
		return nil, fmt.Errorf("critical data unavailable: %w", err)
	}
	result := processData(required)

	// Optional data - log but continue.
	extra, optErr := c.readOptionalData()
	if optErr != nil {
		c.Logger().V(2).Info("Optional data unavailable", "error", optErr)
		return result, nil
	}

	enrichStats(result, extra)
	return result, nil
}
2. Resource Management
// ResourceCollector demonstrates resource cleanup inside a collector.
type ResourceCollector struct {
	performance.BaseCollector

	// path is the file that Collect opens on every call.
	path string

	// Resources that need cleanup. Collect does not touch these fields.
	file   *os.File
	client *http.Client
	cancel context.CancelFunc
}

// Collect opens the file at c.path, processes it and closes it again before
// returning.
//
// The handle is kept in a local variable and released by a single deferred
// Close placed right after the successful Open. That closes it exactly once
// on every return path and never leaves a closed handle in the struct.
func (c *ResourceCollector) Collect(ctx context.Context) (any, error) {
	// Open resources
	file, err := os.Open(c.path)
	if err != nil {
		return nil, fmt.Errorf("failed to open %s: %w", c.path, err)
	}
	// Use defer for cleanup
	defer file.Close()

	// Process data...
	return nil, nil // placeholder: return the processed result here
}
3. Performance Optimization
// Collect sketches two optimizations: pre-sizing the result map and reading
// the data file through a large buffered reader.
// NOTE(review): this is an abbreviated fragment. The processing step and the
// final return are elided, so stats and reader are not yet used here.
func (c *CustomCollector) Collect(ctx context.Context) (any, error) {
// Pre-size the map for about 100 entries so it does not grow while filling.
stats := &CustomStats{
Values: make(map[string]float64, 100),
}
// Open the data file; the deferred Close releases it on every return path.
file, err := os.Open(c.dataPath)
if err != nil {
return nil, err
}
defer file.Close()
// Read through a 64 KiB buffer.
reader := bufio.NewReaderSize(file, 64*1024)
// Process efficiently... (elided: read from reader and fill stats)
}
4. Testing Patterns
// Table-driven tests
func TestCollector_EdgeCases(t *testing.T) {
tests := []struct {
name string
setup func() *CustomCollector
wantError bool
validate func(t *testing.T, result any)
}{
{
name: "handles malformed data",
setup: func() *CustomCollector {
return createCollectorWithData("invalid\ndata\n")
},
wantError: false,
validate: func(t *testing.T, result any) {
stats := result.(*CustomStats)
assert.Empty(t, stats.Values)
},
},
{
name: "handles huge values",
setup: func() *CustomCollector {
return createCollectorWithData("metric 9223372036854775807")
},
validate: func(t *testing.T, result any) {
stats := result.(*CustomStats)
assert.Equal(t, float64(math.MaxInt64), stats.Values["metric"])
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
collector := tt.setup()
result, err := collector.Collect(context.Background())
if tt.wantError {
assert.Error(t, err)
} else {
assert.NoError(t, err)
if tt.validate != nil {
tt.validate(t, result)
}
}
})
}
}
Integration with Agent
Configuration
Add your collector to the configuration:
performance:
  collectors:
    - cpu
    - memory
    - custom # Your collector
  settings:
    custom:
      # Collector-specific settings
      endpoint: "http://custom-service:8080"
      timeout: "30s"
Metrics Export
Your collector's data will be available through the agent's metrics:
# HELP antimetal_custom_values Custom collector metrics
# TYPE antimetal_custom_values gauge
antimetal_custom_values{metric="metric1"} 100.5
antimetal_custom_values{metric="metric2"} 200.0
Debugging
Enable debug logging for your collector:
--log-verbosity=custom:3
Examples
System Service Monitor
Monitor systemd services:
// ServiceCollector reports the systemd service units listed by systemctl.
type ServiceCollector struct {
performance.BaseCollector
}
// Collect runs "systemctl list-units --type=service --no-pager --plain" and
// returns a *ServiceStats keyed by service name. Lines that parseServiceLine
// does not recognize are skipped.
//
// The command is bound to ctx. An error is returned when the command fails
// or when scanning its output fails, so a truncated listing is never
// reported as complete.
func (c *ServiceCollector) Collect(ctx context.Context) (any, error) {
	cmd := exec.CommandContext(ctx, "systemctl", "list-units", "--type=service", "--no-pager", "--plain")
	output, err := cmd.Output()
	if err != nil {
		return nil, fmt.Errorf("failed to list services: %w", err)
	}

	stats := &ServiceStats{
		Services: make(map[string]ServiceStatus),
	}

	scanner := bufio.NewScanner(bytes.NewReader(output))
	for scanner.Scan() {
		line := scanner.Text()
		if service := parseServiceLine(line); service != nil {
			stats.Services[service.Name] = *service
		}
	}
	// Scan returns false on error as well as at end of input (for example
	// when a line exceeds the scanner's buffer), so check which it was.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("failed to read service list: %w", err)
	}

	return stats, nil
}
Database Connection Pool Monitor
// DBPoolCollector reports connection-pool statistics for a database handle.
type DBPoolCollector struct {
performance.BaseCollector
// db is the handle whose Stats() are reported by Collect.
db *sql.DB
}
// Collect takes one snapshot of the database pool statistics and returns it
// as a *DBPoolStats. It never returns an error.
func (c *DBPoolCollector) Collect(ctx context.Context) (any, error) {
	snapshot := c.db.Stats()

	result := new(DBPoolStats)
	result.OpenConnections = snapshot.OpenConnections
	result.InUse = snapshot.InUse
	result.Idle = snapshot.Idle
	result.WaitCount = snapshot.WaitCount
	result.WaitDuration = snapshot.WaitDuration
	result.MaxIdleClosed = snapshot.MaxIdleClosed
	result.MaxLifetimeClosed = snapshot.MaxLifetimeClosed
	return result, nil
}
Next Steps
- Performance Monitoring - Collector architecture
- Testing Guide - Testing best practices
- Contributing - Submit your collector
For more examples, see pkg/performance/collectors/