Cloud Providers - antimetal/system-agent GitHub Wiki

Cloud Providers

The Cloud Provider abstraction allows the System Agent to discover and report cloud-specific metadata about the Kubernetes cluster. This extensible interface supports multiple cloud platforms and local development environments.

Provider Interface

The core interface is minimal and focused:

// Provider is the abstraction every cloud (or local) environment implements.
// The implementations shown in this document identify themselves as "eks",
// "kind" and "generic".
type Provider interface {
    // Name returns the provider identifier (e.g., "eks", "gke", "kind").
    Name() string
    
    // ClusterName discovers the cluster's name in the cloud provider.
    // It returns a non-nil error when the name cannot be determined.
    ClusterName(ctx context.Context) (string, error)
    
    // Region returns the cloud region where the cluster runs.
    // Providers without a real region return a placeholder such as
    // "local" (KIND) or "unknown" (generic).
    Region(ctx context.Context) (string, error)
}

Architecture

graph TD
    %% Solid arrows: auto-detection feeds the registry, which yields one provider instance.
    %% Dotted arrows: the registry can resolve to any of the available providers.
    subgraph PF["Provider Factory"]
        AD["Auto Detect"] --> PR["Provider Registry"]
        PR --> PI["Provider Instance"]
        
        subgraph AP["Available Providers"]
            EKS["EKS<br/>(AWS)"]
            GKE["GKE<br/>(Google)"]
            AKS["AKS<br/>(Azure)"]
            KIND["KIND<br/>(Local)"]
        end
        
        PR -.-> EKS
        PR -.-> GKE
        PR -.-> AKS
        PR -.-> KIND
    end
Loading

Implemented Providers

EKS (Amazon Elastic Kubernetes Service)

Full implementation with AWS integration:

// EKSProvider is the Provider implementation for Amazon EKS clusters.
type EKSProvider struct {
    // ec2Client is the EC2 API client built in NewEKSProvider.
    // NOTE(review): presumably used by getInstanceTags to read node tags — confirm.
    ec2Client EC2DescribeInstancesAPI
    // logger is the caller-supplied logger.
    logger    logr.Logger
}

// NewEKSProvider builds an EKSProvider whose EC2 client is created from the
// AWS configuration returned by config.LoadDefaultConfig. It returns an error
// when that configuration cannot be loaded.
func NewEKSProvider(logger logr.Logger) (*EKSProvider, error) {
    awsCfg, loadErr := config.LoadDefaultConfig(context.Background())
    if loadErr != nil {
        return nil, fmt.Errorf("failed to load AWS config: %w", loadErr)
    }

    provider := &EKSProvider{
        logger:    logger,
        ec2Client: ec2.NewFromConfig(awsCfg),
    }
    return provider, nil
}

// Name reports the identifier of this provider, "eks".
func (p *EKSProvider) Name() string {
    const id = "eks"
    return id
}

// ClusterName discovers the EKS cluster name, trying in order:
//
//  1. the CLUSTER_NAME environment variable,
//  2. the in-cluster API endpoint (via parseEKSEndpoint),
//  3. the "kubernetes.io/cluster/<name>" tag on the node's EC2 instance.
//
// It returns an error when the instance ID or its tags cannot be read, or
// when none of the methods yields a non-empty name.
func (p *EKSProvider) ClusterName(ctx context.Context) (string, error) {
    // 1. Explicit override wins.
    if clusterName := os.Getenv("CLUSTER_NAME"); clusterName != "" {
        return clusterName, nil
    }

    // 2. Parse from the in-cluster endpoint. A failure here is not fatal:
    // we simply fall through to the tag lookup.
    if restCfg, err := rest.InClusterConfig(); err == nil {
        // Extract from endpoint: https://xxx.region.eks.amazonaws.com
        if clusterName := parseEKSEndpoint(restCfg.Host); clusterName != "" {
            return clusterName, nil
        }
    }

    // 3. Query EC2 tags on the node.
    instanceID, err := p.getInstanceID()
    if err != nil {
        return "", fmt.Errorf("failed to get instance ID: %w", err)
    }

    tags, err := p.getInstanceTags(ctx, instanceID)
    if err != nil {
        return "", fmt.Errorf("failed to get tags for instance %q: %w", instanceID, err)
    }

    // The cluster name is encoded in the tag KEY, so only keys are ranged
    // over (an unused value variable would not compile).
    const clusterTagPrefix = "kubernetes.io/cluster/"
    for key := range tags {
        if !strings.HasPrefix(key, clusterTagPrefix) {
            continue
        }
        name := strings.TrimPrefix(key, clusterTagPrefix)
        // Keep only the first path segment after the prefix.
        if i := strings.Index(name, "/"); i >= 0 {
            name = name[:i]
        }
        // A bare prefix carries no name; keep looking instead of
        // returning "" with a nil error.
        if name != "" {
            return name, nil
        }
    }

    return "", fmt.Errorf("cluster name not found")
}

// Region asks the EC2 instance metadata service (IMDS) for the region.
// A fresh IMDS client is created on every call; any lookup failure is
// wrapped and returned.
func (p *EKSProvider) Region(ctx context.Context) (string, error) {
    metadata := imds.NewFromConfig(aws.Config{})

    resp, lookupErr := metadata.GetRegion(ctx, &imds.GetRegionInput{})
    if lookupErr != nil {
        return "", fmt.Errorf("failed to get region from IMDS: %w", lookupErr)
    }
    return resp.Region, nil
}

KIND (Kubernetes IN Docker)

Local development provider:

// KindProvider is the Provider implementation for local KIND clusters.
type KindProvider struct {
    logger logr.Logger // caller-supplied logger
}

// NewKindProvider returns a KindProvider that holds the given logger.
func NewKindProvider(logger logr.Logger) *KindProvider {
    provider := new(KindProvider)
    provider.logger = logger
    return provider
}

// Name reports the identifier of this provider, "kind".
func (p *KindProvider) Name() string {
    const id = "kind"
    return id
}

// ClusterName derives the KIND cluster name from the current kubeconfig
// context. Contexts named "kind-{cluster-name}" yield {cluster-name}; any
// other context falls back to the default name "kind". An error is returned
// only when the kubeconfig cannot be loaded.
func (p *KindProvider) ClusterName(ctx context.Context) (string, error) {
    const contextPrefix = "kind-"

    deferredCfg := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
        clientcmd.NewDefaultClientConfigLoadingRules(),
        &clientcmd.ConfigOverrides{},
    )

    rawCfg, err := deferredCfg.RawConfig()
    if err != nil {
        return "", err
    }

    if active := rawCfg.CurrentContext; strings.HasPrefix(active, contextPrefix) {
        return strings.TrimPrefix(active, contextPrefix), nil
    }

    // No recognizable context name: use the default KIND cluster name.
    return "kind", nil
}

// Region always reports "local": a KIND cluster has no cloud region.
func (p *KindProvider) Region(ctx context.Context) (string, error) {
    const region = "local"
    return region, nil
}

Generic Provider

Fallback for unknown environments:

// GenericProvider is the fallback Provider used when no specific
// environment is recognized. It reports a fixed cluster name and an
// "unknown" region.
type GenericProvider struct {
    clusterName string      // name handed to NewGenericProvider, or "unknown"
    logger      logr.Logger // caller-supplied logger
}

// NewGenericProvider returns a GenericProvider for clusterName. An empty
// clusterName is replaced by "unknown".
func NewGenericProvider(clusterName string, logger logr.Logger) *GenericProvider {
    provider := &GenericProvider{clusterName: "unknown", logger: logger}
    if clusterName != "" {
        provider.clusterName = clusterName
    }
    return provider
}

// Name reports the identifier of this provider, "generic".
func (p *GenericProvider) Name() string {
    const id = "generic"
    return id
}

// ClusterName returns the name fixed at construction time; it never fails.
func (p *GenericProvider) ClusterName(ctx context.Context) (string, error) {
    name := p.clusterName
    return name, nil
}

// Region always reports "unknown"; it never fails.
func (p *GenericProvider) Region(ctx context.Context) (string, error) {
    const region = "unknown"
    return region, nil
}

Auto-Detection

The provider factory automatically detects the cloud environment:

// DetectProvider selects the Provider for the current environment.
//
// Order of precedence:
//  1. CLOUD_PROVIDER environment variable (delegated to createProvider),
//  2. EKS, 3. KIND, 4. GKE (future), 5. AKS (future),
//  6. the generic fallback with an empty cluster name.
//
// A failure to construct the EKS provider is logged and detection continues
// with the remaining candidates.
func DetectProvider(ctx context.Context, logger logr.Logger) (Provider, error) {
    if override := os.Getenv("CLOUD_PROVIDER"); override != "" {
        return createProvider(override, logger)
    }

    if isEKS() {
        if eks, err := NewEKSProvider(logger); err != nil {
            logger.Error(err, "Failed to create EKS provider")
        } else {
            return eks, nil
        }
    }

    // Cases are evaluated top to bottom and stop at the first match, so the
    // probes run in the same order and as lazily as a chain of ifs.
    switch {
    case isKind():
        return NewKindProvider(logger), nil
    case isGKE():
        return NewGKEProvider(logger), nil
    case isAKS():
        return NewAKSProvider(logger), nil
    }

    logger.Info("No specific cloud provider detected, using generic")
    return NewGenericProvider("", logger), nil
}

// Detection functions
// isEKS reports whether the process appears to run on EKS.
//
// It returns true when the EKS secrets directory exists, or when the EC2
// instance metadata service answers an identity-document request.
// NOTE(review): the second check succeeds on any EC2 instance, not only on
// EKS nodes — confirm this is the intended heuristic.
func isEKS() bool {
    // Check for EKS-specific indicators.
    if _, err := os.Stat("/var/run/secrets/eks.amazonaws.com"); err == nil {
        return true
    }

    // Bound the metadata probe so detection cannot stall startup when the
    // metadata endpoint is unreachable (same 2s budget as the AKS probe).
    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    // Check if running on EC2.
    client := imds.New(imds.Options{})
    _, err := client.GetInstanceIdentityDocument(ctx, nil)
    return err == nil
}

// isKind reports whether KUBERNETES_SERVICE_HOST looks like a KIND control
// plane: either exactly "kind-control-plane" or containing ".kind".
// NOTE(review): inside a pod this variable is normally the API service IP —
// confirm the heuristic matches real KIND clusters.
func isKind() bool {
    host := os.Getenv("KUBERNETES_SERVICE_HOST")
    if host == "kind-control-plane" {
        return true
    }
    return strings.Contains(host, ".kind")
}

// isGKE reports whether the GCE metadata server exposes a "cluster-name"
// instance attribute, which indicates a GKE node.
//
// Any request error, timeout or non-200 response yields false.
func isGKE() bool {
    req, err := http.NewRequest(
        http.MethodGet,
        "http://metadata.google.internal/computeMetadata/v1/instance/attributes/cluster-name",
        nil,
    )
    if err != nil {
        return false
    }
    // The metadata server rejects requests without this header, so without
    // it the probe could never succeed.
    req.Header.Set("Metadata-Flavor", "Google")

    // Bound the probe: outside GCP the hostname may not resolve promptly.
    client := &http.Client{Timeout: 2 * time.Second}
    resp, err := client.Do(req)
    if err != nil {
        return false
    }
    defer resp.Body.Close()

    return resp.StatusCode == http.StatusOK
}

// isAKS reports whether the Azure instance metadata service answers at the
// link-local address 169.254.169.254 with HTTP 200.
//
// Any request error, timeout (2s) or non-200 response yields false.
// NOTE(review): a 200 here indicates an Azure VM, not necessarily an AKS
// node — confirm this is the intended heuristic.
func isAKS() bool {
    req, err := http.NewRequest(
        http.MethodGet,
        "http://169.254.169.254/metadata/instance?api-version=2021-02-01",
        nil,
    )
    if err != nil {
        // Never dereference a nil request.
        return false
    }
    req.Header.Add("Metadata", "true")

    client := &http.Client{Timeout: 2 * time.Second}
    resp, err := client.Do(req)
    if err != nil {
        return false
    }
    defer resp.Body.Close()

    return resp.StatusCode == http.StatusOK
}

Adding a New Provider

Step 1: Implement the Interface

package providers

import (
    "context"
    "github.com/go-logr/logr"
)

// MyCloudProvider is a template Provider implementation for a new cloud.
type MyCloudProvider struct {
    logger logr.Logger
    // Add cloud-specific clients
}

// NewMyCloudProvider returns a MyCloudProvider holding the given logger.
// The template never fails; the error result is there for SDK client
// initialization added later.
func NewMyCloudProvider(logger logr.Logger) (*MyCloudProvider, error) {
    // Initialize cloud SDK clients
    return &MyCloudProvider{
        logger: logger,
    }, nil
}

// Name returns the provider identifier "mycloud".
func (p *MyCloudProvider) Name() string {
    return "mycloud"
}

// ClusterName is a placeholder: as written it returns an empty name and a
// nil error. Replace the body with real discovery logic.
func (p *MyCloudProvider) ClusterName(ctx context.Context) (string, error) {
    // Implement cluster name discovery
    // 1. Check environment variables
    // 2. Query cloud metadata service
    // 3. Parse from node labels/annotations
    return "", nil
}

// Region is a placeholder: as written it returns an empty region and a nil
// error. Replace the body with real discovery logic.
func (p *MyCloudProvider) Region(ctx context.Context) (string, error) {
    // Implement region discovery
    // Usually from metadata service
    return "", nil
}

Step 2: Add Detection Logic

// isMyCloud is a placeholder detection function: as written it always
// returns false, so the provider is never auto-selected until real
// detection logic is added.
func isMyCloud() bool {
    // Add detection logic
    // Examples:
    // - Check for specific files/directories
    // - Try to reach metadata service
    // - Check environment variables
    // - Examine node labels
    return false
}

Step 3: Register in Factory

// DetectProvider (excerpt) shows where a new provider is hooked into
// auto-detection. The elided sections stand for the existing detection and
// fallback code; the fallback must supply the final return statement.
func DetectProvider(ctx context.Context, logger logr.Logger) (Provider, error) {
    // ... existing detection logic ...
    
    // Add your provider
    if isMyCloud() {
        provider, err := NewMyCloudProvider(logger)
        if err == nil {
            return provider, nil
        }
        // Construction failed: log and fall through to the remaining candidates.
        logger.Error(err, "Failed to create MyCloud provider")
    }
    
    // ... fallback logic ...
}

Step 4: Add Tests

func TestMyCloudProvider_ClusterName(t *testing.T) {
    tests := []struct {
        name        string
        setup       func()
        want        string
        wantErr     bool
    }{
        {
            name: "from environment variable",
            setup: func() {
                os.Setenv("CLUSTER_NAME", "test-cluster")
            },
            want: "test-cluster",
        },
        {
            name: "from metadata service",
            setup: func() {
                // Mock metadata service
            },
            want: "prod-cluster",
        },
    }
    
    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            tt.setup()
            
            provider := &MyCloudProvider{logger: logr.Discard()}
            got, err := provider.ClusterName(context.Background())
            
            if (err != nil) != tt.wantErr {
                t.Errorf("ClusterName() error = %v, wantErr %v", err, tt.wantErr)
            }
            if got != tt.want {
                t.Errorf("ClusterName() = %v, want %v", got, tt.want)
            }
        })
    }
}

Provider Metadata

Providers can expose additional metadata:

// ExtendedProvider is an optional extension of Provider for implementations
// that can report additional instance-level metadata.
type ExtendedProvider interface {
    Provider
    
    // Additional cloud-specific metadata

    // InstanceType returns the instance type of the current node.
    InstanceType(ctx context.Context) (string, error)
    // AvailabilityZone returns the availability zone of the current node.
    AvailabilityZone(ctx context.Context) (string, error)
    // AccountID returns the cloud account ID that owns the current node.
    AccountID(ctx context.Context) (string, error)
}

// EKS implementation
// InstanceType returns the InstanceType field of the instance identity
// document, or the error from fetching that document.
func (p *EKSProvider) InstanceType(ctx context.Context) (string, error) {
    identity, fetchErr := p.getInstanceIdentityDocument(ctx)
    if fetchErr != nil {
        return "", fetchErr
    }
    instanceType := identity.InstanceType
    return instanceType, nil
}

// AvailabilityZone returns the AvailabilityZone field of the instance
// identity document, or the error from fetching that document.
func (p *EKSProvider) AvailabilityZone(ctx context.Context) (string, error) {
    identity, fetchErr := p.getInstanceIdentityDocument(ctx)
    if fetchErr != nil {
        return "", fetchErr
    }
    zone := identity.AvailabilityZone
    return zone, nil
}

// AccountID returns the AccountID field of the instance identity document,
// or the error from fetching that document.
func (p *EKSProvider) AccountID(ctx context.Context) (string, error) {
    identity, fetchErr := p.getInstanceIdentityDocument(ctx)
    if fetchErr != nil {
        return "", fetchErr
    }
    accountID := identity.AccountID
    return accountID, nil
}

Configuration

Environment Variables

| Variable | Description | Example |
|----------|-------------|---------|
| CLOUD_PROVIDER | Force specific provider | eks, gke, kind |
| CLUSTER_NAME | Override cluster name | prod-cluster |
| AWS_REGION | AWS region (EKS) | us-west-2 |

Command Line Flags

// Register the string flags cloud-provider, cluster-name and region.
// NOTE(review): the second argument ("") is presumably each flag's default
// value — confirm against the type of flags.
flags.String("cloud-provider", "", "Cloud provider (auto-detected if empty)")
flags.String("cluster-name", "", "Override cluster name")
flags.String("region", "", "Override region")

Error Handling

Providers should handle errors gracefully:

// ClusterName resolves the cluster name under a 10-second deadline.
//
// The tag-based lookup is tried first; if it fails, the failure is logged at
// verbosity 1 and the endpoint-based lookup is tried. When both fail, the
// endpoint lookup's error is wrapped and returned.
func (p *EKSProvider) ClusterName(ctx context.Context) (string, error) {
    ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
    defer cancel()

    // Primary method: instance tags.
    name, tagErr := p.getClusterNameFromTags(ctx)
    if tagErr == nil {
        return name, nil
    }
    p.logger.V(1).Info("Failed to get cluster name from tags", "error", tagErr)

    // Secondary method: cluster endpoint.
    name, endpointErr := p.getClusterNameFromEndpoint()
    if endpointErr != nil {
        return "", fmt.Errorf("failed to determine cluster name: %w", endpointErr)
    }
    return name, nil
}

Testing Providers

Unit Tests

func TestEKSProvider_Region(t *testing.T) {
    // Mock IMDS client
    mockIMDS := &mockIMDSClient{
        region: "us-west-2",
    }
    
    provider := &EKSProvider{
        imdsClient: mockIMDS,
        logger:     logr.Discard(),
    }
    
    region, err := provider.Region(context.Background())
    assert.NoError(t, err)
    assert.Equal(t, "us-west-2", region)
}

Integration Tests

// TestProviderDetection is an integration test: it runs auto-detection in
// the real environment and checks that the detected provider yields a
// non-empty name, cluster name and region. It is skipped under -short.
func TestProviderDetection(t *testing.T) {
    if testing.Short() {
        t.Skip("Skipping integration test")
    }

    ctx := context.Background()

    // Detection must succeed before anything else can be checked.
    detected, err := DetectProvider(ctx, logr.Discard())
    require.NoError(t, err)

    assert.NotEmpty(t, detected.Name())

    cluster, err := detected.ClusterName(ctx)
    assert.NoError(t, err)
    assert.NotEmpty(t, cluster)

    zoneRegion, err := detected.Region(ctx)
    assert.NoError(t, err)
    assert.NotEmpty(t, zoneRegion)
}

Monitoring

Metrics

// Prometheus collectors for the cloud provider subsystem.
// NOTE(review): registration with a registry is not shown in this snippet.
var (
    // providerDetections counts detection attempts, labelled by provider
    // and by whether the attempt succeeded.
    providerDetections = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "cloud_provider_detections_total",
            Help: "Cloud provider detection attempts",
        },
        []string{"provider", "success"},
    )
    
    // metadataCallDuration records how long metadata service calls take,
    // labelled by provider and method.
    metadataCallDuration = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name: "cloud_provider_metadata_duration_seconds",
            Help: "Duration of metadata service calls",
        },
        []string{"provider", "method"},
    )
)

Logging

// ClusterName wraps discoverClusterName with logging: a verbosity-1 message
// before the attempt, then either an error log or a success log, each
// carrying the elapsed duration. The discovery error is returned unchanged.
func (p *EKSProvider) ClusterName(ctx context.Context) (string, error) {
    p.logger.V(1).Info("Discovering EKS cluster name")

    begin := time.Now()
    name, discoverErr := p.discoverClusterName(ctx)
    elapsed := time.Since(begin)

    if discoverErr != nil {
        p.logger.Error(discoverErr, "Failed to discover cluster name", "duration", elapsed)
        return "", discoverErr
    }

    p.logger.Info("Successfully discovered cluster name", "cluster", name, "duration", elapsed)
    return name, nil
}

Best Practices

  1. Fail Fast: Set reasonable timeouts for metadata calls
  2. Cache Results: Provider metadata rarely changes
  3. Graceful Fallback: Try multiple discovery methods
  4. Clear Errors: Provide context in error messages
  5. Test Coverage: Mock external dependencies

Future Providers

GKE (Google Kubernetes Engine)

// GKEProvider is the planned Provider implementation for Google Kubernetes
// Engine.
type GKEProvider struct {
    // metadataClient is used to read the "cluster-name" attribute and the zone.
    metadataClient *metadata.Client
    // logger is the caller-supplied logger.
    logger         logr.Logger
}

// ClusterName returns the "cluster-name" instance attribute read through the
// metadata client, passing through any error from that lookup.
func (p *GKEProvider) ClusterName(ctx context.Context) (string, error) {
    name, err := p.metadataClient.InstanceAttributeValue("cluster-name")
    return name, err
}

// Region derives the region from the zone reported by the metadata client by
// keeping the first two dash-separated components
// (us-central1-a -> us-central1). A zone with no dash is rejected as
// malformed; errors from the zone lookup are returned unchanged.
func (p *GKEProvider) Region(ctx context.Context) (string, error) {
    zone, err := p.metadataClient.Zone()
    if err != nil {
        return "", err
    }

    // Only the first two components matter, so stop splitting after them.
    components := strings.SplitN(zone, "-", 3)
    if len(components) < 2 {
        return "", fmt.Errorf("invalid zone format: %s", zone)
    }
    return components[0] + "-" + components[1], nil
}

AKS (Azure Kubernetes Service)

// AKSProvider is the planned Provider implementation for Azure Kubernetes
// Service.
type AKSProvider struct {
    // NOTE(review): despite its name, this field holds an Azure credential,
    // not a metadata client — confirm the intended type.
    metadataClient *azidentity.DefaultAzureCredential
    // logger is the caller-supplied logger.
    logger         logr.Logger
}

// ClusterName (sketch) builds a request for the instance's compute tags on
// the Azure instance metadata service.
// NOTE(review): this snippet is elided — it never sends the request and has
// no return statement, so it does not compile as shown.
func (p *AKSProvider) ClusterName(ctx context.Context) (string, error) {
    // Get from Azure IMDS
    req, _ := http.NewRequest("GET", 
        "http://169.254.169.254/metadata/instance/compute/tags?api-version=2021-02-01", 
        nil)
    // Set the Metadata header on the request.
    req.Header.Add("Metadata", "true")
    
    // Parse response for cluster tag
    // ...
}

Provider Comparison

For a detailed feature comparison between cloud providers, see Cloud Providers Comparison.

Next Steps


Source code: internal/kubernetes/cluster/

⚠️ **GitHub.com Fallback** ⚠️