Cloud Providers - antimetal/system-agent GitHub Wiki
The Cloud Provider abstraction allows the System Agent to discover and report cloud-specific metadata about the Kubernetes cluster. This extensible interface supports multiple cloud platforms and local development environments.
The core interface is minimal and focused:
// Provider is the interface every cloud or local environment implements so
// the agent can report which platform the cluster runs on, what the cluster
// is called, and which region it lives in.
type Provider interface {
// Name returns the provider identifier (e.g., "eks", "gke", "kind").
Name() string
// ClusterName discovers the cluster's name in the cloud provider.
// It returns an error when the name cannot be determined.
ClusterName(ctx context.Context) (string, error)
// Region returns the cloud region where the cluster runs.
// It returns an error when the region cannot be determined.
Region(ctx context.Context) (string, error)
}
graph TD
subgraph PF["Provider Factory"]
AD["Auto Detect"] --> PR["Provider Registry"]
PR --> PI["Provider Instance"]
subgraph AP["Available Providers"]
EKS["EKS<br/>(AWS)"]
GKE["GKE<br/>(Google)"]
AKS["AKS<br/>(Azure)"]
KIND["KIND<br/>(Local)"]
end
PR -.-> EKS
PR -.-> GKE
PR -.-> AKS
PR -.-> KIND
end
Full implementation with AWS integration:
// EKSProvider is the Provider implementation for AWS EKS clusters.
type EKSProvider struct {
// ec2Client is the EC2 API client; NewEKSProvider builds it from the
// default AWS configuration.
ec2Client EC2DescribeInstancesAPI
// logger is the logger handed to NewEKSProvider.
logger logr.Logger
}
// NewEKSProvider builds an EKSProvider whose EC2 client is created from the
// default AWS configuration. It returns an error if that configuration
// cannot be loaded.
func NewEKSProvider(logger logr.Logger) (*EKSProvider, error) {
	awsCfg, loadErr := config.LoadDefaultConfig(context.Background())
	if loadErr != nil {
		return nil, fmt.Errorf("failed to load AWS config: %w", loadErr)
	}

	provider := &EKSProvider{logger: logger}
	provider.ec2Client = ec2.NewFromConfig(awsCfg)
	return provider, nil
}
// Name reports the identifier of this provider, "eks".
func (p *EKSProvider) Name() string {
	const providerName = "eks"
	return providerName
}
// ClusterName discovers the EKS cluster name, trying in order:
//  1. the CLUSTER_NAME environment variable,
//  2. the in-cluster API server endpoint, parsed by parseEKSEndpoint,
//  3. the "kubernetes.io/cluster/<name>" tag on the node's EC2 instance.
//
// It returns an error if the instance ID or its tags cannot be read, or if
// none of the sources yields a non-empty name.
func (p *EKSProvider) ClusterName(ctx context.Context) (string, error) {
	// 1. An explicit override always wins.
	if clusterName := os.Getenv("CLUSTER_NAME"); clusterName != "" {
		return clusterName, nil
	}

	// 2. Parse from the cluster endpoint: https://xxx.region.eks.amazonaws.com.
	// A failure to load the in-cluster config is not fatal; fall through to
	// the tag lookup. The local is named restCfg so it does not shadow the
	// AWS "config" package.
	if restCfg, err := rest.InClusterConfig(); err == nil {
		if clusterName := parseEKSEndpoint(restCfg.Host); clusterName != "" {
			return clusterName, nil
		}
	}

	// 3. Query EC2 tags on the node.
	instanceID, err := p.getInstanceID()
	if err != nil {
		return "", fmt.Errorf("failed to get instance ID: %w", err)
	}
	tags, err := p.getInstanceTags(ctx, instanceID)
	if err != nil {
		return "", fmt.Errorf("failed to get tags for instance %v: %w", instanceID, err)
	}

	// The standard EKS cluster tag carries the cluster name in the key; the
	// tag value is irrelevant here, so only keys are ranged over.
	const clusterTagPrefix = "kubernetes.io/cluster/"
	for key := range tags {
		if !strings.HasPrefix(key, clusterTagPrefix) {
			continue
		}
		// Skip a bare prefix so an empty name is never reported as success.
		if clusterName := strings.TrimPrefix(key, clusterTagPrefix); clusterName != "" {
			return clusterName, nil
		}
	}
	return "", fmt.Errorf("cluster name not found")
}
// Region asks the EC2 instance metadata service (IMDS) for the region of the
// instance this process runs on. The IMDS error, if any, is wrapped and
// returned.
func (p *EKSProvider) Region(ctx context.Context) (string, error) {
	imdsClient := imds.NewFromConfig(aws.Config{})
	regionOut, err := imdsClient.GetRegion(ctx, &imds.GetRegionInput{})
	if err == nil {
		return regionOut.Region, nil
	}
	return "", fmt.Errorf("failed to get region from IMDS: %w", err)
}
Local development provider:
// KindProvider is the Provider implementation for local KIND clusters.
type KindProvider struct {
// logger is the logger handed to NewKindProvider.
logger logr.Logger
}
// NewKindProvider returns a KindProvider that logs through logger.
func NewKindProvider(logger logr.Logger) *KindProvider {
	provider := new(KindProvider)
	provider.logger = logger
	return provider
}
// Name reports the identifier of this provider, "kind".
func (p *KindProvider) Name() string {
	const providerName = "kind"
	return providerName
}
// ClusterName derives the cluster name from the current kubeconfig context.
// KIND names its contexts "kind-{cluster-name}", so the prefix is stripped
// when present; any other context yields the default name "kind". An error
// is returned only when the kubeconfig cannot be loaded.
func (p *KindProvider) ClusterName(ctx context.Context) (string, error) {
	const (
		contextPrefix = "kind-"
		defaultName   = "kind"
	)

	deferredCfg := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
		clientcmd.NewDefaultClientConfigLoadingRules(),
		&clientcmd.ConfigOverrides{},
	)
	rawCfg, err := deferredCfg.RawConfig()
	if err != nil {
		return "", err
	}

	if !strings.HasPrefix(rawCfg.CurrentContext, contextPrefix) {
		return defaultName, nil
	}
	return strings.TrimPrefix(rawCfg.CurrentContext, contextPrefix), nil
}
// Region always reports "local": a KIND cluster runs on the developer's
// machine and has no cloud region.
func (p *KindProvider) Region(ctx context.Context) (string, error) {
	const localRegion = "local"
	return localRegion, nil
}
Fallback for unknown environments:
// GenericProvider is the fallback Provider used when no specific
// environment is detected.
type GenericProvider struct {
// clusterName is the name reported by ClusterName; NewGenericProvider
// substitutes "unknown" when given an empty string.
clusterName string
// logger is the logger handed to NewGenericProvider.
logger logr.Logger
}
// NewGenericProvider returns a GenericProvider reporting clusterName, or
// "unknown" when clusterName is empty.
func NewGenericProvider(clusterName string, logger logr.Logger) *GenericProvider {
	provider := &GenericProvider{clusterName: "unknown", logger: logger}
	if clusterName != "" {
		provider.clusterName = clusterName
	}
	return provider
}
// Name reports the identifier of this provider, "generic".
func (p *GenericProvider) Name() string {
	const providerName = "generic"
	return providerName
}
// ClusterName returns the name stored on the provider at construction time.
// It never fails.
func (p *GenericProvider) ClusterName(ctx context.Context) (string, error) {
	name := p.clusterName
	return name, nil
}
// Region always reports "unknown": the generic provider has no way to
// discover a region. It never fails.
func (p *GenericProvider) Region(ctx context.Context) (string, error) {
	const unknownRegion = "unknown"
	return unknownRegion, nil
}
The provider factory automatically detects the cloud environment:
// DetectProvider selects the Provider for the current environment.
//
// Order of precedence:
//  1. the CLOUD_PROVIDER environment variable (explicit override),
//  2. EKS, 3. KIND, 4. GKE, 5. AKS (auto-detection),
//  6. the generic fallback.
//
// If EKS is detected but its provider cannot be constructed, the error is
// logged and detection continues with the remaining candidates. The generic
// fallback is seeded with the CLUSTER_NAME environment variable so that the
// documented override is honoured there too; when it is unset the generic
// provider reports "unknown".
func DetectProvider(ctx context.Context, logger logr.Logger) (Provider, error) {
	// 1. Check explicit provider override
	if providerName := os.Getenv("CLOUD_PROVIDER"); providerName != "" {
		return createProvider(providerName, logger)
	}

	// 2. Try to detect EKS
	if isEKS() {
		provider, err := NewEKSProvider(logger)
		if err == nil {
			return provider, nil
		}
		// Not fatal: keep probing the other environments.
		logger.Error(err, "Failed to create EKS provider")
	}

	// 3. Check for KIND
	if isKind() {
		return NewKindProvider(logger), nil
	}

	// 4. Check for GKE (future)
	if isGKE() {
		return NewGKEProvider(logger), nil
	}

	// 5. Check for AKS (future)
	if isAKS() {
		return NewAKSProvider(logger), nil
	}

	// 6. Fallback to generic. Pass CLUSTER_NAME through; NewGenericProvider
	// maps an empty value to "unknown".
	logger.Info("No specific cloud provider detected, using generic")
	return NewGenericProvider(os.Getenv("CLUSTER_NAME"), logger), nil
}
// Detection functions
// isEKS reports whether the process appears to run on EKS. It returns true
// when the EKS secrets directory exists, or when the EC2 instance metadata
// service answers an instance-identity-document request.
//
// The metadata probe is bounded by a short timeout so detection fails fast
// on hosts where the metadata endpoint is unreachable.
func isEKS() bool {
	// Check for EKS-specific indicators
	if _, err := os.Stat("/var/run/secrets/eks.amazonaws.com"); err == nil {
		return true
	}

	// Check if running on EC2, without blocking startup indefinitely.
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	client := imds.New(imds.Options{})
	_, err := client.GetInstanceIdentityDocument(ctx, nil)
	return err == nil
}
// isKind reports whether KUBERNETES_SERVICE_HOST looks like a KIND control
// plane: either exactly "kind-control-plane" or containing ".kind".
func isKind() bool {
	host := os.Getenv("KUBERNETES_SERVICE_HOST")
	if host == "kind-control-plane" {
		return true
	}
	return strings.Contains(host, ".kind")
}
// isGKE reports whether the GCE metadata server answers a request for the
// instance's "cluster-name" attribute with HTTP 200.
//
// The request carries the "Metadata-Flavor: Google" header, which the
// metadata server requires, and uses a client with a short timeout so the
// probe fails fast when the metadata host is unreachable.
func isGKE() bool {
	req, err := http.NewRequest(http.MethodGet,
		"http://metadata.google.internal/computeMetadata/v1/instance/attributes/cluster-name",
		nil)
	if err != nil {
		return false
	}
	req.Header.Set("Metadata-Flavor", "Google")

	client := &http.Client{Timeout: 2 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return false
	}
	defer resp.Body.Close()

	return resp.StatusCode == http.StatusOK
}
// isAKS reports whether the Azure instance metadata service answers an
// instance query with HTTP 200. The request carries the "Metadata: true"
// header and is bounded by a two-second client timeout.
func isAKS() bool {
	// The error is ignored because the method and URL are fixed literals.
	probe, _ := http.NewRequest("GET", "http://169.254.169.254/metadata/instance?api-version=2021-02-01", nil)
	probe.Header.Add("Metadata", "true")

	httpClient := &http.Client{Timeout: 2 * time.Second}
	resp, err := httpClient.Do(probe)
	if err != nil {
		return false
	}
	defer resp.Body.Close()

	return resp.StatusCode == 200
}
package providers
import (
"context"
"github.com/go-logr/logr"
)
// MyCloudProvider is a template for a new Provider implementation.
type MyCloudProvider struct {
// logger is the logger handed to NewMyCloudProvider.
logger logr.Logger
// Add cloud-specific clients
}
// NewMyCloudProvider returns a MyCloudProvider that logs through logger.
// The template never fails; the error return is there for real
// implementations whose SDK client setup can fail.
func NewMyCloudProvider(logger logr.Logger) (*MyCloudProvider, error) {
	// Initialize cloud SDK clients
	provider := new(MyCloudProvider)
	provider.logger = logger
	return provider, nil
}
// Name reports the identifier of this provider, "mycloud".
func (p *MyCloudProvider) Name() string {
	const providerName = "mycloud"
	return providerName
}
// ClusterName is a template stub: it currently returns an empty name and a
// nil error. Replace the body with real discovery logic.
func (p *MyCloudProvider) ClusterName(ctx context.Context) (string, error) {
// Implement cluster name discovery, for example:
// 1. Check environment variables
// 2. Query cloud metadata service
// 3. Parse from node labels/annotations
return "", nil
}
// Region is a template stub: it currently returns an empty region and a nil
// error. Replace the body with real discovery logic.
func (p *MyCloudProvider) Region(ctx context.Context) (string, error) {
// Implement region discovery
// Usually from metadata service
return "", nil
}
// isMyCloud is a template stub for environment detection; it currently
// always returns false.
func isMyCloud() bool {
// Add detection logic
// Examples:
// - Check for specific files/directories
// - Try to reach metadata service
// - Check environment variables
// - Examine node labels
return false
}
// DetectProvider (excerpt) shows where a new provider hooks into
// auto-detection. The surrounding detection and fallback logic is elided,
// so this snippet is illustrative and not compilable on its own.
func DetectProvider(ctx context.Context, logger logr.Logger) (Provider, error) {
// ... existing detection logic ...
// Add your provider
if isMyCloud() {
provider, err := NewMyCloudProvider(logger)
if err == nil {
return provider, nil
}
// Construction failed: log the error and fall through to the remaining logic.
logger.Error(err, "Failed to create MyCloud provider")
}
// ... fallback logic ...
}
func TestMyCloudProvider_ClusterName(t *testing.T) {
tests := []struct {
name string
setup func()
want string
wantErr bool
}{
{
name: "from environment variable",
setup: func() {
os.Setenv("CLUSTER_NAME", "test-cluster")
},
want: "test-cluster",
},
{
name: "from metadata service",
setup: func() {
// Mock metadata service
},
want: "prod-cluster",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tt.setup()
provider := &MyCloudProvider{logger: logr.Discard()}
got, err := provider.ClusterName(context.Background())
if (err != nil) != tt.wantErr {
t.Errorf("ClusterName() error = %v, wantErr %v", err, tt.wantErr)
}
if got != tt.want {
t.Errorf("ClusterName() = %v, want %v", got, tt.want)
}
})
}
}
Providers can expose additional metadata:
// Extended provider interface (optional)
// ExtendedProvider is an optional superset of Provider for providers that
// can expose additional cloud-specific metadata.
type ExtendedProvider interface {
Provider
// Additional cloud-specific metadata
// InstanceType returns the instance type of the node.
InstanceType(ctx context.Context) (string, error)
// AvailabilityZone returns the availability zone of the node.
AvailabilityZone(ctx context.Context) (string, error)
// AccountID returns the cloud account identifier.
AccountID(ctx context.Context) (string, error)
}
// EKS implementation
// InstanceType returns the InstanceType field of the instance identity
// document, or the error from fetching that document.
func (p *EKSProvider) InstanceType(ctx context.Context) (string, error) {
	identity, err := p.getInstanceIdentityDocument(ctx)
	if err == nil {
		return identity.InstanceType, nil
	}
	return "", err
}
// AvailabilityZone returns the AvailabilityZone field of the instance
// identity document, or the error from fetching that document.
func (p *EKSProvider) AvailabilityZone(ctx context.Context) (string, error) {
	identity, err := p.getInstanceIdentityDocument(ctx)
	if err == nil {
		return identity.AvailabilityZone, nil
	}
	return "", err
}
// AccountID returns the AccountID field of the instance identity document,
// or the error from fetching that document.
func (p *EKSProvider) AccountID(ctx context.Context) (string, error) {
	identity, err := p.getInstanceIdentityDocument(ctx)
	if err == nil {
		return identity.AccountID, nil
	}
	return "", err
}
| Variable | Description | Example |
|---|---|---|
| `CLOUD_PROVIDER` | Force specific provider | `eks`, `gke`, `kind` |
| `CLUSTER_NAME` | Override cluster name | `prod-cluster` |
| `AWS_REGION` | AWS region (EKS) | `us-west-2` |
// Command-line flags mirroring the provider settings. All three default to
// the empty string.
// NOTE(review): flags is presumably a flag set declared elsewhere — confirm.
flags.String("cloud-provider", "", "Cloud provider (auto-detected if empty)")
flags.String("cluster-name", "", "Override cluster name")
flags.String("region", "", "Override region")
Providers should handle errors gracefully:
// ClusterName illustrates graceful error handling: the whole lookup is
// bounded by a 10-second timeout, the tag-based lookup is tried first, and
// on failure the endpoint-based lookup is used as a fallback. The tag
// failure is only logged at verbosity 1; the returned error wraps the
// fallback's failure.
func (p *EKSProvider) ClusterName(ctx context.Context) (string, error) {
	// Bound every metadata call made below.
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	// Primary source: instance tags.
	name, tagErr := p.getClusterNameFromTags(ctx)
	if tagErr == nil {
		return name, nil
	}
	p.logger.V(1).Info("Failed to get cluster name from tags", "error", tagErr)

	// Secondary source: the cluster endpoint.
	name, endpointErr := p.getClusterNameFromEndpoint()
	if endpointErr != nil {
		return "", fmt.Errorf("failed to determine cluster name: %w", endpointErr)
	}
	return name, nil
}
func TestEKSProvider_Region(t *testing.T) {
// Mock IMDS client
mockIMDS := &mockIMDSClient{
region: "us-west-2",
}
provider := &EKSProvider{
imdsClient: mockIMDS,
logger: logr.Discard(),
}
region, err := provider.Region(context.Background())
assert.NoError(t, err)
assert.Equal(t, "us-west-2", region)
}
// TestProviderDetection is an integration test, skipped under -short. It
// runs detection against the real environment and checks that the chosen
// provider yields a non-empty name, cluster name and region without error.
func TestProviderDetection(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping integration test")
	}

	provider, err := DetectProvider(context.Background(), logr.Discard())
	require.NoError(t, err)

	ctx := context.Background()
	assert.NotEmpty(t, provider.Name())

	gotCluster, err := provider.ClusterName(ctx)
	assert.NoError(t, err)
	assert.NotEmpty(t, gotCluster)

	gotRegion, err := provider.Region(ctx)
	assert.NoError(t, err)
	assert.NotEmpty(t, gotRegion)
}
// Prometheus metrics for provider detection and metadata lookups.
// NOTE(review): no registration call is visible in this snippet — confirm
// these collectors are registered elsewhere.
var (
// providerDetections counts detection attempts, labelled by provider and
// by whether the attempt succeeded.
providerDetections = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "cloud_provider_detections_total",
Help: "Cloud provider detection attempts",
},
[]string{"provider", "success"},
)
// metadataCallDuration records how long metadata service calls take,
// labelled by provider and method.
metadataCallDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "cloud_provider_metadata_duration_seconds",
Help: "Duration of metadata service calls",
},
[]string{"provider", "method"},
)
)
// ClusterName illustrates logging around discovery: it logs the start at
// verbosity 1, times discoverClusterName, and logs the outcome together with
// the elapsed duration before returning the name or the error.
func (p *EKSProvider) ClusterName(ctx context.Context) (string, error) {
	p.logger.V(1).Info("Discovering EKS cluster name")

	startedAt := time.Now()
	name, err := p.discoverClusterName(ctx)
	elapsed := time.Since(startedAt)

	if err == nil {
		p.logger.Info("Successfully discovered cluster name",
			"cluster", name,
			"duration", elapsed)
		return name, nil
	}

	p.logger.Error(err, "Failed to discover cluster name",
		"duration", elapsed)
	return "", err
}
- Fail Fast: Set reasonable timeouts for metadata calls
- Cache Results: Provider metadata rarely changes
- Graceful Fallback: Try multiple discovery methods
- Clear Errors: Provide context in error messages
- Test Coverage: Mock external dependencies
// GKEProvider is the Provider implementation sketch for Google GKE.
type GKEProvider struct {
// metadataClient is used to read instance attributes and the zone.
metadataClient *metadata.Client
// logger is the provider's logger.
logger logr.Logger
}
// ClusterName reads the "cluster-name" instance attribute through the
// metadata client and returns it together with any lookup error.
func (p *GKEProvider) ClusterName(ctx context.Context) (string, error) {
	name, err := p.metadataClient.InstanceAttributeValue("cluster-name")
	return name, err
}
// Region derives the region from the zone reported by the metadata client by
// keeping the first two dash-separated segments (us-central1-a ->
// us-central1). It returns the lookup error, or an "invalid zone format"
// error when the zone has fewer than two segments.
func (p *GKEProvider) Region(ctx context.Context) (string, error) {
	zone, err := p.metadataClient.Zone()
	if err != nil {
		return "", err
	}

	segments := strings.Split(zone, "-")
	if len(segments) < 2 {
		return "", fmt.Errorf("invalid zone format: %s", zone)
	}
	return strings.Join(segments[:2], "-"), nil
}
// AKSProvider is the Provider implementation sketch for Azure AKS.
type AKSProvider struct {
// metadataClient holds an Azure default credential.
// NOTE(review): the field is named like a metadata client but typed as a
// credential — confirm which is intended.
metadataClient *azidentity.DefaultAzureCredential
// logger is the provider's logger.
logger logr.Logger
}
// ClusterName (excerpt) sketches reading the cluster name from Azure IMDS
// compute tags. The snippet only builds the request; sending it, parsing
// the response and returning a value are elided, so it is not compilable
// as shown.
func (p *AKSProvider) ClusterName(ctx context.Context) (string, error) {
// Get from Azure IMDS
req, _ := http.NewRequest("GET",
"http://169.254.169.254/metadata/instance/compute/tags?api-version=2021-02-01",
nil)
// Sent with every Azure IMDS request on this page (see the same header in isAKS).
req.Header.Add("Metadata", "true")
// Parse response for cluster tag
// ...
}
For a detailed feature comparison between cloud providers, see Cloud Providers Comparison.
- Architecture Overview - System design
- Configuration Guide - All configuration options
- Custom Collectors - Building extensible components
Source code: internal/kubernetes/cluster/