diff --git a/pkg/providers/v1/instances_v2.go b/pkg/providers/v1/instances_v2.go index f7a08a2d16..8dbd7649ba 100644 --- a/pkg/providers/v1/instances_v2.go +++ b/pkg/providers/v1/instances_v2.go @@ -27,6 +27,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" cloudprovider "k8s.io/cloud-provider" + "k8s.io/klog/v2" ) func (c *Cloud) getProviderID(ctx context.Context, node *v1.Node) (string, error) { @@ -82,8 +83,11 @@ func (c *Cloud) getAdditionalLabels(ctx context.Context, zoneName string, instan // If topology labels are already set, skip. if _, ok := existingLabels[LabelNetworkNodePrefix+"1"]; !ok { nodeTopology, err := c.instanceTopologyManager.GetNodeTopology(ctx, instanceType, region, instanceID) + // We've seen some edge cases where this functionality is problematic, so swallowing errors and logging + // to avoid short-circuiting syncing nodes. If it's an intermittent issue, the labels will be added + // on subsequent attempts. if err != nil { - return nil, err + klog.Warningf("Failed to get node topology. Moving on without setting labels: %q", err) } else if nodeTopology != nil { for index, networkNode := range nodeTopology.NetworkNodes { layer := index + 1 diff --git a/pkg/providers/v1/instances_v2_test.go b/pkg/providers/v1/instances_v2_test.go index ecbf1b8bbd..f0fae9bf0f 100644 --- a/pkg/providers/v1/instances_v2_test.go +++ b/pkg/providers/v1/instances_v2_test.go @@ -30,6 +30,7 @@ import ( "github.com/stretchr/testify/mock" v1 "k8s.io/api/core/v1" "k8s.io/cloud-provider-aws/pkg/resourcemanagers" + "k8s.io/cloud-provider-aws/pkg/services" ) func TestGetProviderId(t *testing.T) { @@ -236,6 +237,30 @@ func TestInstanceMetadata(t *testing.T) { // Validate that labels are unchanged. assert.Equal(t, map[string]string{}, result.AdditionalLabels) }) + + t.Run("Should swallow errors if getting node topology fails", func(t *testing.T) { + instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true) + c, _ := mockInstancesResp(&instance, []*ec2.Instance{&instance}) + var mockedTopologyManager resourcemanagers.MockedInstanceTopologyManager + c.instanceTopologyManager = &mockedTopologyManager + mockedTopologyManager.On("GetNodeTopology", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil, + services.NewMockAPIError("InvalidParameterValue", "Nope.")) + node := &v1.Node{ + Spec: v1.NodeSpec{ + ProviderID: fmt.Sprintf("aws:///us-west-2c/1abc-2def/%s", *instance.InstanceId), + }, + } + + result, err := c.InstanceMetadata(context.TODO(), node) + if err != nil { + t.Errorf("Should not error getting InstanceMetadata: %s", err) + } + + mockedTopologyManager.AssertNumberOfCalls(t, "GetNodeTopology", 1) + assert.Equal(t, map[string]string{ + LabelZoneID: "az1", + }, result.AdditionalLabels) + }) } func getCloudWithMockedDescribeInstances(instanceExists bool, instanceState string) *Cloud { diff --git a/pkg/resourcemanagers/topology.go b/pkg/resourcemanagers/topology.go index 0f5cfd37ca..2682644bc6 100644 --- a/pkg/resourcemanagers/topology.go +++ b/pkg/resourcemanagers/topology.go @@ -114,6 +114,11 @@ func (t *instanceTopologyManager) addUnsupported(key string) { } func (t *instanceTopologyManager) mightSupportTopology(instanceType string, region string) bool { + // In the case of fargate and possibly other variants, the instance type will be empty. + if len(instanceType) == 0 { + return false + } + if _, exists, err := t.unsupportedKeyStore.GetByKey(region); exists { return false } else if err != nil { diff --git a/pkg/resourcemanagers/topology_test.go b/pkg/resourcemanagers/topology_test.go index d889a3d420..976452b0f1 100644 --- a/pkg/resourcemanagers/topology_test.go +++ b/pkg/resourcemanagers/topology_test.go @@ -26,6 +26,21 @@ import ( ) func TestGetNodeTopology(t *testing.T) { + t.Run("Should skip nodes that don't have instance type set", func(t *testing.T) { + mockedEc2SdkV2 := services.MockedEc2SdkV2{} + topologyManager := NewInstanceTopologyManager(&mockedEc2SdkV2) + // Loop multiple times to check cache use + topology, err := topologyManager.GetNodeTopology(context.TODO(), "" /* empty instance type */, "some-region", "some-id") + if err != nil { + t.Errorf("Should not error getting node topology: %s", err) + } + if topology != nil { + t.Errorf("Should not be returning a topology: %v", topology) + } + + mockedEc2SdkV2.AssertNumberOfCalls(t, "DescribeInstanceTopology", 0) + }) + t.Run("Should handle unsupported regions and utilize cache", func(t *testing.T) { mockedEc2SdkV2 := services.MockedEc2SdkV2{} topologyManager := NewInstanceTopologyManager(&mockedEc2SdkV2)