From df2a053befc9639b9a2fd0ce5c9c832ff35357ab Mon Sep 17 00:00:00 2001 From: Alberto Contreras Date: Wed, 14 Jun 2023 08:44:54 +0200 Subject: [PATCH] gce: improve ephemeral fallback NIC selection (CPC-2578) (#4163) While setting up the ephemeral network config for fetching the IMD at init-local timeframe on multi-NIC instances, fulfilling: - All NICs without carrier flag or more than one NIC with carrier flag - systemd's predictable interface names is enabled net.find_fallback_nic could select a NIC that is not the primary NIC, leaving the instance without access to the network, as on GCE, only the primary NIC can talk to the IMDS. At this point in time, there is yet not access to {instance,vendor,user}-data. Thus, it is tricky to dynamically inject a breadcrum pointing to the primary NIC. Two actions have been taken to fix this situation on the DataSourceGCE: 1. Substitute eth0 with ens4 as the default primary NIC candidate. 2. Try through the list of candidate NICs and use the first that can reach the IMDS. --- cloudinit/sources/DataSourceGCE.py | 54 ++++++++++++++++++---- tests/unittests/sources/test_gce.py | 70 ++++++++++++++++++++++++++++- 2 files changed, 115 insertions(+), 9 deletions(-) diff --git a/cloudinit/sources/DataSourceGCE.py b/cloudinit/sources/DataSourceGCE.py index 27d6089ad46..2fc4186bf48 100644 --- a/cloudinit/sources/DataSourceGCE.py +++ b/cloudinit/sources/DataSourceGCE.py @@ -5,13 +5,13 @@ import datetime import json from base64 import b64decode -from contextlib import suppress as noop from cloudinit import dmi from cloudinit import log as logging -from cloudinit import sources, url_helper, util +from cloudinit import net, sources, url_helper, util from cloudinit.distros import ug_util from cloudinit.event import EventScope, EventType +from cloudinit.net.dhcp import NoDHCPLeaseError from cloudinit.net.ephemeral import EphemeralDHCPv4 from cloudinit.sources import DataSourceHostname @@ -26,6 +26,7 @@ ) HOSTKEY_NAMESPACE = "hostkeys" HEADERS = {"Metadata-Flavor": "Google"} +DEFAULT_PRIMARY_INTERFACE = "ens4" class GoogleMetadataFetcher: @@ -88,13 +89,50 @@ def __init__(self, sys_cfg, distro, paths): def _get_data(self): url_params = self.get_url_params() - network_context = noop() if self.perform_dhcp_setup: - network_context = EphemeralDHCPv4( - self.distro, - self.fallback_interface, - ) - with network_context: + candidate_nics = net.find_candidate_nics() + if DEFAULT_PRIMARY_INTERFACE in candidate_nics: + candidate_nics.remove(DEFAULT_PRIMARY_INTERFACE) + candidate_nics.insert(0, DEFAULT_PRIMARY_INTERFACE) + LOG.debug("Looking for the primary NIC in: %s", candidate_nics) + assert ( + len(candidate_nics) >= 1 + ), "The instance has to have at least one candidate NIC" + for candidate_nic in candidate_nics: + network_context = EphemeralDHCPv4( + self.distro, + iface=candidate_nic, + ) + try: + with network_context: + try: + ret = util.log_time( + LOG.debug, + "Crawl of GCE metadata service", + read_md, + kwargs={ + "address": self.metadata_address, + "url_params": url_params, + }, + ) + except Exception as e: + LOG.debug( + "Error fetching IMD with candidate NIC %s: %s", + candidate_nic, + e, + ) + continue + except NoDHCPLeaseError: + continue + if ret["success"]: + self._fallback_interface = candidate_nic + LOG.debug("Primary NIC found: %s.", candidate_nic) + break + if self._fallback_interface is None: + LOG.warning( + "Did not find a fallback interface on %s.", self.cloud_name + ) + else: ret = util.log_time( LOG.debug, "Crawl of GCE metadata service", diff --git a/tests/unittests/sources/test_gce.py b/tests/unittests/sources/test_gce.py index 1a89563c300..6c17fb2ffe2 100644 --- a/tests/unittests/sources/test_gce.py +++ b/tests/unittests/sources/test_gce.py @@ -14,6 +14,7 @@ import responses from cloudinit import distros, helpers, settings +from cloudinit.net.dhcp import NoDHCPLeaseError from cloudinit.sources import DataSourceGCE from tests.unittests import helpers as test_helpers @@ -61,6 +62,8 @@ class TestDataSourceGCE(test_helpers.ResponsesTestCase): + with_logs = True + def _make_distro(self, dtype, def_user=None): cfg = dict(settings.CFG_BUILTIN) cfg["system_info"]["distro"] = dtype @@ -401,8 +404,11 @@ def test_publish_host_keys(self, m_readurl): M_PATH + "EphemeralDHCPv4", autospec=True, ) + @mock.patch(M_PATH + "net.find_candidate_nics", return_value=["ens4"]) @mock.patch(M_PATH + "DataSourceGCELocal.fallback_interface") - def test_local_datasource_uses_ephemeral_dhcp(self, _m_fallback, m_dhcp): + def test_local_datasource_uses_ephemeral_dhcp( + self, _m_fallback, _m_find_candidate_nics, m_dhcp + ): self._set_mock_metadata() distro = mock.MagicMock() distro.get_tmp_exec_path = self.tmp_dir @@ -412,6 +418,68 @@ def test_local_datasource_uses_ephemeral_dhcp(self, _m_fallback, m_dhcp): ds._get_data() assert m_dhcp.call_count == 1 + @mock.patch(M_PATH + "util.log_time") + @mock.patch( + M_PATH + "EphemeralDHCPv4", + autospec=True, + ) + @mock.patch(M_PATH + "net.find_candidate_nics") + @mock.patch(M_PATH + "DataSourceGCELocal.fallback_interface") + def test_local_datasource_tries_on_multi_nic( + self, _m_fallback, m_find_candidate_nics, m_dhcp, m_read_md + ): + self._set_mock_metadata() + distro = mock.MagicMock() + distro.get_tmp_exec_path = self.tmp_dir + ds = DataSourceGCE.DataSourceGCELocal( + sys_cfg={}, distro=distro, paths=None + ) + m_find_candidate_nics.return_value = [ + "ens0p4", + "ens0p5", + "ens0p6", + "ens4", + ] + m_dhcp.return_value.__enter__.side_effect = ( + None, + NoDHCPLeaseError, + None, + None, + ) + m_read_md.side_effect = ( + {"success": False}, + Exception("whoopsie, not this one"), + { + "success": True, + "platform_reports_gce": True, + "reason": "reason", + "user-data": "ud", + "meta-data": "md", + }, + ) + assert ds._get_data() is True + assert m_dhcp.call_args_list == [ + mock.call(distro, iface=DataSourceGCE.DEFAULT_PRIMARY_INTERFACE), + mock.call(distro, iface="ens0p4"), + mock.call(distro, iface="ens0p5"), + mock.call(distro, iface="ens0p6"), + ] + assert ds._fallback_interface == "ens0p6" + assert ds.metadata == "md" + assert ds.userdata_raw == "ud" + + expected_logs = ( + "Looking for the primary NIC in:" + " ['ens4', 'ens0p4', 'ens0p5', 'ens0p6']", + "Error fetching IMD with candidate NIC ens0p5:" + " whoopsie, not this one", + ) + for msg in expected_logs: + self.assertIn( + msg, + self.logs.getvalue(), + ) + @mock.patch( M_PATH + "EphemeralDHCPv4", autospec=True,