Skip to content

Commit

Permalink
gce: improve ephemeral fallback NIC selection (CPC-2578) (#4163)
Browse files Browse the repository at this point in the history
While setting up the ephemeral network config for fetching the IMD at
init-local timeframe on multi-NIC instances, fulfilling:

- All NICs without carrier flag or more than one NIC with carrier flag
- systemd's predictable interface names is enabled

net.find_fallback_nic could select a NIC that is not the primary NIC,
leaving the instance without access to the network, as on GCE, only the
primary NIC can talk to the IMDS.

At this point in time, there is yet not access to
{instance,vendor,user}-data. Thus, it is tricky to dynamically inject a
breadcrum pointing to the primary NIC.

Two actions have been taken to fix this situation on the
DataSourceGCE:

1. Substitute eth0 with ens4 as the default primary NIC candidate.
2. Try through the list of candidate NICs and use the first that can
reach the IMDS.
  • Loading branch information
aciba90 committed Jun 14, 2023
1 parent d1268cd commit df2a053
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 9 deletions.
54 changes: 46 additions & 8 deletions cloudinit/sources/DataSourceGCE.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
import datetime
import json
from base64 import b64decode
from contextlib import suppress as noop

from cloudinit import dmi
from cloudinit import log as logging
from cloudinit import sources, url_helper, util
from cloudinit import net, sources, url_helper, util
from cloudinit.distros import ug_util
from cloudinit.event import EventScope, EventType
from cloudinit.net.dhcp import NoDHCPLeaseError
from cloudinit.net.ephemeral import EphemeralDHCPv4
from cloudinit.sources import DataSourceHostname

Expand All @@ -26,6 +26,7 @@
)
HOSTKEY_NAMESPACE = "hostkeys"
HEADERS = {"Metadata-Flavor": "Google"}
DEFAULT_PRIMARY_INTERFACE = "ens4"


class GoogleMetadataFetcher:
Expand Down Expand Up @@ -88,13 +89,50 @@ def __init__(self, sys_cfg, distro, paths):

def _get_data(self):
url_params = self.get_url_params()
network_context = noop()
if self.perform_dhcp_setup:
network_context = EphemeralDHCPv4(
self.distro,
self.fallback_interface,
)
with network_context:
candidate_nics = net.find_candidate_nics()
if DEFAULT_PRIMARY_INTERFACE in candidate_nics:
candidate_nics.remove(DEFAULT_PRIMARY_INTERFACE)
candidate_nics.insert(0, DEFAULT_PRIMARY_INTERFACE)
LOG.debug("Looking for the primary NIC in: %s", candidate_nics)
assert (
len(candidate_nics) >= 1
), "The instance has to have at least one candidate NIC"
for candidate_nic in candidate_nics:
network_context = EphemeralDHCPv4(
self.distro,
iface=candidate_nic,
)
try:
with network_context:
try:
ret = util.log_time(
LOG.debug,
"Crawl of GCE metadata service",
read_md,
kwargs={
"address": self.metadata_address,
"url_params": url_params,
},
)
except Exception as e:
LOG.debug(
"Error fetching IMD with candidate NIC %s: %s",
candidate_nic,
e,
)
continue
except NoDHCPLeaseError:
continue
if ret["success"]:
self._fallback_interface = candidate_nic
LOG.debug("Primary NIC found: %s.", candidate_nic)
break
if self._fallback_interface is None:
LOG.warning(
"Did not find a fallback interface on %s.", self.cloud_name
)
else:
ret = util.log_time(
LOG.debug,
"Crawl of GCE metadata service",
Expand Down
70 changes: 69 additions & 1 deletion tests/unittests/sources/test_gce.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import responses

from cloudinit import distros, helpers, settings
from cloudinit.net.dhcp import NoDHCPLeaseError
from cloudinit.sources import DataSourceGCE
from tests.unittests import helpers as test_helpers

Expand Down Expand Up @@ -61,6 +62,8 @@


class TestDataSourceGCE(test_helpers.ResponsesTestCase):
with_logs = True

def _make_distro(self, dtype, def_user=None):
cfg = dict(settings.CFG_BUILTIN)
cfg["system_info"]["distro"] = dtype
Expand Down Expand Up @@ -401,8 +404,11 @@ def test_publish_host_keys(self, m_readurl):
M_PATH + "EphemeralDHCPv4",
autospec=True,
)
@mock.patch(M_PATH + "net.find_candidate_nics", return_value=["ens4"])
@mock.patch(M_PATH + "DataSourceGCELocal.fallback_interface")
def test_local_datasource_uses_ephemeral_dhcp(self, _m_fallback, m_dhcp):
def test_local_datasource_uses_ephemeral_dhcp(
self, _m_fallback, _m_find_candidate_nics, m_dhcp
):
self._set_mock_metadata()
distro = mock.MagicMock()
distro.get_tmp_exec_path = self.tmp_dir
Expand All @@ -412,6 +418,68 @@ def test_local_datasource_uses_ephemeral_dhcp(self, _m_fallback, m_dhcp):
ds._get_data()
assert m_dhcp.call_count == 1

@mock.patch(M_PATH + "util.log_time")
@mock.patch(
M_PATH + "EphemeralDHCPv4",
autospec=True,
)
@mock.patch(M_PATH + "net.find_candidate_nics")
@mock.patch(M_PATH + "DataSourceGCELocal.fallback_interface")
def test_local_datasource_tries_on_multi_nic(
self, _m_fallback, m_find_candidate_nics, m_dhcp, m_read_md
):
self._set_mock_metadata()
distro = mock.MagicMock()
distro.get_tmp_exec_path = self.tmp_dir
ds = DataSourceGCE.DataSourceGCELocal(
sys_cfg={}, distro=distro, paths=None
)
m_find_candidate_nics.return_value = [
"ens0p4",
"ens0p5",
"ens0p6",
"ens4",
]
m_dhcp.return_value.__enter__.side_effect = (
None,
NoDHCPLeaseError,
None,
None,
)
m_read_md.side_effect = (
{"success": False},
Exception("whoopsie, not this one"),
{
"success": True,
"platform_reports_gce": True,
"reason": "reason",
"user-data": "ud",
"meta-data": "md",
},
)
assert ds._get_data() is True
assert m_dhcp.call_args_list == [
mock.call(distro, iface=DataSourceGCE.DEFAULT_PRIMARY_INTERFACE),
mock.call(distro, iface="ens0p4"),
mock.call(distro, iface="ens0p5"),
mock.call(distro, iface="ens0p6"),
]
assert ds._fallback_interface == "ens0p6"
assert ds.metadata == "md"
assert ds.userdata_raw == "ud"

expected_logs = (
"Looking for the primary NIC in:"
" ['ens4', 'ens0p4', 'ens0p5', 'ens0p6']",
"Error fetching IMD with candidate NIC ens0p5:"
" whoopsie, not this one",
)
for msg in expected_logs:
self.assertIn(
msg,
self.logs.getvalue(),
)

@mock.patch(
M_PATH + "EphemeralDHCPv4",
autospec=True,
Expand Down

0 comments on commit df2a053

Please sign in to comment.