Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cgroup rewrite: uses systemctl for expressing desired configuration instead drop-in files #3269

Merged
merged 11 commits into from
Dec 17, 2024
Merged
2 changes: 2 additions & 0 deletions azurelinuxagent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,8 @@ def collect_logs(self, is_full_mode):

log_collector_cgroup = cgroup_api.get_process_cgroup(process_id="self", cgroup_name=AGENT_LOG_COLLECTOR)
tracked_controllers = log_collector_cgroup.get_controllers()
for controller in tracked_controllers:
logger.info("{0} controller for cgroup: {1}".format(controller.get_controller_type(), controller))

if len(tracked_controllers) != len(log_collector_cgroup.get_supported_controller_names()):
event.warn(WALAEventOperation.LogCollection, "At least one required controller is missing. The following controllers are required for the log collector to run: {0}", log_collector_cgroup.get_supported_controller_names())
Expand Down
36 changes: 36 additions & 0 deletions azurelinuxagent/common/osutil/systemd.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,39 @@ def get_unit_property(unit_name, property_name):
raise ValueError("Can't find property {0} of {1}".format(property_name, unit_name))
return match.group('value')


def set_unit_property_run_time(unit_name, property_name, value):
"""
Set a property of a unit at runtime

Note: --runtime settings only apply until the next reboot
"""
try:
# Ex: systemctl set-property foobar.service CPUWeight=200 --runtime
shellutil.run_command(["systemctl", "set-property", unit_name, "{0}={1}".format(property_name, value), "--runtime"])
except shellutil.CommandError as e:
raise ValueError("Can't set property {0} of {1}: {2}".format(property_name, unit_name, e))


def set_unit_properties_run_time(unit_name, properties):
"""
Set multiple properties of a unit at runtime

Note: --runtime settings only apply until the next reboot
"""
try:
# Ex: systemctl set-property foobar.service CPUWeight=200 MemoryMax=2G IPAccounting=yes --runtime
shellutil.run_command(["systemctl", "set-property", unit_name] + properties + ["--runtime"])
except shellutil.CommandError as e:
raise ValueError("Can't set properties {0} of {1}: {2}".format(properties, unit_name, e))


def is_unit_loaded(unit_name):
"""
Determine if a unit is loaded
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does it mean for a unit to be loaded

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unit being loaded means unit is parsed and available to the systemd process. We check this for extension services when we set quotas. If systemd is unaware of extension services and not loaded in the system yet, we get error while setting quotas. Hence, I added loaded check.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If systemd is unaware of extension services and not loaded in the system yet, we get error while setting quotas. Hence, I added loaded check.

can you add this as a comment where you call this function?

"""
try:
value = get_unit_property(unit_name, "LoadState")
return value.lower() == "loaded"
except shellutil.CommandError:
return False
6 changes: 1 addition & 5 deletions azurelinuxagent/ga/cgroupapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class CGroupUtil(object):
Cgroup utility methods which are independent of systemd cgroup api.
"""
@staticmethod
def cgroups_supported():
def distro_supported():
distro_info = get_distro()
distro_name = distro_info[0]
try:
Expand Down Expand Up @@ -650,8 +650,6 @@ def get_controllers(self, expected_relative_path=None):
controller = MemoryControllerV1(self._cgroup_name, controller_path)

if controller is not None:
msg = "{0} controller for cgroup: {1}".format(supported_controller_name, controller)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Logging here is a side-effect of calling get_controllers which being called everywhere. As a result, we are simply logging at random places where it's not needed. So, removing it here

log_cgroup_info(msg)
controllers.append(controller)

return controllers
Expand Down Expand Up @@ -729,8 +727,6 @@ def get_controllers(self, expected_relative_path=None):
controller = MemoryControllerV2(self._cgroup_name, self._cgroup_path)

if controller is not None:
msg = "{0} controller for cgroup: {1}".format(supported_controller_name, controller)
log_cgroup_info(msg)
controllers.append(controller)

return controllers
Expand Down
555 changes: 287 additions & 268 deletions azurelinuxagent/ga/cgroupconfigurator.py

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion azurelinuxagent/ga/cgroupcontroller.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def is_active(self):
' Internal error: {1}'.format(self.path, ustr(e)))
return False

def get_tracked_metrics(self, **_):
def get_tracked_metrics(self):
"""
Retrieves the current value of the metrics tracked for this controller/cgroup and returns them as an array.
"""
Expand All @@ -173,3 +173,9 @@ def get_unit_properties(self):
Returns a list of the unit properties to collect for the controller.
"""
raise NotImplementedError()

def get_controller_type(self):
"""
Returns the name of the controller.
"""
raise NotImplementedError()
14 changes: 2 additions & 12 deletions azurelinuxagent/ga/cgroupstelemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,8 @@ class CGroupsTelemetry(object):
"""
"""
_tracked = {}
_track_throttled_time = False
_rlock = threading.RLock()

@staticmethod
def set_track_throttled_time(value):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

track throttled time by default as part of CPU metrics, no explicit enable

CGroupsTelemetry._track_throttled_time = value

@staticmethod
def get_track_throttled_time():
return CGroupsTelemetry._track_throttled_time

@staticmethod
def track_cgroup_controller(cgroup_controller):
"""
Expand Down Expand Up @@ -79,7 +70,7 @@ def poll_all_tracked():
with CGroupsTelemetry._rlock:
for controller in CGroupsTelemetry._tracked.values():
try:
metrics.extend(controller.get_tracked_metrics(track_throttled_time=CGroupsTelemetry._track_throttled_time))
metrics.extend(controller.get_tracked_metrics())
except Exception as e:
# There can be scenarios when the CGroup has been deleted by the time we are fetching the values
# from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
Expand All @@ -98,5 +89,4 @@ def poll_all_tracked():
@staticmethod
def reset():
with CGroupsTelemetry._rlock:
CGroupsTelemetry._tracked.clear() # emptying the dictionary
CGroupsTelemetry._track_throttled_time = False
CGroupsTelemetry._tracked.clear() # emptying the dictionary
2 changes: 1 addition & 1 deletion azurelinuxagent/ga/collect_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ def get_max_recorded_metrics(self):
def _poll_resource_usage(self):
metrics = []
for controller in self.controllers:
metrics.extend(controller.get_tracked_metrics(track_throttled_time=True))
metrics.extend(controller.get_tracked_metrics())

for metric in metrics:
current_max = self.max_recorded_metrics.get(metric.counter)
Expand Down
12 changes: 7 additions & 5 deletions azurelinuxagent/ga/cpucontroller.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,24 +108,26 @@ def get_cpu_throttled_time(self, read_previous_throttled_time=True):
"""
raise NotImplementedError()

def get_tracked_metrics(self, **kwargs):
def get_tracked_metrics(self):
# Note: If the current cpu usage is less than the previous usage (metric is negative), then an empty array will
# be returned and the agent won't track the metrics.
tracked = []
cpu_usage = self.get_cpu_usage()
if cpu_usage >= float(0):
tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage))

if 'track_throttled_time' in kwargs and kwargs['track_throttled_time']:
throttled_time = self.get_cpu_throttled_time()
if cpu_usage >= float(0) and throttled_time >= float(0):
tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time))
throttled_time = self.get_cpu_throttled_time()
if cpu_usage >= float(0) and throttled_time >= float(0):
tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time))

return tracked

def get_unit_properties(self):
return ["CPUAccounting", "CPUQuotaPerSecUSec"]

def get_controller_type(self):
return "cpu"


class CpuControllerV1(_CpuController):
def initialize_cpu_usage(self):
Expand Down
14 changes: 7 additions & 7 deletions azurelinuxagent/ga/exthandlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1337,11 +1337,10 @@ def set_extension_resource_limits(self):
# setup the resource limits for extension operations and it's services.
man = self.load_manifest()
resource_limits = man.get_resource_limits()
if not CGroupConfigurator.get_instance().is_extension_resource_limits_setup_completed(extension_name,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this check getting removed? Will we set up resource limits on each operation for the ext?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar check was added in cgroupconfigurator setup_slice method. We don't touch the configuration again if limits already in place

cpu_quota=resource_limits.get_extension_slice_cpu_quota()):
CGroupConfigurator.get_instance().setup_extension_slice(
extension_name=extension_name, cpu_quota=resource_limits.get_extension_slice_cpu_quota())
CGroupConfigurator.get_instance().set_extension_services_cpu_memory_quota(resource_limits.get_service_list())

CGroupConfigurator.get_instance().setup_extension_slice(
extension_name=extension_name, cpu_quota=resource_limits.get_extension_slice_cpu_quota())
CGroupConfigurator.get_instance().set_extension_services_cpu_memory_quota(resource_limits.get_service_list())

def create_status_file_if_not_exist(self, extension, status, code, operation, message):
_, status_path = self.get_status_file_path(extension)
Expand Down Expand Up @@ -1476,7 +1475,7 @@ def uninstall(self, extension=None):
resource_limits = man.get_resource_limits()
CGroupConfigurator.get_instance().stop_tracking_extension_services_cgroups(
resource_limits.get_service_list())
CGroupConfigurator.get_instance().remove_extension_services_drop_in_files(
CGroupConfigurator.get_instance().reset_extension_services_quota(
resource_limits.get_service_list())

uninstall_cmd = man.get_uninstall_command()
Expand All @@ -1502,8 +1501,9 @@ def on_rmtree_error(_, __, exc_info):

shutil.rmtree(base_dir, onerror=on_rmtree_error)

CGroupConfigurator.get_instance().stop_tracking_extension_cgroups(self.get_full_name())
self.logger.info("Remove the extension slice: {0}".format(self.get_full_name()))
CGroupConfigurator.get_instance().remove_extension_slice(
CGroupConfigurator.get_instance().reset_extension_quota(
extension_name=self.get_full_name())

except IOError as e:
Expand Down
3 changes: 3 additions & 0 deletions azurelinuxagent/ga/memorycontroller.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ def get_tracked_metrics(self, **_):
def get_unit_properties(self):
return["MemoryAccounting"]

def get_controller_type(self):
return "memory"


class MemoryControllerV1(_MemoryController):
def get_memory_usage(self):
Expand Down
2 changes: 1 addition & 1 deletion tests/ga/test_cgroupapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_cgroups_should_be_supported_only_on_ubuntu16_centos7dot4_redhat7dot4_an

for (distro, supported) in test_cases:
with patch("azurelinuxagent.ga.cgroupapi.get_distro", return_value=distro):
self.assertEqual(CGroupUtil.cgroups_supported(), supported, "cgroups_supported() failed on {0}".format(distro))
self.assertEqual(CGroupUtil.distro_supported(), supported, "cgroups_supported() failed on {0}".format(distro))


class SystemdCgroupsApiTestCase(AgentTestCase):
Expand Down
Loading
Loading