Skip to content

Commit

Permalink
Merge pull request #2469 from Azure/release-2.6.0.0
Browse files Browse the repository at this point in the history
Merge release 2.6.0.0 into master
  • Loading branch information
kevinclark19a authored Jan 14, 2022
2 parents d2f61b0 + 5dd5abe commit 73e1ce6
Show file tree
Hide file tree
Showing 121 changed files with 7,035 additions and 1,271 deletions.
30 changes: 25 additions & 5 deletions azurelinuxagent/common/cgroupapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from azurelinuxagent.common.conf import get_agent_pid_file_path
from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, ExtensionOperationError
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.osutil import systemd
from azurelinuxagent.common.utils import fileutil, shellutil
from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output, \
TELEMETRY_MESSAGE_MAX_LEN
Expand Down Expand Up @@ -201,6 +202,24 @@ def get_process_cgroup_paths(self, process_id):

return cpu_cgroup_path, memory_cgroup_path

def get_unit_cgroup_paths(self, unit_name):
    """
    Builds the paths of the cpu and memory cgroups of the given systemd unit.

    The unit's "ControlGroup" property is read through systemd.get_unit_property; its first
    character (the leading '/' in the example below) is dropped and the remainder is appended
    to the mount point of each controller, as returned by get_cgroup_mount_points().

    Ex: ControlGroup=/azure.slice/walinuxagent.service
        relative path = azure.slice/walinuxagent.service

    Returns a tuple (cpu_cgroup_path, memory_cgroup_path); an item is None when the mount
    point of the corresponding controller is None.
    """
    unit_control_group = systemd.get_unit_property(unit_name, "ControlGroup")
    cpu_root, memory_root = self.get_cgroup_mount_points()

    def path_under(mount_point):
        # No mount point for this controller: there is no path to report.
        if mount_point is None:
            return None
        # Dropping the first character keeps os.path.join from treating the control group as an
        # absolute path, which would discard the mount point.
        return os.path.join(mount_point, unit_control_group[1:])

    return path_under(cpu_root), path_under(memory_root)

@staticmethod
def get_cgroup2_controllers():
"""
Expand Down Expand Up @@ -230,16 +249,16 @@ def _is_systemd_failure(scope_name, stderr):
return unit_not_found in stderr or scope_name not in stderr

@staticmethod
def get_extension_cgroup_name(extension_name):
def get_extension_slice_name(extension_name):
# Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects.
return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_')
return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice"

def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure):
scope = "{0}_{1}".format(cmd_name, uuid.uuid4())
extension_slice_name = self.get_extension_cgroup_name(extension_name)
extension_slice_name = self.get_extension_slice_name(extension_name)
with self._systemd_run_commands_lock:
process = subprocess.Popen( # pylint: disable=W1509
"systemd-run --unit={0} --scope --slice={1}.slice {2}".format(scope, extension_slice_name, command),
"systemd-run --unit={0} --scope --slice={1} {2}".format(scope, extension_slice_name, command),
shell=shell,
cwd=cwd,
stdout=stdout,
Expand All @@ -255,7 +274,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh
logger.info("Started extension in unit '{0}'", scope_name)

try:
cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name + ".slice")
cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name)

cpu_cgroup_mountpoint, _ = self.get_cgroup_mount_points()

Expand All @@ -264,6 +283,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh
else:
cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)
CGroupsTelemetry.track_cgroup(CpuCgroup(extension_name, cpu_cgroup_path))

except IOError as e:
if e.errno == 2: # 'No such file or directory'
logger.info("The extension command already completed; will not track resource usage")
Expand Down
233 changes: 198 additions & 35 deletions azurelinuxagent/common/cgroupconfigurator.py

Large diffs are not rendered by default.

28 changes: 15 additions & 13 deletions azurelinuxagent/common/cgroupstelemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
class CGroupsTelemetry(object):
"""
"""
_tracked = []
_tracked = {}
_track_throttled_time = False
_rlock = threading.RLock()

Expand All @@ -47,37 +47,37 @@ def track_cgroup(cgroup):

with CGroupsTelemetry._rlock:
if not CGroupsTelemetry.is_tracked(cgroup.path):
CGroupsTelemetry._tracked.append(cgroup)
CGroupsTelemetry._tracked[cgroup.path] = cgroup
logger.info("Started tracking cgroup {0}", cgroup)

@staticmethod
def is_tracked(path):
"""
Returns true if the given item is in the list of tracked items
O(n) operation. But limited to few cgroup objects we have.
O(1) operation.
"""
with CGroupsTelemetry._rlock:
for cgroup in CGroupsTelemetry._tracked:
if path == cgroup.path:
return True
if path in CGroupsTelemetry._tracked:
return True

return False

@staticmethod
def stop_tracking(cgroup):
"""
Stop tracking the cgroups for the given name
Stop tracking the cgroups for the given path
"""
with CGroupsTelemetry._rlock:
CGroupsTelemetry._tracked.remove(cgroup)
logger.info("Stopped tracking cgroup {0}", cgroup)
if cgroup.path in CGroupsTelemetry._tracked:
CGroupsTelemetry._tracked.pop(cgroup.path)
logger.info("Stopped tracking cgroup {0}", cgroup)

@staticmethod
def poll_all_tracked():
metrics = []

inactive_cgroups = []
with CGroupsTelemetry._rlock:
for cgroup in CGroupsTelemetry._tracked[:]:
for cgroup in CGroupsTelemetry._tracked.values():
try:
metrics.extend(cgroup.get_tracked_metrics(track_throttled_time=CGroupsTelemetry._track_throttled_time))
except Exception as e:
Expand All @@ -89,12 +89,14 @@ def poll_all_tracked():
logger.periodic_warn(logger.EVERY_HOUR, '[PERIODIC] Could not collect metrics for cgroup '
'{0}. Error : {1}'.format(cgroup.name, ustr(e)))
if not cgroup.is_active():
CGroupsTelemetry.stop_tracking(cgroup)
inactive_cgroups.append(cgroup)
for inactive_cgroup in inactive_cgroups:
CGroupsTelemetry.stop_tracking(inactive_cgroup)

return metrics

@staticmethod
def reset():
with CGroupsTelemetry._rlock:
CGroupsTelemetry._tracked *= 0 # emptying the list
CGroupsTelemetry._tracked.clear() # emptying the dictionary
CGroupsTelemetry._track_throttled_time = False
42 changes: 40 additions & 2 deletions azurelinuxagent/common/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
"Debug.CgroupLogMetrics": False,
"Debug.CgroupDisableOnProcessCheckFailure": True,
"Debug.CgroupDisableOnQuotaCheckFailure": True,
"Debug.EnableFastTrack": False,
"Debug.EnableFastTrack": True,
}


Expand All @@ -159,6 +159,8 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
"ResourceDisk.MountOptions": None,
"ResourceDisk.Filesystem": "ext3",
"AutoUpdate.GAFamily": "Prod",
"Debug.CgroupMonitorExpiryTime": "2022-01-31",
"Debug.CgroupMonitorExtensionName": "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent",
}


Expand All @@ -182,6 +184,8 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
# versions of the Agent.
#
"Debug.CgroupCheckPeriod": 300,
"Debug.AgentCpuQuota": 75,
"Debug.EtpCollectionPeriod": 300
}


Expand Down Expand Up @@ -532,10 +536,44 @@ def get_cgroup_disable_on_quota_check_failure(conf=__conf__):
"""
return conf.get_switch("Debug.CgroupDisableOnQuotaCheckFailure", True)


def get_agent_cpu_quota(conf=__conf__):
    """
    Returns the CPU quota for the agent, expressed as a percentage of 1 CPU (100% == 1 CPU).
    The value is read from "Debug.AgentCpuQuota"; 75 is passed to conf.get_int as the fallback value.

    NOTE: This option is experimental and may be removed in later versions of the Agent.
    """
    agent_cpu_quota = conf.get_int("Debug.AgentCpuQuota", 75)
    return agent_cpu_quota

def get_cgroup_monitor_expiry_time(conf=__conf__):
    """
    Returns the value of "Debug.CgroupMonitorExpiryTime": the expiry time after which cgroups
    monitoring is disabled. "2022-01-31" is passed to conf.get as the fallback value.

    NOTE: This option is experimental and may be removed in later versions of the Agent.
    """
    return conf.get("Debug.CgroupMonitorExpiryTime", "2022-01-31")

def get_cgroup_monitor_extension_name(conf=__conf__):
    """
    Returns the value of "Debug.CgroupMonitorExtensionName": the cgroups monitoring extension name.
    "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent" is passed to conf.get as the fallback value.

    NOTE: This option is experimental and may be removed in later versions of the Agent.
    """
    return conf.get("Debug.CgroupMonitorExtensionName", "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent")


def get_enable_fast_track(conf=__conf__):
"""
If True, the agent uses FastTrack when retrieving goal states
NOTE: This option is experimental and may be removed in later versions of the Agent.
"""
return conf.get_switch("Debug.EnableFastTrack", False)
return conf.get_switch("Debug.EnableFastTrack", True)


def get_etp_collection_period(conf=__conf__):
    """
    Returns the frequency at which ETP collection is performed on extensions telemetry events.
    The value is read from "Debug.EtpCollectionPeriod"; 300 is passed to conf.get_int as the fallback value.

    NOTE: This option is experimental and may be removed in later versions of the Agent.
    """
    etp_collection_period = conf.get_int("Debug.EtpCollectionPeriod", 300)
    return etp_collection_period
1 change: 1 addition & 0 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ class WALAEventOperation:
Unknown = "Unknown"
Upgrade = "Upgrade"
Update = "Update"
VmSettings = "VmSettings"


SHOULD_ENCODE_MESSAGE_LEN = 80
Expand Down
16 changes: 14 additions & 2 deletions azurelinuxagent/common/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,21 @@ class ExtensionDownloadError(ExtensionError):
"""


class ExtensionConfigError(ExtensionError):
class ExtensionsGoalStateError(ExtensionError):
"""
Error raised when extension config file is malformed
Error raised when the ExtensionsGoalState is malformed
"""


class ExtensionsConfigError(ExtensionsGoalStateError):
"""
Error raised when the ExtensionsConfig is malformed
"""


class VmSettingsError(ExtensionsGoalStateError):
"""
Error raised when the VmSettings are malformed
"""


Expand Down
2 changes: 1 addition & 1 deletion azurelinuxagent/common/logcollector_manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,4 @@
echo,### Gathering Disk Info ###
diskinfo,
"""
"""
4 changes: 2 additions & 2 deletions azurelinuxagent/common/osutil/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def _get_osutil(distro_name, distro_code_name, distro_version, distro_full_name)
if distro_name == "kali":
return DebianOSBaseUtil()

if distro_name == "coreos" or distro_code_name == "coreos":
if distro_name in ("flatcar", "coreos") or distro_code_name in ("flatcar", "coreos"):
return CoreOSUtil()

if distro_name in ("suse", "sle_hpc", "sles", "opensuse"):
Expand All @@ -98,7 +98,7 @@ def _get_osutil(distro_name, distro_code_name, distro_version, distro_full_name)

return DebianOSBaseUtil()

if distro_name in ("redhat", "rhel", "centos", "oracle", "almalinux"):
if distro_name in ("redhat", "rhel", "centos", "oracle", "almalinux", "cloudlinux"):
if Version(distro_version) < Version("7"):
return Redhat6xOSUtil()

Expand Down
Loading

0 comments on commit 73e1ce6

Please sign in to comment.