From b27645cd5e58f0c5d0ae0e1f35ea6bc38f66c39b Mon Sep 17 00:00:00 2001 From: Todor Ivanov Date: Tue, 25 Jul 2023 19:13:02 +0200 Subject: [PATCH 1/4] Switch to psutils for resource utilization monitoring at runtime. Typo && Remove unneded import of SubprocessAlgos --- .../WMRuntime/Monitors/PerformanceMonitor.py | 48 +++++++------------ 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py b/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py index 4508786a7a..5af926b898 100644 --- a/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py +++ b/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py @@ -14,8 +14,8 @@ import os.path import signal import time +import psutils -import WMCore.Algorithms.SubprocessAlgos as subprocessAlgos import WMCore.FwkJobReport.Report as Report from WMCore.WMException import WMException from WMCore.WMRuntime.Monitors.WMRuntimeMonitor import WMRuntimeMonitor @@ -84,8 +84,6 @@ def __init__(self): self.pid = None self.uid = os.getuid() - self.monitorBase = "ps -p %i -o pid,ppid,rss,pcpu,pmem,cmd -ww | grep %i" - self.pssMemoryCommand = "awk '/^Pss/ {pss += $2} END {print pss}' /proc/%i/smaps" self.monitorCommand = None self.currentStepSpace = None self.currentStepName = None @@ -208,38 +206,24 @@ def periodicUpdate(self): # Then we have no step PID, we can do nothing return - # Now we run the ps monitor command and collate the data - # Gathers RSS, %CPU and %MEM statistics from ps - ps_cmd = self.monitorBase % (stepPID, stepPID) - stdout, _stderr, _retcode = subprocessAlgos.runCommand(ps_cmd) - - ps_output = stdout.split() - if not len(ps_output) > 6: - # Then something went wrong in getting the ps data - msg = "Error when grabbing output from process ps\n" - msg += "output = %s\n" % ps_output - msg += "command = %s\n" % ps_cmd - logging.error(msg) - return - - # run the command to gather PSS memory statistics from /proc//smaps - smaps_cmd = self.pssMemoryCommand % (stepPID) - stdout, _stderr, _retcode = subprocessAlgos.runCommand(smaps_cmd) - - smaps_output = stdout.split() - if not len(smaps_output) == 1: - # Then something went wrong in getting the smaps data - msg = "Error when grabbing output from smaps\n" - msg += "output = %s\n" % smaps_output - msg += "command = %s\n" % smaps_cmd - logging.error(msg) + # Now we run the psutil module and collate some process data. + # We gather info about CPU system time and Memory statistics - VMS, PSS, RSS + stepProc = psutils.Process(stepPID) + if not stepProc.is_running(): + # The step process has vanished before we managed to finish the current periodic update + msg = " The step process with PID: %s of STEP: %s " + msg += "has finished execution before the last periodic resource update.\n" + logging.warning(msg, stepPID, self.currentStepName) return - # smaps also returns data in kiloBytes, let's make it megaBytes - # I'm also confused with these megabytes and mebibytes... - pss = int(smaps_output[0]) // 1000 + stepCmd = stepProc.cmdline() + stepMemInfo = stepProc.memory_full_info() + stepCpuInfo = stepProc.cpu_times() + # NOTE: All the information from psutil.memory_*info() comes in Bytes + # we need to make it in MegaBytes + pss = int(stepMemInfo.pss) // (1000**2) - logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s", smaps_output[0], ps_output[2], ps_output[3], ps_output[4]) + logging.info("PID: %s; VirtMEM: %s, PSS: %s; RSS: %s; SystemTime: %s;", stepPID, stepMemInfo.vms, stepMemInfo.pss, stepMemInfo.rss, stepCpuInfo.system) msg = 'Error in CMSSW step %s\n' % self.currentStepName msg += 'Number of Cores: %s\n' % self.numOfCores From 1aebd82000f79ab838e9e9c4deb609ad7c97e95a Mon Sep 17 00:00:00 2001 From: Todor Ivanov Date: Tue, 25 Jul 2023 21:01:53 +0200 Subject: [PATCH 2/4] Fetch process statistics in one shot --- .../WMCore/WMRuntime/Monitors/PerformanceMonitor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py b/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py index 5af926b898..6324ee550d 100644 --- a/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py +++ b/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py @@ -216,9 +216,11 @@ def periodicUpdate(self): logging.warning(msg, stepPID, self.currentStepName) return - stepCmd = stepProc.cmdline() - stepMemInfo = stepProc.memory_full_info() - stepCpuInfo = stepProc.cpu_times() + with stepProc.oneshot(): + stepCmd = stepProc.cmdline() + stepMemInfo = stepProc.memory_full_info() + stepCpuInfo = stepProc.cpu_times() + # NOTE: All the information from psutil.memory_*info() comes in Bytes # we need to make it in MegaBytes pss = int(stepMemInfo.pss) // (1000**2) From 1d5fa1d9884fa0e9d3af032e01cbe985fb5fa58a Mon Sep 17 00:00:00 2001 From: Todor Ivanov Date: Wed, 26 Jul 2023 11:31:06 +0200 Subject: [PATCH 3/4] Wrong module name --- src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py b/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py index 6324ee550d..01bbd39de4 100644 --- a/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py +++ b/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py @@ -14,7 +14,7 @@ import os.path import signal import time -import psutils +import psutil import WMCore.FwkJobReport.Report as Report from WMCore.WMException import WMException @@ -208,7 +208,7 @@ def periodicUpdate(self): # Now we run the psutil module and collate some process data. # We gather info about CPU system time and Memory statistics - VMS, PSS, RSS - stepProc = psutils.Process(stepPID) + stepProc = psutil.Process(stepPID) if not stepProc.is_running(): # The step process has vanished before we managed to finish the current periodic update msg = " The step process with PID: %s of STEP: %s " From e62d81614a4a156a1e8d6872fc8c38c2a3fc7392 Mon Sep 17 00:00:00 2001 From: Todor Ivanov Date: Wed, 26 Jul 2023 13:24:36 +0200 Subject: [PATCH 4/4] Add UserTime to the step log message --- src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py b/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py index 01bbd39de4..60a5a5682e 100644 --- a/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py +++ b/src/python/WMCore/WMRuntime/Monitors/PerformanceMonitor.py @@ -225,7 +225,7 @@ def periodicUpdate(self): # we need to make it in MegaBytes pss = int(stepMemInfo.pss) // (1000**2) - logging.info("PID: %s; VirtMEM: %s, PSS: %s; RSS: %s; SystemTime: %s;", stepPID, stepMemInfo.vms, stepMemInfo.pss, stepMemInfo.rss, stepCpuInfo.system) + logging.info("PID: %s; VirtMEM: %s; PSS: %s; RSS: %s; SystemTime: %s; UserTime: %s;", stepPID, stepMemInfo.vms, stepMemInfo.pss, stepMemInfo.rss, stepCpuInfo.system, stepCpuInfo.user) msg = 'Error in CMSSW step %s\n' % self.currentStepName msg += 'Number of Cores: %s\n' % self.numOfCores