
Merge pull request htcondor#60 from JaimeFrey/HTCONDOR-613-slurm-resource-reporting

Fix slurm resource usage reporting. HTCONDOR-613
JaimeFrey authored Aug 2, 2021
2 parents 1adbfd0 + ff70397 commit b30f6bf
Showing 1 changed file with 19 additions and 11 deletions.
src/scripts/slurm_status.py (30 changes: 19 additions & 11 deletions)
@@ -318,17 +318,17 @@ def get_finished_job_stats(jobid, cluster):
     sacct = get_slurm_location('sacct')
     if cluster != "":
         sacct += " -M %s" % cluster
-    log("Querying sacct for completed job for jobid: %s" % (str(jobid)))
+    log("Querying sacct for completed job for jobid: %s" % (jobid))

     # List of attributes required from sacct
-    attributes = "UserCPU,SystemCPU,MaxRSS,ExitCode"
-    child_stdout = os.popen("%s -j %s --noconvert -P --format %s" % (sacct, str(jobid), attributes))
+    attributes = "JobID,UserCPU,SystemCPU,MaxRSS,ExitCode"
+    child_stdout = os.popen("%s -j %s --noconvert -P --format %s" % (sacct, jobid, attributes))
     sacct_data = child_stdout.readlines()
     ret = child_stdout.close()

     if ret:
         # retry without --noconvert for slurm < 15.8
-        child_stdout = os.popen("%s -j %s -P --format %s" % (sacct, str(jobid), attributes))
+        child_stdout = os.popen("%s -j %s -P --format %s" % (sacct, jobid, attributes))
         sacct_data = child_stdout.readlines()
         child_stdout.close()

@@ -338,22 +338,26 @@ def get_finished_job_stats(jobid, cluster):
         log("Unable to read in CSV output from sacct: %s" % str(e))
         return return_dict

-    # Slurm can return more than 1 row, for some odd reason.
-    # so sum up relevant values
+    # Slurm can return multiple rows, one for the overall job and
+    # others for portions of the job (one for each srun invocation and
+    # 'batch' for the non-srun parts).
     for row in reader:
-        if row["UserCPU"] != "":
+        # Take CPU usage values from the overall job line
+        if row["UserCPU"] != "" and row["JobID"] == jobid:
             try:
                 return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["UserCPU"])
             except:
                 log("Failed to parse CPU usage for job id %s: %s" % (jobid, row["UserCPU"]))
                 raise

-        if row["SystemCPU"] != "":
+        if row["SystemCPU"] != "" and row["JobID"] == jobid:
             try:
                 return_dict['RemoteSysCpu'] += convert_cpu_to_seconds(row["SystemCPU"])
             except:
                 log("Failed to parse CPU usage for job id %s: %s" % (jobid, row["SystemCPU"]))
-                raise 
+                raise
+
+        # Take the largest value of MaxRSS across all lines
         if row["MaxRSS"] != "":
             # Remove the trailing [KMGTP] and scale the value appropriately
             # Note: We assume that all values will have a suffix, and we
@@ -375,11 +379,15 @@ def get_finished_job_stats(jobid, cluster):
             else:
                 # The last value is not a letter (or unrecognized scaling factor), and is in bytes, convert to k
                 value = str(int(value) / 1024)
-            return_dict["ImageSize"] += int(float(value.strip('KMGTP'))) * factor
+            mem_kb = int(float(value.strip('KMGTP'))) * factor
+            if mem_kb > return_dict["ImageSize"]:
+                return_dict["ImageSize"] = mem_kb
             except:
                 log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"]))
                 raise
-        if row["ExitCode"] != "":
+
+        # Take ExitCode from the overall job line
+        if row["ExitCode"] != "" and row["JobID"] == jobid:
             try:
                 return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0])
             except:
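
The key change is the JobID filter: sacct can report one row for the job as a whole plus one row per step, and summing across all of them inflated the totals. Below is a minimal standalone sketch of the row selection, using made-up sacct -P output (the job id 12345, the step names, and all field values are illustrative, not taken from this commit):

import csv

# Made-up sacct -P output: one row for the whole job, plus one for the batch
# step and one for an srun step. All values are purely illustrative.
sample = [
    "JobID|UserCPU|SystemCPU|MaxRSS|ExitCode",
    "12345|00:03:10|00:00:12||0:0",
    "12345.batch|00:00:05|00:00:02|1024K|0:0",
    "12345.0|00:03:05|00:00:10|204800K|0:0",
]

jobid = "12345"
stats = {}

for row in csv.DictReader(sample, delimiter="|"):
    # Only the overall job line (JobID equal to the bare job id) carries the
    # totals we want; counting the step rows too would double-count usage.
    if row["JobID"] == jobid:
        stats["UserCPU"] = row["UserCPU"]
        stats["SystemCPU"] = row["SystemCPU"]
        stats["ExitCode"] = int(row["ExitCode"].split(":")[0])

print(stats)  # {'UserCPU': '00:03:10', 'SystemCPU': '00:00:12', 'ExitCode': 0}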
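MaxRSS gets the opposite treatment: each row reports that step's own peak resident set size, so the script now keeps the largest value instead of adding them up. A rough sketch of that logic under the same illustrative assumptions as above; the helper name maxrss_to_kb and the sample values are hypothetical, not part of the script:

# Scale a sacct MaxRSS string to kilobytes using its K/M/G/T/P suffix.
# Hypothetical helper for illustration; the real script does this inline.
def maxrss_to_kb(value):
    factors = {'K': 1, 'M': 1024, 'G': 1024**2, 'T': 1024**3, 'P': 1024**4}
    if value and value[-1] in factors:
        return int(float(value[:-1])) * factors[value[-1]]
    return int(value) // 1024  # no suffix: assume bytes, convert to kilobytes

image_size = 0
for reported in ["1024K", "204800K"]:  # MaxRSS column of the step rows above
    # Keep the maximum across rows rather than summing them.
    image_size = max(image_size, maxrss_to_kb(reported))

print(image_size)  # 204800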
