diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index a2f5f0b..5d315b2 100755 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -318,17 +318,17 @@ def get_finished_job_stats(jobid, cluster): sacct = get_slurm_location('sacct') if cluster != "": sacct += " -M %s" % cluster - log("Querying sacct for completed job for jobid: %s" % (str(jobid))) + log("Querying sacct for completed job for jobid: %s" % (jobid)) # List of attributes required from sacct - attributes = "UserCPU,SystemCPU,MaxRSS,ExitCode" - child_stdout = os.popen("%s -j %s --noconvert -P --format %s" % (sacct, str(jobid), attributes)) + attributes = "JobID,UserCPU,SystemCPU,MaxRSS,ExitCode" + child_stdout = os.popen("%s -j %s --noconvert -P --format %s" % (sacct, jobid, attributes)) sacct_data = child_stdout.readlines() ret = child_stdout.close() if ret: # retry without --noconvert for slurm < 15.8 - child_stdout = os.popen("%s -j %s -P --format %s" % (sacct, str(jobid), attributes)) + child_stdout = os.popen("%s -j %s -P --format %s" % (sacct, jobid, attributes)) sacct_data = child_stdout.readlines() child_stdout.close() @@ -338,22 +338,26 @@ def get_finished_job_stats(jobid, cluster): log("Unable to read in CSV output from sacct: %s" % str(e)) return return_dict - # Slurm can return more than 1 row, for some odd reason. - # so sum up relevant values + # Slurm can return multiple rows, one for the overall job and + # others for portions of the job (one for each srun invocation and + # 'batch' for the non-srun parts). for row in reader: - if row["UserCPU"] != "": + # Take CPU usage values from the overall job line + if row["UserCPU"] != "" and row["JobID"] == jobid: try: return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["UserCPU"]) except: log("Failed to parse CPU usage for job id %s: %s" % (jobid, row["UserCPU"])) raise - if row["SystemCPU"] != "": + if row["SystemCPU"] != "" and row["JobID"] == jobid: try: return_dict['RemoteSysCpu'] += convert_cpu_to_seconds(row["SystemCPU"]) except: log("Failed to parse CPU usage for job id %s: %s" % (jobid, row["SystemCPU"])) - raise + raise + + # Take the largest value of MaxRSS across all lines if row["MaxRSS"] != "": # Remove the trailing [KMGTP] and scale the value appropriately # Note: We assume that all values will have a suffix, and we @@ -375,11 +379,15 @@ def get_finished_job_stats(jobid, cluster): else: # The last value is not a letter (or unrecognized scaling factor), and is in bytes, convert to k value = str(int(value) / 1024) - return_dict["ImageSize"] += int(float(value.strip('KMGTP'))) * factor + mem_kb = int(float(value.strip('KMGTP'))) * factor + if mem_kb > return_dict["ImageSize"]: + return_dict["ImageSize"] = mem_kb except: log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"])) raise - if row["ExitCode"] != "": + + # Take ExitCode from the overall job line + if row["ExitCode"] != "" and row["JobID"] == jobid: try: return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) except: