
Merge pull request htcondor#60 from JaimeFrey/HTCONDOR-613-slurm-resource-reporting

Fix slurm resource usage reporting. HTCONDOR-613
JaimeFrey authored Aug 2, 2021
2 parents 1adbfd0 + ff70397 commit b30f6bf
Showing 1 changed file with 19 additions and 11 deletions.
src/scripts/slurm_status.py (30 changes: 19 additions & 11 deletions)
@@ -318,17 +318,17 @@ def get_finished_job_stats(jobid, cluster):
     sacct = get_slurm_location('sacct')
     if cluster != "":
         sacct += " -M %s" % cluster
-    log("Querying sacct for completed job for jobid: %s" % (str(jobid)))
+    log("Querying sacct for completed job for jobid: %s" % (jobid))

     # List of attributes required from sacct
-    attributes = "UserCPU,SystemCPU,MaxRSS,ExitCode"
-    child_stdout = os.popen("%s -j %s --noconvert -P --format %s" % (sacct, str(jobid), attributes))
+    attributes = "JobID,UserCPU,SystemCPU,MaxRSS,ExitCode"
+    child_stdout = os.popen("%s -j %s --noconvert -P --format %s" % (sacct, jobid, attributes))
     sacct_data = child_stdout.readlines()
     ret = child_stdout.close()

     if ret:
         # retry without --noconvert for slurm < 15.8
-        child_stdout = os.popen("%s -j %s -P --format %s" % (sacct, str(jobid), attributes))
+        child_stdout = os.popen("%s -j %s -P --format %s" % (sacct, jobid, attributes))
         sacct_data = child_stdout.readlines()
         child_stdout.close()

@@ -338,22 +338,26 @@ def get_finished_job_stats(jobid, cluster):
         log("Unable to read in CSV output from sacct: %s" % str(e))
         return return_dict

-    # Slurm can return more than 1 row, for some odd reason.
-    # so sum up relevant values
+    # Slurm can return multiple rows, one for the overall job and
+    # others for portions of the job (one for each srun invocation and
+    # 'batch' for the non-srun parts).
     for row in reader:
-        if row["UserCPU"] != "":
+        # Take CPU usage values from the overall job line
+        if row["UserCPU"] != "" and row["JobID"] == jobid:
             try:
                 return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["UserCPU"])
             except:
                 log("Failed to parse CPU usage for job id %s: %s" % (jobid, row["UserCPU"]))
                 raise

-        if row["SystemCPU"] != "":
+        if row["SystemCPU"] != "" and row["JobID"] == jobid:
             try:
                 return_dict['RemoteSysCpu'] += convert_cpu_to_seconds(row["SystemCPU"])
             except:
                 log("Failed to parse CPU usage for job id %s: %s" % (jobid, row["SystemCPU"]))
-                raise 
+                raise
+
+        # Take the largest value of MaxRSS across all lines
         if row["MaxRSS"] != "":
             # Remove the trailing [KMGTP] and scale the value appropriately
             # Note: We assume that all values will have a suffix, and we
@@ -375,11 +379,15 @@ def get_finished_job_stats(jobid, cluster):
             else:
                 # The last value is not a letter (or unrecognized scaling factor), and is in bytes, convert to k
                 value = str(int(value) / 1024)
-            return_dict["ImageSize"] += int(float(value.strip('KMGTP'))) * factor
+            mem_kb = int(float(value.strip('KMGTP'))) * factor
+            if mem_kb > return_dict["ImageSize"]:
+                return_dict["ImageSize"] = mem_kb
             except:
                 log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"]))
                 raise
-        if row["ExitCode"] != "":
+
+        # Take ExitCode from the overall job line
+        if row["ExitCode"] != "" and row["JobID"] == jobid:
             try:
                 return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0])
             except:
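
The key change is the JobID filter: sacct can report one row for the job as a whole plus one row per step, and summing across all of them inflated the totals. Below is a minimal standalone sketch of the row selection, using made-up sacct -P output (the job id 12345, the step names, and all field values are illustrative, not taken from this commit):

import csv

# Made-up sacct -P output: one row for the whole job, plus one for the batch
# step and one for an srun step. All values are purely illustrative.
sample = [
    "JobID|UserCPU|SystemCPU|MaxRSS|ExitCode",
    "12345|00:03:10|00:00:12||0:0",
    "12345.batch|00:00:05|00:00:02|1024K|0:0",
    "12345.0|00:03:05|00:00:10|204800K|0:0",
]

jobid = "12345"
stats = {}

for row in csv.DictReader(sample, delimiter="|"):
    # Only the overall job line (JobID equal to the bare job id) carries the
    # totals we want; counting the step rows too would double-count usage.
    if row["JobID"] == jobid:
        stats["UserCPU"] = row["UserCPU"]
        stats["SystemCPU"] = row["SystemCPU"]
        stats["ExitCode"] = int(row["ExitCode"].split(":")[0])

print(stats)  # {'UserCPU': '00:03:10', 'SystemCPU': '00:00:12', 'ExitCode': 0}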
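MaxRSS gets the opposite treatment: each row reports that step's own peak resident set size, so the script now keeps the largest value instead of adding them up. A rough sketch of that logic under the same illustrative assumptions as above; the helper name maxrss_to_kb and the sample values are hypothetical, not part of the script:

# Scale a sacct MaxRSS string to kilobytes using its K/M/G/T/P suffix.
# Hypothetical helper for illustration; the real script does this inline.
def maxrss_to_kb(value):
    factors = {'K': 1, 'M': 1024, 'G': 1024**2, 'T': 1024**3, 'P': 1024**4}
    if value and value[-1] in factors:
        return int(float(value[:-1])) * factors[value[-1]]
    return int(value) // 1024  # no suffix: assume bytes, convert to kilobytes

image_size = 0
for reported in ["1024K", "204800K"]:  # MaxRSS column of the step rows above
    # Keep the maximum across rows rather than summing them.
    image_size = max(image_size, maxrss_to_kb(reported))

print(image_size)  # 204800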
