Merge pull request #64 from brianhlin/sw2929_fix_mem_parsing

Fix Slurm/PBS memory parsing (SOFTWARE-2929)
prelz · Oct 30, 2017 · 5fa75e4 · 5fa75e4
2 parents af091f9 + e4dbcd4
commit 5fa75e4
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 22 deletions.
diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py
@@ -330,25 +330,48 @@ def get_finished_job_stats(jobid):
         except Exception, e:
             log("Unable to read in CSV output from sacct: %s" % str(e))
             return return_dict
-
-        sacct_parser = {'RemoteUserCpu': lambda orig, results: orig + \
-                        convert_cpu_to_seconds(results["AveCPU"]) * int(results["AllocCPUS"]),
-                        'ImageSize': lambda orig, results: orig + int(results["MaxRSS"].replace('K', '')),
-                        'ExitCode': lambda orig, results: int(results["ExitCode"].split(":")[0])}
+
         # Slurm can return more than 1 row, for some odd reason.
         # so sum up relevant values
         for row in reader:
-            for attr, func in sacct_parser.items():
+            if row["AveCPU"] is not "":
                 try:
-                    return_dict[attr] = func(return_dict[attr], row)
-                except (ValueError, KeyError), exc:
-                    log("Could not parse %s for Jobid %s: %s" % (attr, jobid, exc))
-
-    # PBS completion
+                    return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"])
+                except:
+                    log("Failed to parse CPU usage for job id %s: %s, %s" % (jobid, row["AveCPU"], row["AllocCPUS"]))
+                    raise
+            if row["MaxRSS"] is not "":
+                # Strip the trailing [KMGTP] unit suffix and scale the value so the
+                # result is in kilobytes ('K' keeps factor 1). We assume a suffix.
+                # NOTE(review): the "+=" below is nested under the 'P' branch -- verify.
+                try:
+                    value = row["MaxRSS"]
+                    factor = 1
+                    if value[-1] == 'M':
+                        factor = 1024
+                    elif value[-1] == 'G':
+                        factor = 1024 * 1024
+                    elif value[-1] == 'T':
+                        factor = 1024 * 1024 * 1024
+                    elif value[-1] == 'P':
+                        factor = 1024 * 1024 * 1024 * 1024
+                        return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor
+                except:
+                    log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"]))
+                    raise
+            if row["ExitCode"] is not "":
+                try:
+                    return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0])
+                except:
+                    log("Failed to parse ExitCode for job id %s: %s" % (jobid, row["ExitCode"]))
+                    raise
+
+    # PBS completion        
     elif _cluster_type_cache == "pbs":
         pass
 
     return return_dict
+
 
 _qstat_location_cache = None
 def get_qstat_location():

diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py
@@ -319,22 +319,43 @@ def get_finished_job_stats(jobid):
     except Exception, e:
         log("Unable to read in CSV output from sacct: %s" % str(e))
         return return_dict
-
-    sacct_parser = {'RemoteUserCpu': lambda orig, results: orig + \
-                    convert_cpu_to_seconds(results["AveCPU"]) * int(results["AllocCPUS"]),
-                    'ImageSize': lambda orig, results: orig + int(results["MaxRSS"].replace('K', '')),
-                    'ExitCode': lambda orig, results: int(results["ExitCode"].split(":")[0])}
+
     # Slurm can return more than 1 row, for some odd reason.
     # so sum up relevant values
     for row in reader:
-        for attr, func in sacct_parser.items():
+        if row["AveCPU"] is not "":
             try:
-                return_dict[attr] = func(return_dict[attr], row)
-            except (ValueError, KeyError), exc:
-                log("Could not parse %s for Jobid %s: %s" % (attr, jobid, exc))
-
+                return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"])
+            except:
+                log("Failed to parse CPU usage for job id %s: %s, %s" % (jobid, row["AveCPU"], row["AllocCPUS"]))
+                raise                
+        if row["MaxRSS"] is not "":
+            # Strip the trailing [KMGTP] unit suffix and scale the value so the
+            # result is in kilobytes ('K' keeps factor 1). We assume a suffix.
+            # NOTE(review): the "+=" below is nested under the 'P' branch -- verify.
+            try:
+                value = row["MaxRSS"]
+                factor = 1
+                if value[-1] == 'M':
+                    factor = 1024
+                elif value[-1] == 'G':
+                    factor = 1024 * 1024
+                elif value[-1] == 'T':
+                    factor = 1024 * 1024 * 1024
+                elif value[-1] == 'P':
+                    factor = 1024 * 1024 * 1024 * 1024
+                    return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor
+            except:
+                log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"]))
+                raise
+        if row["ExitCode"] is not "":
+            try:
+                return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0])
+            except:
+                log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"]))
+                raise
     return return_dict
-
+    
 
 _slurm_location_cache = None
 def get_slurm_location(program):