Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop' into to_public
Browse files Browse the repository at this point in the history
  • Loading branch information
banrieen committed Sep 24, 2020
2 parents c46263c + 9251ed5 commit 083b056
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
11 changes: 7 additions & 4 deletions src/ClusterManager/job_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,9 @@ def TakeJobActions(data_handler, redis_conn, launcher, jobs):
singleJobInfo["globalResInfo"] = ResourceInfo({jobGpuType : GetJobTotalGpu(job_params)})
singleJobInfo["jobtrainingtype"] = job_params["jobtrainingtype"]
singleJobInfo["resourcegpu"] = job_params["resourcegpu"]
singleJobInfo["numpsworker"] = job_params["numpsworker"] if "numpsworker" in job_params else 1
singleJobInfo["pernoderesource"] = int(job_params["resourcegpu"])/int(job_params["numpsworker"])

# Job lists are sorted by the following criteria, in priority order:
# 1. non-preemptible precedes preemptible
# 2. running precedes scheduling, precedes queued
Expand Down Expand Up @@ -639,10 +642,10 @@ def TakeJobActions(data_handler, redis_conn, launcher, jobs):
if not sji["preemptionAllowed"] and vc_resource.CanSatisfy(sji["globalResInfo"]) and vc_user_quota_resource.CanSatisfy(sji["globalResInfo"]):
if sji["job"]["jobStatus"] == "queued":
if sji["deviceType"] in detail_resources:
if sji["jobtrainingtype"] == "PSDistJob" and max(detail_resources[sji["deviceType"]]) < sji["resourcegpu"]:
if sji["jobtrainingtype"] == "PSDistJob" and quota.caculate_n_th_max(detail_resources[sji["deviceType"]],sji["numpsworker"]) < sji["pernoderesource"]:
continue
else:
if sji["jobtrainingtype"] != "PSDistJob" and max(detail_resources[sji["deviceType"]]) < (sji["globalResInfo"].CategoryToCountMap)[sji["deviceType"]]:
if sji["jobtrainingtype"] != "PSDistJob" and max(detail_resources[sji["deviceType"]]) < sji["pernoderesource"]:
continue
vc_resource.Subtract(sji["globalResInfo"])
vc_user_quota_resource.Subtract(sji["globalResInfo"])
Expand All @@ -658,10 +661,10 @@ def TakeJobActions(data_handler, redis_conn, launcher, jobs):
logger.info([sji["jobtrainingtype"], detail_resources,sji["deviceType"], sji["resourcegpu"],(sji["globalResInfo"].CategoryToCountMap)[sji["deviceType"]]])
if sji["job"]["jobStatus"] == "queued":
if sji["deviceType"] in detail_resources:
if sji["jobtrainingtype"] == "PSDistJob" and max(detail_resources[sji["deviceType"]]) < sji["resourcegpu"]:
if sji["jobtrainingtype"] == "PSDistJob" and quota.caculate_n_th_max(detail_resources[sji["deviceType"]],sji["numpsworker"]) < sji["pernoderesource"]:
continue
else:
if sji["jobtrainingtype"] != "PSDistJob" and max(detail_resources[sji["deviceType"]]) < (sji["globalResInfo"].CategoryToCountMap)[sji["deviceType"]]:
if sji["jobtrainingtype"] != "PSDistJob" and max(detail_resources[sji["deviceType"]]) < sji["pernoderesource"]:
continue
logger.info("TakeJobActions : job : %s : %s" % (sji["jobId"], sji["globalResInfo"].CategoryToCountMap))
# A strict FIFO policy is not required for global (bonus) tokens, since these jobs are preemptible anyway.
Expand Down
2 changes: 1 addition & 1 deletion src/ClusterManager/node_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def get_cluster_status():
if node_status["gpuType"] and node_status["gpuType"] in gpuMapping:
gpuMapping[node_status["gpuType"]]["detail"].append({"nodeName": node_name,
"capacity": int(node_status["gpu_capacity"][node_status["gpuType"]]) if node_status["gpuType"] in node_status["gpu_capacity"] else 0,
"allocatable": max(int(node_status["gpu_allocatable"][node_status["gpuType"]])-int(node_status["gpu_used"][node_status["gpuType"]]) if node_status["gpuType"] in node_status["gpu_used"] else 0,0)
"allocatable": max(int(node_status["gpu_allocatable"][node_status["gpuType"]])-(int(node_status["gpu_used"][node_status["gpuType"]]) if node_status["gpuType"] in node_status["gpu_used"] else 0),0)
if node_status["gpuType"] in node_status["gpu_allocatable"] else 0})
gpu_used.Add(ResourceInfo(node_status["gpu_used"]))
gpu_capacity.Add(ResourceInfo(node_status["gpu_capacity"]))
Expand Down

0 comments on commit 083b056

Please sign in to comment.