HPCC-30118 Fix numWorkersPerPod, workers should not split resourced memory #17686

Merged
helm/hpcc/values.yaml (16 changes: 9 additions & 7 deletions)

@@ -753,13 +753,15 @@ roxie:
#ldapUser: roxie_file_access #add system username for accessing files
#egress: engineEgress

- ## The [manager/worker/eclAgent]Resources define the resource limits for each pod.
- ## workerMemory defines the memory requirements for each individual worker.
- ## If numWorkersPerPod is >1 (must be a factor of numWorkers) then the memory
- ## is divided evenly between the workers.
- ## In the absence of a workerMemory.query specification, the resourced pod memory
- ## will be split evenly between the workers in each worker pod.
- ## NB: numWorkersPerPod must be a factor of numWorkers
+ ## The [manager/worker/eclAgent]Resources define the resource limits for each container.
+ ## numWorkersPerPod may be >1 (it must be a factor of numWorkers).
+ ## NB: Each worker corresponds to a container, which will be resourced according to
+ ## workerResources, meaning that if numWorkersPerPod is N (>1), then N * workerResources.cpu,
+ ## N * workerResources.memory, etc. will be required in total for the pod.
+ ##
+ ## By default the available Thor memory will be based on the resourced container memory.
+ ## This can be overridden by setting [worker/manager]Memory.query and
+ ## [worker/manager]Memory.thirdParty.
thor:
- name: thor
  prefix: thor
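To make the new per-container model concrete, here is a hypothetical thor entry for values.yaml; a sketch only, using the keys named in the comment above (numWorkers, numWorkersPerPod, workerResources, workerMemory) with illustrative figures rather than shipped defaults:

```yaml
thor:
- name: thor
  prefix: thor
  numWorkers: 8         # total workers for this Thor cluster
  numWorkersPerPod: 2   # must be a factor of numWorkers, so 4 worker pods
  workerResources:
    cpu: "2"            # per worker container
    memory: "4Gi"       # per worker container
  # Each pod therefore requests 2 * 2 cpus and 2 * 4Gi memory in total.
  # After this change a worker's Thor memory is derived from its own 4Gi
  # container limit; it is no longer split between the workers in the pod.
  #workerMemory:
  #  query: "3Gi"        # optional override of the container-derived default
  #  thirdParty: "512Mi" # likewise an optional override
```

The design point is that resourcing moves from the pod to the container: raising numWorkersPerPod now scales the pod's aggregate request instead of subdividing a fixed pod budget.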
thorlcr/master/thmastermain.cpp (23 changes: 10 additions & 13 deletions)

@@ -814,8 +814,6 @@ int main( int argc, const char *argv[] )
if (!managerMemory->hasProp("@maxMemPercentage"))
managerMemory->setPropReal("@maxMemPercentage", localThor ? 25.0 : defaultPctSysMemForRoxie);
}
- // NB: if (cloud - numWorkersPerPod) or (bare-metal - slavesPerNode) is specified
- // the percentage will be split based on numWorkersPerPod or slavesPerNode (see if (numWorkersPerPodOrNode > 1) code below)
}
workerMemory->setPropInt("@total", gmemSize);

@@ -970,7 +968,6 @@
kjServiceMpTag = allocateClusterMPTag();

unsigned numWorkers = 0;
- unsigned numWorkersPerPodOrNode = 1; // pod in cloud, node in bare-metal
bool doWorkerRegistration = false;
if (isContainerized())
{
@@ -1009,7 +1006,6 @@
Owned<IWorkUnit> workunit = &wuRead->lock();
addTimeStamp(workunit, wfid, graphName, StWhenK8sStarted);
}
- numWorkersPerPodOrNode = numWorkersPerPod;

cloudJobName.appendf("%s-%s", workunit, graphName);

@@ -1033,17 +1029,18 @@
unsigned localThorPortInc = globals->getPropInt("@localThorPortInc", DEFAULT_SLAVEPORTINC);
unsigned slaveBasePort = globals->getPropInt("@slaveport", DEFAULT_THORSLAVEPORT);
Owned<IGroup> rawGroup = getClusterNodeGroup(thorname, "ThorCluster");
- numWorkersPerPodOrNode = globals->getPropInt("@slavesPerNode", 1);
- setClusterGroup(queryMyNode(), rawGroup, numWorkersPerPodOrNode, channelsPerWorker, slaveBasePort, localThorPortInc);
+ unsigned numWorkersPerNode = globals->getPropInt("@slavesPerNode", 1);
+ setClusterGroup(queryMyNode(), rawGroup, numWorkersPerNode, channelsPerWorker, slaveBasePort, localThorPortInc);
numWorkers = queryNodeClusterWidth();
doWorkerRegistration = true;
}
- if (numWorkersPerPodOrNode > 1)
- {
-     // NB: maxMemPercentage only be set when memory amounts have not explicily been defined (e.g. globalMemorySize)
-     double pct = workerMemory->getPropReal("@maxMemPercentage");
-     if (pct)
-         workerMemory->setPropReal("@maxMemPercentage", pct / numWorkersPerPodOrNode);
- }
+ if (numWorkersPerNode > 1)
+ {
+     // Split memory based on numWorkersPerNode
+     // NB: maxMemPercentage is only set when memory amounts have not explicitly been defined (e.g. globalMemorySize)
+     double pct = workerMemory->getPropReal("@maxMemPercentage");
+     if (pct)
+         workerMemory->setPropReal("@maxMemPercentage", pct / numWorkersPerNode);
+ }
}

if (doWorkerRegistration && registry->connect(numWorkers))
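For reference, a minimal self-contained C++ sketch of the rule this change leaves in place; workerMemPercentage is a hypothetical helper and the figures are illustrative, whereas the real code mutates the workerMemory property tree shown in the diff:

```cpp
#include <cstdio>

// Post-change behaviour, in outline:
// - containerized: each worker is its own container, so its resourced memory
//   percentage is used as-is; numWorkersPerPod no longer divides it.
// - bare-metal: workers on a node share the machine, so a percentage-based
//   allowance is divided by slavesPerNode (numWorkersPerNode).
static double workerMemPercentage(bool containerized, double maxMemPercentage,
                                  unsigned numWorkersPerNode)
{
    if (containerized || numWorkersPerNode <= 1)
        return maxMemPercentage;                   // no splitting
    return maxMemPercentage / numWorkersPerNode;   // split across the node's workers
}

int main()
{
    // Illustrative 75% allowance with 4 workers per pod/node:
    printf("containerized: %.2f%%\n", workerMemPercentage(true, 75.0, 4));  // 75.00
    printf("bare-metal:    %.2f%%\n", workerMemPercentage(false, 75.0, 4)); // 18.75
    return 0;
}
```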