FEAT: Slurm Deployment For Xorbits (#719)
Signed-off-by: liddle rain <[email protected]>
Co-authored-by: ahs <[email protected]>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
Co-authored-by: Aprilies <[email protected]>
Co-authored-by: liddle_rain <[email protected]>
5 people authored Oct 31, 2023
1 parent b320ca3 commit e77db37
Showing 14 changed files with 811 additions and 4 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/python.yaml
@@ -96,6 +96,7 @@ jobs:
- { os: self-hosted, module: gpu, python-version: 3.9}
- { os: ubuntu-latest, module: jax, python-version: 3.9 }
- { os: juicefs-ci, module: kubernetes-juicefs, python-version: 3.9 }
- { os: ubuntu-latest, module: slurm, python-version: 3.9 }
- { os: ubuntu-latest, module: datasets, python-version: 3.9 }
steps:
- name: Check out code
@@ -247,6 +248,18 @@ jobs:
python setup.py build_ext -i
working-directory: ./python

- name: Slurm Setup Job queuing system
if: ${{ matrix.module == 'slurm' }}
run: |
source CI/slurm/${{ matrix.module }}.sh
jobqueue_before_install
- name: Slurm Install xorbits
if: ${{ matrix.module == 'slurm' }}
run: |
source CI/slurm/${{ matrix.module }}.sh
jobqueue_install
- name: Install on GPU
if: ${{ matrix.module == 'gpu' }}
run: |
@@ -285,6 +298,11 @@ jobs:
pytest --ignore xorbits/_mars/ --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/kubernetes/external_storage/juicefs
elif [[ "$MODULE" == "slurm" ]]; then
docker exec c1 /bin/bash -c "pip install xorbits"
docker exec c2 /bin/bash -c "pip install xorbits"
docker exec slurmctld /bin/bash -c \
"pytest /xorbits/python/xorbits/deploy/slurm/tests/test_slurm.py "
elif [[ "$MODULE" == "hadoop" ]]; then
export WITH_HADOOP="1"
export HADOOP_HOME="/usr/local/hadoop"
@@ -376,6 +394,13 @@ jobs:
fi
working-directory: ./python


- name: Cleanup on slurm
if: ${{ matrix.module == 'slurm' }}
run: |
source CI/slurm/${{ matrix.module }}.sh
jobqueue_after_script
- name: Report coverage data
uses: codecov/codecov-action@v3
with:
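Taken together, the new workflow steps only source CI/slurm/slurm.sh and call the jobqueue_* helpers it defines (the pytest command itself is inlined in the workflow). A minimal local reproduction of the same flow, as a sketch outside GitHub Actions, might look like:

    source CI/slurm/slurm.sh      # defines the jobqueue_* helpers added in this commit
    jobqueue_before_install       # pull images and start the docker-compose Slurm cluster
    jobqueue_install              # editable-install xorbits inside the slurmctld container
    jobqueue_script               # run the deploy/slurm tests under pytest
    jobqueue_after_script         # dump sinfo/squeue/sacct output for debugging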
5 changes: 4 additions & 1 deletion .gitignore
@@ -151,4 +151,7 @@ doc/source/savefig/

asv/results

.DS_Store
.DS_Store

# slurm.sh script generated by the Slurm deploy tests
python/xorbits/deploy/slurm/tests/slurm.sh
2 changes: 2 additions & 0 deletions CI/slurm/Dockerfile
@@ -0,0 +1,2 @@
FROM daskdev/dask-jobqueue:slurm
RUN pip install xorbits
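The CI image simply layers xorbits on top of the upstream dask-jobqueue Slurm image. As a hedged sketch, it can also be built by hand, which is equivalent to the `build: .` entries in the compose file below:

    # Build the image manually; docker-compose.yml tags the same build as daskdev/dask-jobqueue:slurm
    docker build -t daskdev/dask-jobqueue:slurm CI/slurm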
120 changes: 120 additions & 0 deletions CI/slurm/docker-compose.yml
@@ -0,0 +1,120 @@
version: "2.2"

services:
mysql:
image: mysql:5.7.29
hostname: mysql
container_name: mysql
environment:
MYSQL_RANDOM_ROOT_PASSWORD: "yes"
MYSQL_DATABASE: slurm_acct_db
MYSQL_USER: slurm
MYSQL_PASSWORD: password
volumes:
- var_lib_mysql:/var/lib/mysql
networks:
common-network:

slurmdbd:
image: daskdev/dask-jobqueue:slurm
build: .
command: ["slurmdbd"]
container_name: slurmdbd
hostname: slurmdbd
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- var_log_slurm:/var/log/slurm
expose:
- "6819"
depends_on:
- mysql
networks:
common-network:

slurmctld:
image: daskdev/dask-jobqueue:slurm
build: .
command: ["slurmctld"]
container_name: slurmctld
hostname: slurmctld
environment:
- CI_SHARED_SPACE=/shared_space
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- slurm_jobdir:/data
- var_log_slurm:/var/log/slurm
- ../..:/xorbits
- shared_space:/shared_space
expose:
- "6817"
depends_on:
- "slurmdbd"
networks:
common-network:
ipv4_address: 10.1.1.10
cap_add:
- NET_ADMIN

c1:
image: daskdev/dask-jobqueue:slurm
build: .
command: ["slurmd"]
hostname: c1
container_name: c1
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- slurm_jobdir:/data
- var_log_slurm:/var/log/slurm
- ../..:/xorbits
- shared_space:/shared_space
expose:
- "6818"
depends_on:
- "slurmctld"
networks:
common-network:
ipv4_address: 10.1.1.11
cap_add:
- NET_ADMIN

c2:
image: daskdev/dask-jobqueue:slurm
build: .
command: ["slurmd"]
hostname: c2
container_name: c2
volumes:
- etc_munge:/etc/munge
- etc_slurm:/etc/slurm
- slurm_jobdir:/data
- var_log_slurm:/var/log/slurm
- ../..:/xorbits
- shared_space:/shared_space
expose:
- "6818"
depends_on:
- "slurmctld"
networks:
common-network:
ipv4_address: 10.1.1.12
cap_add:
- NET_ADMIN

volumes:
etc_munge:
etc_slurm:
slurm_jobdir:
var_lib_mysql:
var_log_slurm:
shared_space:

networks:
common-network:
driver: bridge
ipam:
driver: default
config:
- subnet: 10.1.1.0/24
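The compose file defines a small accounting-enabled cluster: mysql and slurmdbd for accounting, one slurmctld controller, and two slurmd workers (c1, c2), all on a dedicated 10.1.1.0/24 bridge network with the repository bind-mounted at /xorbits. A hedged way to bring it up and sanity-check it by hand (start-slurm.sh below automates this plus cluster registration):

    cd CI/slurm
    docker-compose up -d --no-build                  # start all five containers
    docker ps --format '{{.Names}}: {{.Status}}'     # mysql, slurmdbd, slurmctld, c1, c2
    docker exec slurmctld sinfo                      # the normal partition should list c[1-2]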
5 changes: 5 additions & 0 deletions CI/slurm/register_cluster.sh
@@ -0,0 +1,5 @@
#!/bin/bash
set -e

docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add cluster name=linux" && \
docker-compose restart slurmdbd slurmctld
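The script registers a cluster named linux with the accounting daemon and restarts slurmdbd and slurmctld so the registration takes effect. A quick sanity check, not part of the CI scripts, would be:

    # Confirm the cluster shows up in accounting (assumes the compose cluster is running)
    docker exec slurmctld sacctmgr --noheader show cluster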
98 changes: 98 additions & 0 deletions CI/slurm/slurm.conf
@@ -0,0 +1,98 @@
# slurm.conf
#
# See the slurm.conf man page for more information.
#
ClusterName=linux
ControlMachine=slurmctld
ControlAddr=slurmctld
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/lib/slurmd
SlurmdSpoolDir=/var/spool/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
SlurmdPidFile=/var/run/slurmd/slurmd.pid
ProctrackType=proctrack/linuxproc
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=0
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=300
Waittime=30
# Raising these timer values avoids job steps being killed when resources are low.
# Without this change, the following errors were observed:
#srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
#slurmstepd: error: *** STEP 27.0 ON c1 CANCELLED AT 2023-09-25T06:30:54 ***

# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/cons_res
SelectTypeParameters=CR_CPU_Memory
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurmd.log
JobCompType=jobcomp/filetxt
JobCompLoc=/var/log/slurm/jobcomp.log
#
# ACCOUNTING
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=slurmdbd
AccountingStoragePort=6819
AccountingStorageLoc=slurm_acct_db
#AccountingStoragePass=
#AccountingStorageUser=
#
# COMPUTE NODES
NodeName=c[1-2] RealMemory=4096 CPUs=2 State=UNKNOWN
#
# PARTITIONS
PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=2048 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
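The configuration declares two 2-CPU, 4 GB compute nodes (c1, c2) and a single default partition named normal, with munge authentication and slurmdbd-backed accounting. A hedged way to inspect the live configuration from inside the controller container:

    docker exec slurmctld sinfo                            # partition/node overview
    docker exec slurmctld scontrol show partition normal   # limits as configured above
    docker exec slurmctld scontrol show node c1            # per-node CPU/memory state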
60 changes: 60 additions & 0 deletions CI/slurm/slurm.sh
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

function jobqueue_before_install {
docker version
docker-compose version

# start slurm cluster
cd ./CI/slurm
docker-compose pull
./start-slurm.sh
cd -

#Set shared space permissions
docker exec slurmctld /bin/bash -c "chmod -R 777 /shared_space"

docker ps -a
docker images
show_network_interfaces
}

function show_network_interfaces {
for c in slurmctld c1 c2; do
echo '------------------------------------------------------------'
echo docker container: $c
docker exec $c python -c 'import psutil; print(psutil.net_if_addrs().keys())'
echo '------------------------------------------------------------'
done
}

function jobqueue_install {
docker exec slurmctld /bin/bash -c "cd xorbits/python/; pip install -e ."
}

function jobqueue_script {
docker exec c1 /bin/bash -c "pip install xorbits"
docker exec c2 /bin/bash -c "pip install xorbits"
docker exec slurmctld /bin/bash -c \
"pytest --ignore xorbits/_mars/ --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/slurm"
}

function jobqueue_after_script {
docker exec slurmctld bash -c 'sinfo'
docker exec slurmctld bash -c 'squeue'
docker exec slurmctld bash -c 'sacct -l'
}
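jobqueue_script mirrors the pytest invocation that the workflow runs directly, and jobqueue_after_script dumps scheduler state for debugging. Before running the xorbits suite it can help to smoke-test the scheduler itself; a hypothetical check, not part of this commit:

    # Run a trivial job step on both workers; expect the hostnames c1 and c2 back
    docker exec slurmctld /bin/bash -c "srun -N 2 -n 2 hostname"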
29 changes: 29 additions & 0 deletions CI/slurm/start-slurm.sh
@@ -0,0 +1,29 @@
#!/bin/bash
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker-compose up -d --no-build

while [ `./register_cluster.sh 2>&1 | grep "sacctmgr: error" | wc -l` -ne 0 ]
do
echo "Waiting for SLURM cluster to become ready";
sleep 2
done
echo "SLURM properly configured"

# On some clusters the login node does not have the same interface as the
# compute nodes. The next three lines allow to test this edge case by adding
# separate interfaces on the worker and on the scheduler nodes.
docker exec slurmctld ip addr add 10.1.1.20/24 dev eth0 label eth0:scheduler
docker exec c1 ip addr add 10.1.1.21/24 dev eth0 label eth0:worker
docker exec c2 ip addr add 10.1.1.22/24 dev eth0 label eth0:worker
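start-slurm.sh only brings the cluster up and waits for accounting registration to succeed; it does not tear anything down. When debugging locally, a hedged cleanup (not included in this commit) is:

    # Stop the containers and drop the named volumes so the next run starts from a clean state
    cd CI/slurm
    docker-compose down -v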