From 5e71c9f25e8e775c9b84e31644de00d13001af7b Mon Sep 17 00:00:00 2001 From: ahs Date: Sun, 24 Sep 2023 21:36:25 -0700 Subject: [PATCH 01/59] support for slurm deployment --- .github/workflows/cluster.yaml | 60 ++++++++ CI/slurm.sh | 43 ++++++ CI/slurm/Dockerfile | 2 + CI/slurm/docker-compose.yml | 120 ++++++++++++++++ CI/slurm/register_cluster.sh | 5 + CI/slurm/slurm.conf | 94 ++++++++++++ CI/slurm/start-slurm.sh | 17 +++ python/xorbits/cluster/Slurm.py | 174 +++++++++++++++++++++++ python/xorbits/cluster/__init__.py | 15 ++ python/xorbits/cluster/slurm-template.sh | 42 ++++++ python/xorbits/cluster/slurm.sh | 27 ++++ python/xorbits/cluster/test.py | 11 ++ 12 files changed, 610 insertions(+) create mode 100644 .github/workflows/cluster.yaml create mode 100644 CI/slurm.sh create mode 100644 CI/slurm/Dockerfile create mode 100644 CI/slurm/docker-compose.yml create mode 100755 CI/slurm/register_cluster.sh create mode 100644 CI/slurm/slurm.conf create mode 100755 CI/slurm/start-slurm.sh create mode 100644 python/xorbits/cluster/Slurm.py create mode 100644 python/xorbits/cluster/__init__.py create mode 100644 python/xorbits/cluster/slurm-template.sh create mode 100644 python/xorbits/cluster/slurm.sh create mode 100644 python/xorbits/cluster/test.py diff --git a/.github/workflows/cluster.yaml b/.github/workflows/cluster.yaml new file mode 100644 index 000000000..a0b7295ad --- /dev/null +++ b/.github/workflows/cluster.yaml @@ -0,0 +1,60 @@ +name: CI + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + timeout-minutes: 30 + defaults: + run: + shell: bash -l {0} + strategy: + fail-fast: false + matrix: + jobqueue: ["slurm"] + + steps: + - name: Cancel previous runs + uses: styfle/cancel-workflow-action@0.7.0 + with: + access_token: ${{ github.token }} + - name: Checkout source + uses: actions/checkout@v2 + + - name: Setup Empty Conda Environment with Mamba + if: matrix.jobqueue == 'none' + uses: conda-incubator/setup-miniconda@v2 + with: + channels: conda-forge + mamba-version: "*" + activate-environment: xorbits + auto-activate-base: false + + - name: Setup xorbits conda environment + if: matrix.jobqueue == 'none' + run: | + mamba env update -f CI/requirements-wheel.txt + mamba list + + - name: Setup Job queuing system + if: matrix.jobqueue != 'none' + run: | + source CI/${{ matrix.jobqueue }}.sh + jobqueue_before_install + + - name: Install xorbits + run: | + source CI/${{ matrix.jobqueue }}.sh + jobqueue_install + + - name: Test + run: | + source CI/${{ matrix.jobqueue }}.sh + jobqueue_script + + - name: Cleanup + if: always() + run: | + source CI/${{ matrix.jobqueue }}.sh + jobqueue_after_script \ No newline at end of file diff --git a/CI/slurm.sh b/CI/slurm.sh new file mode 100644 index 000000000..b25da4234 --- /dev/null +++ b/CI/slurm.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +function jobqueue_before_install { + docker version + docker-compose version + + # start slurm cluster + cd ./CI/slurm + docker-compose pull + ./start-slurm.sh + cd - + + #Set shared space permissions + docker exec slurmctld /bin/bash -c "chmod -R 777 /shared_space" + + docker ps -a + docker images + show_network_interfaces +} + +function show_network_interfaces { + for c in slurmctld c1 c2; do + echo '------------------------------------------------------------' + echo docker container: $c + docker exec $c python -c 'import psutil; print(psutil.net_if_addrs().keys())' + echo '------------------------------------------------------------' + done +} + +function jobqueue_install { + docker exec 
slurmctld /bin/bash -c "cd xorbits/python/; pip install -e ." +} + +function jobqueue_script { + docker exec c1 /bin/bash -c "pip install xorbits" + docker exec c2 /bin/bash -c "pip install xorbits" +} + +function jobqueue_after_script { + docker exec slurmctld bash -c 'sinfo' + docker exec slurmctld bash -c 'squeue' + docker exec slurmctld bash -c 'sacct -l' +} diff --git a/CI/slurm/Dockerfile b/CI/slurm/Dockerfile new file mode 100644 index 000000000..1a57e7ccf --- /dev/null +++ b/CI/slurm/Dockerfile @@ -0,0 +1,2 @@ +FROM daskdev/dask-jobqueue:slurm +RUN pip install xorbits diff --git a/CI/slurm/docker-compose.yml b/CI/slurm/docker-compose.yml new file mode 100644 index 000000000..088f0e9e9 --- /dev/null +++ b/CI/slurm/docker-compose.yml @@ -0,0 +1,120 @@ +version: "2.2" + +services: + mysql: + image: mysql:5.7.29 + hostname: mysql + container_name: mysql + environment: + MYSQL_RANDOM_ROOT_PASSWORD: "yes" + MYSQL_DATABASE: slurm_acct_db + MYSQL_USER: slurm + MYSQL_PASSWORD: password + volumes: + - var_lib_mysql:/var/lib/mysql + networks: + common-network: + + slurmdbd: + image: daskdev/dask-jobqueue:slurm + build: . + command: ["slurmdbd"] + container_name: slurmdbd + hostname: slurmdbd + volumes: + - etc_munge:/etc/munge + - etc_slurm:/etc/slurm + - var_log_slurm:/var/log/slurm + expose: + - "6819" + depends_on: + - mysql + networks: + common-network: + + slurmctld: + image: daskdev/dask-jobqueue:slurm + build: . + command: ["slurmctld"] + container_name: slurmctld + hostname: slurmctld + environment: + - CI_SHARED_SPACE=/shared_space + volumes: + - etc_munge:/etc/munge + - etc_slurm:/etc/slurm + - slurm_jobdir:/data + - var_log_slurm:/var/log/slurm + - ../..:/xorbits + - shared_space:/shared_space + expose: + - "6817" + depends_on: + - "slurmdbd" + networks: + common-network: + ipv4_address: 10.1.1.10 + cap_add: + - NET_ADMIN + + c1: + image: daskdev/dask-jobqueue:slurm + build: . + command: ["slurmd"] + hostname: c1 + container_name: c1 + volumes: + - etc_munge:/etc/munge + - etc_slurm:/etc/slurm + - slurm_jobdir:/data + - var_log_slurm:/var/log/slurm + - ../..:/xorbits + - shared_space:/shared_space + expose: + - "6818" + depends_on: + - "slurmctld" + networks: + common-network: + ipv4_address: 10.1.1.11 + cap_add: + - NET_ADMIN + + c2: + image: daskdev/dask-jobqueue:slurm + build: . + command: ["slurmd"] + hostname: c2 + container_name: c2 + volumes: + - etc_munge:/etc/munge + - etc_slurm:/etc/slurm + - slurm_jobdir:/data + - var_log_slurm:/var/log/slurm + - ../..:/xorbits + - shared_space:/shared_space + expose: + - "6818" + depends_on: + - "slurmctld" + networks: + common-network: + ipv4_address: 10.1.1.12 + cap_add: + - NET_ADMIN + +volumes: + etc_munge: + etc_slurm: + slurm_jobdir: + var_lib_mysql: + var_log_slurm: + shared_space: + +networks: + common-network: + driver: bridge + ipam: + driver: default + config: + - subnet: 10.1.1.0/24 diff --git a/CI/slurm/register_cluster.sh b/CI/slurm/register_cluster.sh new file mode 100755 index 000000000..ef3d4d0fb --- /dev/null +++ b/CI/slurm/register_cluster.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add cluster name=linux" && \ +docker-compose restart slurmdbd slurmctld diff --git a/CI/slurm/slurm.conf b/CI/slurm/slurm.conf new file mode 100644 index 000000000..0aad9f1b9 --- /dev/null +++ b/CI/slurm/slurm.conf @@ -0,0 +1,94 @@ +# slurm.conf +# +# See the slurm.conf man page for more information. 
+# +ClusterName=linux +ControlMachine=slurmctld +ControlAddr=slurmctld +#BackupController= +#BackupAddr= +# +SlurmUser=slurm +#SlurmdUser=root +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +#JobCredentialPrivateKey= +#JobCredentialPublicCertificate= +StateSaveLocation=/var/lib/slurmd +SlurmdSpoolDir=/var/spool/slurmd +SwitchType=switch/none +MpiDefault=none +SlurmctldPidFile=/var/run/slurmd/slurmctld.pid +SlurmdPidFile=/var/run/slurmd/slurmd.pid +ProctrackType=proctrack/linuxproc +#PluginDir= +CacheGroups=0 +#FirstJobId= +ReturnToService=0 +#MaxJobCount= +#PlugStackConfig= +#PropagatePrioProcess= +#PropagateResourceLimits= +#PropagateResourceLimitsExcept= +#Prolog= +#Epilog= +#SrunProlog= +#SrunEpilog= +#TaskProlog= +#TaskEpilog= +#TaskPlugin= +#TrackWCKey=no +#TreeWidth=50 +#TmpFS= +#UsePAM= +# +# TIMERS +SlurmctldTimeout=300 +SlurmdTimeout=300 +InactiveLimit=0 +MinJobAge=300 +KillWait=30 +Waittime=0 +# +# SCHEDULING +SchedulerType=sched/backfill +#SchedulerAuth= +#SchedulerPort= +#SchedulerRootFilter= +SelectType=select/cons_res +SelectTypeParameters=CR_CPU_Memory +FastSchedule=1 +#PriorityType=priority/multifactor +#PriorityDecayHalfLife=14-0 +#PriorityUsageResetPeriod=14-0 +#PriorityWeightFairshare=100000 +#PriorityWeightAge=1000 +#PriorityWeightPartition=10000 +#PriorityWeightJobSize=1000 +#PriorityMaxAge=1-0 +# +# LOGGING +SlurmctldDebug=3 +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdDebug=3 +SlurmdLogFile=/var/log/slurm/slurmd.log +JobCompType=jobcomp/filetxt +JobCompLoc=/var/log/slurm/jobcomp.log +# +# ACCOUNTING +JobAcctGatherType=jobacct_gather/linux +JobAcctGatherFrequency=30 +# +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost=slurmdbd +AccountingStoragePort=6819 +AccountingStorageLoc=slurm_acct_db +#AccountingStoragePass= +#AccountingStorageUser= +# +# COMPUTE NODES +NodeName=c[1-2] RealMemory=4096 CPUs=2 State=UNKNOWN +# +# PARTITIONS +PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=2048 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP diff --git a/CI/slurm/start-slurm.sh b/CI/slurm/start-slurm.sh new file mode 100755 index 000000000..6cdce2db1 --- /dev/null +++ b/CI/slurm/start-slurm.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +docker-compose up -d --no-build + +while [ `./register_cluster.sh 2>&1 | grep "sacctmgr: error" | wc -l` -ne 0 ] + do + echo "Waiting for SLURM cluster to become ready"; + sleep 2 + done +echo "SLURM properly configured" + +# On some clusters the login node does not have the same interface as the +# compute nodes. The next three lines allow to test this edge case by adding +# separate interfaces on the worker and on the scheduler nodes. 
+docker exec slurmctld ip addr add 10.1.1.20/24 dev eth0 label eth0:scheduler +docker exec c1 ip addr add 10.1.1.21/24 dev eth0 label eth0:worker +docker exec c2 ip addr add 10.1.1.22/24 dev eth0 label eth0:worker diff --git a/python/xorbits/cluster/Slurm.py b/python/xorbits/cluster/Slurm.py new file mode 100644 index 000000000..2f1a1bd0a --- /dev/null +++ b/python/xorbits/cluster/Slurm.py @@ -0,0 +1,174 @@ +import os +import subprocess +import time +import atexit +import xorbits +import xorbits.numpy as np + +DEFAULT_JOB_NAME = "default_job" +DEFAULT_NUMBER_NODES = 2 +DEFAULT_PARTITION_OPTION = "batch" +DEFAULT_LOAD_ENV = "LOAD_ENV" + +class SLURMCluster: + def __init__(self, + job_name=None, + num_nodes=None, + partition_option=None, + load_env=None, + output_dir=None, + error_dir=None, + work_dir=None, + walltime=None, + processes=None, + cores=None, + memory=None, + account=None,): + + commands = ["#!/bin/bash"] + + self.job_name = job_name + self.num_nodes = num_nodes + self.partition_option = partition_option + self.output_dir = output_dir + self.work_dir = work_dir + self.walltime = walltime + self.processes = processes + self.cores = cores + self.memory = memory + self.account = account + + slurm_params = { + "job-name": self.job_name, + "nodes": self.num_nodes, + "partition": self.partition_option, + "output": self.output_dir, + "workdir": self.work_dir, + "time": self.walltime, + "ntasks": self.processes, + "cpus-per-task": self.cores, + "mem": self.memory, + } + + for param, value in slurm_params.items(): + if value is not None: + commands.append(f"#SBATCH --{param}={value}") + + if self.load_env: + commands.append(f"source activate {self.load_env}") + + if self.queue: + commands.append(f"#SBATCH --partition={self.queue}") + + commands += [ + "set -x", + 'nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")', + "nodes_array=($nodes)", + "head_node=${nodes_array[0]}", + "port=16380", + "web_port=16379", + 'echo "Starting SUPERVISOR at ${head_node}"', + 'srun --nodes=1 --ntasks=1 -w "${head_node}" \\', + ' xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}" &', + "sleep 10", + 'worker_num=$((SLURM_JOB_NUM_NODES - 1))', + 'for ((i = 1; i <= worker_num; i++)); do', + ' node_i=${nodes_array[$i]}', + ' port_i=$((port + i))', + ' echo "Starting WORKER $i at ${node_i}"', + ' srun --nodes=1 --ntasks=1 -w "${node_i}" \\', + ' xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}" &', + 'done', + "sleep 5", + 'address=http://"${head_node}":"${web_port}"', + ] + + return "\n".join(commands) + + def run(self): + shell_commands = self.shell_commands() + with open("slurm.sh", 'w') as f: + f.write(shell_commands) + + os.chmod("slurm.sh", 0o770) + + result = subprocess.run(["sbatch", "slurm.sh"], capture_output=True, text=True) + + if result.returncode == 0: + print("Job submitted successfully.") + self.job_id = self.get_job_id(result.stdout) + if self.job_id: + print(f"Job ID is {self.job_id}.") + atexit.register(self.cancel_job) + else: + print("Could not get job ID. 
Cleanup upon exit is not possible.") + return self.get_job_address() + else: + print("Job submission failed.") + print("Output:", result.stdout) + print("Errors:", result.stderr) + return None + + def get_job_id(self, sbatch_output): + job_id = None + for line in sbatch_output.split('\n'): + if "Submitted batch job" in line: + job_id = line.split(" ")[-1] + return job_id + + def cancel_job(self): + if self.job_id: + print(f"Cancelling job {self.job_id}") + subprocess.run(["scancel", self.job_id]) + + def update_head_node(self): + try: + if self.job_id: + command = ["scontrol", "show", "job", self.job_id] + result = subprocess.run(command, capture_output=True, text=True) + if result.returncode == 0: + output = result.stdout + print(output) + + for line in output.split('\n'): + if line.startswith("NodeList="): + node_list = line[len("NodeList="):].strip() + break + if node_list is None: + raise ValueError(f"Job {self.job_id} not found or NodeList information not available.") + + # 提取头节点(第一个节点) + self.head_node = node_list.split()[0] + except subprocess.CalledProcessError as e: + print(f"Error executing scontrol: {e}") + except ValueError as e: + print(f"Error: {e}") + except Exception as e: + print(f"An unexpected error occurred: {e}") + + self.head_node = None + print("Failed to retrieve head node.") + + def get_job_address(self, retry_attempts=10, sleep_interval=30): + # We retry several times to get job data + for attempt in range(retry_attempts): + try: + self.update_head_node() + if self.head_node is not None: + self.head_node = "eval" + address=f"http://{self.head_node}:{self.web_port}" + return address + else: + print(f"Attempt {attempt + 1} failed, retrying after {sleep_interval}s...") + time.sleep(sleep_interval) + except Exception as e: + print(str(e)) + +if __name__ == "__main__": + exp = SLURMCluster(job_name="xorbits",num_nodes=2,cores=2,time="00:30:00",processes=1) + adress = exp.run() + print(adress) + time.sleep(5) + adress = "http://c1:16379" + xorbits.init(adress) + print(np.random.rand(100, 100).mean()) \ No newline at end of file diff --git a/python/xorbits/cluster/__init__.py b/python/xorbits/cluster/__init__.py new file mode 100644 index 000000000..c319e6f0d --- /dev/null +++ b/python/xorbits/cluster/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .Slurm import SLURMCluster \ No newline at end of file diff --git a/python/xorbits/cluster/slurm-template.sh b/python/xorbits/cluster/slurm-template.sh new file mode 100644 index 000000000..d0cbfbf83 --- /dev/null +++ b/python/xorbits/cluster/slurm-template.sh @@ -0,0 +1,42 @@ +#!/bin/bash +#SBATCH --job-name=xorbits +#SBATCH --nodes=2 +#SBATCH --cpus-per-task=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --partition=batch +#SBATCH --time=00:30:00 + + +source activate xorbits-dev + +### Use the debug mode to see if the shell commands are correct. +### If you do not want the shell command logs, delete the following line. 
+set -x + +# Getting the node names +nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +nodes_array=($nodes) + +head_node=${nodes_array[0]} +port=16380 +web_port=16379 + +echo "Starting SUPERVISOR at ${head_node}" +srun --nodes=1 --ntasks=1 -w "${head_node}" \ + xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}" & +sleep 10 + +# number of nodes other than the head node +worker_num=$((SLURM_JOB_NUM_NODES - 1)) + +for ((i = 1; i <= worker_num; i++)); do + node_i=${nodes_array[$i]} + port_i=$((port + i)) + + echo "Starting WORKER $i at ${node_i}" + srun --nodes=1 --ntasks=1 -w "${node_i}" \ + xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}" & +done +sleep 5 + +address=http://"${head_node}":"${web_port}" diff --git a/python/xorbits/cluster/slurm.sh b/python/xorbits/cluster/slurm.sh new file mode 100644 index 000000000..6d62ab893 --- /dev/null +++ b/python/xorbits/cluster/slurm.sh @@ -0,0 +1,27 @@ +#!/bin/bash +#SBATCH --job-name=default_job +#SBATCH --nodes=2 +#SBATCH --partition=batch +source activate LOAD_ENV +#SBATCH --output= /shared_space/output.out + +set -x +nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +nodes_array=($nodes) +head_node=${nodes_array[0]} +port=16380 +web_port=16379 +echo "Starting SUPERVISOR at ${head_node}" +srun --nodes=1 --ntasks=1 -w "${head_node}" \ + xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}" & +sleep 10 +worker_num=$((SLURM_JOB_NUM_NODES - 1)) +for ((i = 1; i <= worker_num; i++)); do + node_i=${nodes_array[$i]} + port_i=$((port + i)) + echo "Starting WORKER $i at ${node_i}" + srun --nodes=1 --ntasks=1 -w "${node_i}" \ + xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}" & +done +sleep 5 +address=http://"${head_node}":"${web_port}" \ No newline at end of file diff --git a/python/xorbits/cluster/test.py b/python/xorbits/cluster/test.py new file mode 100644 index 000000000..61070a827 --- /dev/null +++ b/python/xorbits/cluster/test.py @@ -0,0 +1,11 @@ +import xorbits +import xorbits.numpy as np +from .Slurm import SLURMCluster +import time + +exp = SLURMCluster() +adress = exp.run() +print(adress) +time.sleep(5) +xorbits.init("http://c1:16379") +print(np.random.rand(100, 100).mean()) \ No newline at end of file From 019c949e462201123b9455fd42c38d5531b6ebb4 Mon Sep 17 00:00:00 2001 From: Aprilies <767760161@qq.com> Date: Mon, 25 Sep 2023 14:42:03 +0800 Subject: [PATCH 02/59] support for slurm --- CI/slurm.sh | 2 + CI/slurm/slurm.conf | 11 +++-- python/xorbits/cluster/Slurm.py | 78 ++++++++++++++++----------------- python/xorbits/cluster/slurm.sh | 14 +++--- python/xorbits/cluster/test.py | 11 +++-- 5 files changed, 59 insertions(+), 57 deletions(-) mode change 100644 => 100755 python/xorbits/cluster/slurm.sh diff --git a/CI/slurm.sh b/CI/slurm.sh index b25da4234..f2dc86af8 100644 --- a/CI/slurm.sh +++ b/CI/slurm.sh @@ -34,6 +34,8 @@ function jobqueue_install { function jobqueue_script { docker exec c1 /bin/bash -c "pip install xorbits" docker exec c2 /bin/bash -c "pip install xorbits" + docker exec slurmctld /bin/bash -c "python /xorbits/python/xorbits/cluster/Slurm.py" + docker exec slurmctld /bin/bash -c "cat /shared_space/output.out" } function jobqueue_after_script { diff --git a/CI/slurm/slurm.conf b/CI/slurm/slurm.conf index 0aad9f1b9..7f63d1a9e 100644 --- a/CI/slurm/slurm.conf +++ b/CI/slurm/slurm.conf @@ -48,9 +48,14 @@ SlurmctldTimeout=300 SlurmdTimeout=300 InactiveLimit=0 MinJobAge=300 -KillWait=30 -Waittime=0 -# +KillWait=300 +Waittime=30 +#change this avoids 
low resource kill the process +#**log** +#srun: Job step aborted: Waiting up to 32 seconds for job step to finish. +#slurmstepd: error: *** STEP 27.0 ON c1 CANCELLED AT 2023-09-25T06:30:54 *** + + # SCHEDULING SchedulerType=sched/backfill #SchedulerAuth= diff --git a/python/xorbits/cluster/Slurm.py b/python/xorbits/cluster/Slurm.py index 2f1a1bd0a..e143f7cfa 100644 --- a/python/xorbits/cluster/Slurm.py +++ b/python/xorbits/cluster/Slurm.py @@ -19,7 +19,7 @@ def __init__(self, output_dir=None, error_dir=None, work_dir=None, - walltime=None, + time=None, processes=None, cores=None, memory=None, @@ -32,11 +32,13 @@ def __init__(self, self.partition_option = partition_option self.output_dir = output_dir self.work_dir = work_dir - self.walltime = walltime + self.walltime = time self.processes = processes self.cores = cores self.memory = memory self.account = account + self.load_env = load_env + self.commands = None slurm_params = { "job-name": self.job_name, @@ -57,36 +59,33 @@ def __init__(self, if self.load_env: commands.append(f"source activate {self.load_env}") - if self.queue: - commands.append(f"#SBATCH --partition={self.queue}") - - commands += [ - "set -x", - 'nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")', - "nodes_array=($nodes)", - "head_node=${nodes_array[0]}", - "port=16380", - "web_port=16379", - 'echo "Starting SUPERVISOR at ${head_node}"', - 'srun --nodes=1 --ntasks=1 -w "${head_node}" \\', - ' xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}" &', - "sleep 10", - 'worker_num=$((SLURM_JOB_NUM_NODES - 1))', - 'for ((i = 1; i <= worker_num; i++)); do', - ' node_i=${nodes_array[$i]}', - ' port_i=$((port + i))', - ' echo "Starting WORKER $i at ${node_i}"', - ' srun --nodes=1 --ntasks=1 -w "${node_i}" \\', - ' xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}" &', - 'done', - "sleep 5", - 'address=http://"${head_node}":"${web_port}"', - ] - - return "\n".join(commands) + commands += [ + "set -x", + 'nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")', + "nodes_array=($nodes)", + "head_node=${nodes_array[0]}", + "port=16380", + "web_port=16379", + 'echo "Starting SUPERVISOR at ${head_node}"', + 'srun --nodes=1 --ntasks=1 -w "${head_node}" \\', + ' xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}"&', + "sleep 10", + 'worker_num=$((SLURM_JOB_NUM_NODES - 1))', + 'for ((i = 1; i <= worker_num; i++)); do', + ' node_i=${nodes_array[$i]}', + ' port_i=$((port + i))', + ' echo "Starting WORKER $i at ${node_i}"', + ' srun --nodes=1 --ntasks=1 -w "${node_i}" \\', + ' xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"&', + 'done', + "sleep 5", + 'address=http://"${head_node}":"${web_port}"', + ] + + self.commands = "\n".join(commands) def run(self): - shell_commands = self.shell_commands() + shell_commands = self.commands with open("slurm.sh", 'w') as f: f.write(shell_commands) @@ -126,19 +125,18 @@ def update_head_node(self): if self.job_id: command = ["scontrol", "show", "job", self.job_id] result = subprocess.run(command, capture_output=True, text=True) + node_list = None if result.returncode == 0: output = result.stdout print(output) - - for line in output.split('\n'): - if line.startswith("NodeList="): - node_list = line[len("NodeList="):].strip() - break - if node_list is None: - raise ValueError(f"Job {self.job_id} not found or NodeList information not available.") - + for line in output.split('\n'): + if line.startswith("NodeList="): + node_list = line[len("NodeList="):].strip() + break + if 
node_list is None: + raise ValueError(f"Job {self.job_id} not found or NodeList information not available.") # 提取头节点(第一个节点) - self.head_node = node_list.split()[0] + self.head_node = node_list.split()[0] except subprocess.CalledProcessError as e: print(f"Error executing scontrol: {e}") except ValueError as e: @@ -165,7 +163,7 @@ def get_job_address(self, retry_attempts=10, sleep_interval=30): print(str(e)) if __name__ == "__main__": - exp = SLURMCluster(job_name="xorbits",num_nodes=2,cores=2,time="00:30:00",processes=1) + exp = SLURMCluster(job_name="xorbits",num_nodes=1,output_dir="/shared_space/output.out",time="00:30:00") adress = exp.run() print(adress) time.sleep(5) diff --git a/python/xorbits/cluster/slurm.sh b/python/xorbits/cluster/slurm.sh old mode 100644 new mode 100755 index 6d62ab893..7a8151747 --- a/python/xorbits/cluster/slurm.sh +++ b/python/xorbits/cluster/slurm.sh @@ -1,10 +1,8 @@ #!/bin/bash -#SBATCH --job-name=default_job -#SBATCH --nodes=2 -#SBATCH --partition=batch -source activate LOAD_ENV -#SBATCH --output= /shared_space/output.out - +#SBATCH --job-name=xorbits +#SBATCH --nodes=1 +#SBATCH --output=/shared_space/output.out +#SBATCH --time=00:30:00 set -x nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") nodes_array=($nodes) @@ -14,14 +12,14 @@ web_port=16379 echo "Starting SUPERVISOR at ${head_node}" srun --nodes=1 --ntasks=1 -w "${head_node}" \ xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}" & -sleep 10 +sleep 20 worker_num=$((SLURM_JOB_NUM_NODES - 1)) for ((i = 1; i <= worker_num; i++)); do node_i=${nodes_array[$i]} port_i=$((port + i)) echo "Starting WORKER $i at ${node_i}" srun --nodes=1 --ntasks=1 -w "${node_i}" \ - xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}" & + xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}" & done sleep 5 address=http://"${head_node}":"${web_port}" \ No newline at end of file diff --git a/python/xorbits/cluster/test.py b/python/xorbits/cluster/test.py index 61070a827..3623018e1 100644 --- a/python/xorbits/cluster/test.py +++ b/python/xorbits/cluster/test.py @@ -1,11 +1,10 @@ import xorbits import xorbits.numpy as np -from .Slurm import SLURMCluster import time -exp = SLURMCluster() -adress = exp.run() -print(adress) -time.sleep(5) +#exp = SLURMCluster() +#adress = exp.run() +#print(adress) +#time.sleep(5) xorbits.init("http://c1:16379") -print(np.random.rand(100, 100).mean()) \ No newline at end of file +print(np.random.rand(5, 5).mean()) \ No newline at end of file From a3b90b091e5a78c144d33e55ee77aa5e9ee0640b Mon Sep 17 00:00:00 2001 From: Aprilies <767760161@qq.com> Date: Mon, 25 Sep 2023 14:59:14 +0800 Subject: [PATCH 03/59] slurm for support --- CI/slurm/slurm.conf | 1 - 1 file changed, 1 deletion(-) diff --git a/CI/slurm/slurm.conf b/CI/slurm/slurm.conf index 7f63d1a9e..6f1fa21a5 100644 --- a/CI/slurm/slurm.conf +++ b/CI/slurm/slurm.conf @@ -55,7 +55,6 @@ Waittime=30 #srun: Job step aborted: Waiting up to 32 seconds for job step to finish. 
#slurmstepd: error: *** STEP 27.0 ON c1 CANCELLED AT 2023-09-25T06:30:54 *** - # SCHEDULING SchedulerType=sched/backfill #SchedulerAuth= From 527f34ff48f7291952b3ba44d140407c3ff06cf5 Mon Sep 17 00:00:00 2001 From: liddle_rain Date: Mon, 25 Sep 2023 17:58:17 +0000 Subject: [PATCH 04/59] support for slurm --- python/xorbits/cluster/Slurm.py | 43 +++++++++++++++--------- python/xorbits/cluster/slurm-template.sh | 6 ++-- python/xorbits/cluster/slurm.sh | 10 +++--- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/python/xorbits/cluster/Slurm.py b/python/xorbits/cluster/Slurm.py index e143f7cfa..087fae536 100644 --- a/python/xorbits/cluster/Slurm.py +++ b/python/xorbits/cluster/Slurm.py @@ -4,6 +4,7 @@ import atexit import xorbits import xorbits.numpy as np +import re DEFAULT_JOB_NAME = "default_job" DEFAULT_NUMBER_NODES = 2 @@ -23,7 +24,8 @@ def __init__(self, processes=None, cores=None, memory=None, - account=None,): + account=None, + webport=16379): commands = ["#!/bin/bash"] @@ -39,7 +41,7 @@ def __init__(self, self.account = account self.load_env = load_env self.commands = None - + self.web_port = webport slurm_params = { "job-name": self.job_name, "nodes": self.num_nodes, @@ -65,11 +67,11 @@ def __init__(self, "nodes_array=($nodes)", "head_node=${nodes_array[0]}", "port=16380", - "web_port=16379", + f"web_port={self.web_port}", 'echo "Starting SUPERVISOR at ${head_node}"', 'srun --nodes=1 --ntasks=1 -w "${head_node}" \\', ' xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}"&', - "sleep 10", + "sleep 30", 'worker_num=$((SLURM_JOB_NUM_NODES - 1))', 'for ((i = 1; i <= worker_num; i++)); do', ' node_i=${nodes_array[$i]}', @@ -78,7 +80,7 @@ def __init__(self, ' srun --nodes=1 --ntasks=1 -w "${node_i}" \\', ' xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"&', 'done', - "sleep 5", + "sleep 30", 'address=http://"${head_node}":"${web_port}"', ] @@ -123,20 +125,32 @@ def cancel_job(self): def update_head_node(self): try: if self.job_id: + time.sleep(5) command = ["scontrol", "show", "job", self.job_id] result = subprocess.run(command, capture_output=True, text=True) node_list = None if result.returncode == 0: - output = result.stdout - print(output) - for line in output.split('\n'): - if line.startswith("NodeList="): - node_list = line[len("NodeList="):].strip() - break + job_info = result.stdout + node_list_pattern = r'NodeList=(c\[\d+-\d+\]|c\d)' + matches = re.search(node_list_pattern, job_info) + + if matches: + node_list = matches.group(1) + print("NodeList:", node_list) if node_list is None: raise ValueError(f"Job {self.job_id} not found or NodeList information not available.") - # 提取头节点(第一个节点) - self.head_node = node_list.split()[0] + # get_head_node from nodelist + if "[" in node_list: + head_node = node_list.split("-")[0].replace("[","") + else: + # when only one node + head_node = node_list + + self.head_node = head_node + return head_node + else: + print("NodeList not found in the string.") + except subprocess.CalledProcessError as e: print(f"Error executing scontrol: {e}") except ValueError as e: @@ -153,7 +167,6 @@ def get_job_address(self, retry_attempts=10, sleep_interval=30): try: self.update_head_node() if self.head_node is not None: - self.head_node = "eval" address=f"http://{self.head_node}:{self.web_port}" return address else: @@ -163,7 +176,7 @@ def get_job_address(self, retry_attempts=10, sleep_interval=30): print(str(e)) if __name__ == "__main__": - exp = 
SLURMCluster(job_name="xorbits",num_nodes=1,output_dir="/shared_space/output.out",time="00:30:00") + exp = SLURMCluster(job_name="xorbits",num_nodes=2,output_dir="/shared_space/output.out",time="00:30:00") adress = exp.run() print(adress) time.sleep(5) diff --git a/python/xorbits/cluster/slurm-template.sh b/python/xorbits/cluster/slurm-template.sh index d0cbfbf83..4cb90b5b3 100644 --- a/python/xorbits/cluster/slurm-template.sh +++ b/python/xorbits/cluster/slurm-template.sh @@ -3,8 +3,8 @@ #SBATCH --nodes=2 #SBATCH --cpus-per-task=2 #SBATCH --ntasks-per-node=1 -#SBATCH --partition=batch #SBATCH --time=00:30:00 +#SBATCH --output=/shared_space/output.out source activate xorbits-dev @@ -24,7 +24,7 @@ web_port=16379 echo "Starting SUPERVISOR at ${head_node}" srun --nodes=1 --ntasks=1 -w "${head_node}" \ xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}" & -sleep 10 +sleep 30 # number of nodes other than the head node worker_num=$((SLURM_JOB_NUM_NODES - 1)) @@ -37,6 +37,6 @@ for ((i = 1; i <= worker_num; i++)); do srun --nodes=1 --ntasks=1 -w "${node_i}" \ xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}" & done -sleep 5 +sleep 30 address=http://"${head_node}":"${web_port}" diff --git a/python/xorbits/cluster/slurm.sh b/python/xorbits/cluster/slurm.sh index 7a8151747..756d11137 100755 --- a/python/xorbits/cluster/slurm.sh +++ b/python/xorbits/cluster/slurm.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --job-name=xorbits -#SBATCH --nodes=1 +#SBATCH --nodes=2 #SBATCH --output=/shared_space/output.out #SBATCH --time=00:30:00 set -x @@ -11,15 +11,15 @@ port=16380 web_port=16379 echo "Starting SUPERVISOR at ${head_node}" srun --nodes=1 --ntasks=1 -w "${head_node}" \ - xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}" & -sleep 20 + xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}"& +sleep 30 worker_num=$((SLURM_JOB_NUM_NODES - 1)) for ((i = 1; i <= worker_num; i++)); do node_i=${nodes_array[$i]} port_i=$((port + i)) echo "Starting WORKER $i at ${node_i}" srun --nodes=1 --ntasks=1 -w "${node_i}" \ - xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}" & + xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"& done -sleep 5 +sleep 30 address=http://"${head_node}":"${web_port}" \ No newline at end of file From e61b814e575ba38c9589e9102bebc4a3814a73b9 Mon Sep 17 00:00:00 2001 From: Aprilies <767760161@qq.com> Date: Tue, 26 Sep 2023 02:56:24 +0800 Subject: [PATCH 05/59] fix pre-commit --- .pre-commit-config.yaml | 6 +- python/xorbits/cluster/Slurm.py | 127 ++++++++++++++++------------- python/xorbits/cluster/__init__.py | 2 +- python/xorbits/cluster/slurm.sh | 2 +- python/xorbits/cluster/test.py | 11 ++- 5 files changed, 78 insertions(+), 70 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a7239dfcb..910052563 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,11 +26,7 @@ repos: additional_dependencies: [tokenize-rt==3.2.0] exclude: _mars args: [--ignore-missing-imports, --follow-imports, skip] - - repo: https://github.com/pre-commit/mirrors-prettier - rev: v3.0.0 # Use the sha or tag you want to point at - hooks: - - id: prettier - types_or: [html, javascript] + - repo: https://github.com/codespell-project/codespell rev: v2.2.5 hooks: diff --git a/python/xorbits/cluster/Slurm.py b/python/xorbits/cluster/Slurm.py index 087fae536..9bd58ae81 100644 --- a/python/xorbits/cluster/Slurm.py +++ b/python/xorbits/cluster/Slurm.py @@ -1,32 +1,35 @@ +import 
atexit import os +import re import subprocess import time -import atexit + import xorbits import xorbits.numpy as np -import re DEFAULT_JOB_NAME = "default_job" DEFAULT_NUMBER_NODES = 2 DEFAULT_PARTITION_OPTION = "batch" DEFAULT_LOAD_ENV = "LOAD_ENV" -class SLURMCluster: - def __init__(self, - job_name=None, - num_nodes=None, - partition_option=None, - load_env=None, - output_dir=None, - error_dir=None, - work_dir=None, - time=None, - processes=None, - cores=None, - memory=None, - account=None, - webport=16379): +class SLURMCluster: + def __init__( + self, + job_name=None, + num_nodes=None, + partition_option=None, + load_env=None, + output_dir=None, + error_dir=None, + work_dir=None, + time=None, + processes=None, + cores=None, + memory=None, + account=None, + webport=16379, + ): commands = ["#!/bin/bash"] self.job_name = job_name @@ -61,34 +64,34 @@ def __init__(self, if self.load_env: commands.append(f"source activate {self.load_env}") - commands += [ - "set -x", - 'nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")', - "nodes_array=($nodes)", - "head_node=${nodes_array[0]}", - "port=16380", - f"web_port={self.web_port}", - 'echo "Starting SUPERVISOR at ${head_node}"', - 'srun --nodes=1 --ntasks=1 -w "${head_node}" \\', - ' xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}"&', - "sleep 30", - 'worker_num=$((SLURM_JOB_NUM_NODES - 1))', - 'for ((i = 1; i <= worker_num; i++)); do', - ' node_i=${nodes_array[$i]}', - ' port_i=$((port + i))', - ' echo "Starting WORKER $i at ${node_i}"', - ' srun --nodes=1 --ntasks=1 -w "${node_i}" \\', - ' xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"&', - 'done', - "sleep 30", - 'address=http://"${head_node}":"${web_port}"', - ] + commands += [ + "set -x", + 'nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")', + "nodes_array=($nodes)", + "head_node=${nodes_array[0]}", + "port=16380", + f"web_port={self.web_port}", + 'echo "Starting SUPERVISOR at ${head_node}"', + 'srun --nodes=1 --ntasks=1 -w "${head_node}" \\', + ' xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}"&', + "sleep 30", + "worker_num=$((SLURM_JOB_NUM_NODES - 1))", + "for ((i = 1; i <= worker_num; i++)); do", + " node_i=${nodes_array[$i]}", + " port_i=$((port + i))", + ' echo "Starting WORKER $i at ${node_i}"', + ' srun --nodes=1 --ntasks=1 -w "${node_i}" \\', + ' xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"&', + "done", + "sleep 30", + 'address=http://"${head_node}":"${web_port}"', + ] self.commands = "\n".join(commands) def run(self): shell_commands = self.commands - with open("slurm.sh", 'w') as f: + with open("slurm.sh", "w") as f: f.write(shell_commands) os.chmod("slurm.sh", 0o770) @@ -112,7 +115,7 @@ def run(self): def get_job_id(self, sbatch_output): job_id = None - for line in sbatch_output.split('\n'): + for line in sbatch_output.split("\n"): if "Submitted batch job" in line: job_id = line.split(" ")[-1] return job_id @@ -131,55 +134,65 @@ def update_head_node(self): node_list = None if result.returncode == 0: job_info = result.stdout - node_list_pattern = r'NodeList=(c\[\d+-\d+\]|c\d)' + node_list_pattern = r"NodeList=(c\[\d+-\d+\]|c\d)" matches = re.search(node_list_pattern, job_info) if matches: node_list = matches.group(1) - print("NodeList:", node_list) + print("NodeList:", node_list) if node_list is None: - raise ValueError(f"Job {self.job_id} not found or NodeList information not available.") + raise ValueError( + f"Job {self.job_id} not found or NodeList information not available." 
+ ) # get_head_node from nodelist if "[" in node_list: - head_node = node_list.split("-")[0].replace("[","") + head_node = node_list.split("-")[0].replace("[", "") else: - # when only one node + # when only one node head_node = node_list self.head_node = head_node return head_node else: print("NodeList not found in the string.") - + except subprocess.CalledProcessError as e: print(f"Error executing scontrol: {e}") except ValueError as e: print(f"Error: {e}") except Exception as e: print(f"An unexpected error occurred: {e}") - + self.head_node = None print("Failed to retrieve head node.") - + def get_job_address(self, retry_attempts=10, sleep_interval=30): # We retry several times to get job data for attempt in range(retry_attempts): try: self.update_head_node() if self.head_node is not None: - address=f"http://{self.head_node}:{self.web_port}" + address = f"http://{self.head_node}:{self.web_port}" return address else: - print(f"Attempt {attempt + 1} failed, retrying after {sleep_interval}s...") + print( + f"Attempt {attempt + 1} failed, retrying after {sleep_interval}s..." + ) time.sleep(sleep_interval) except Exception as e: print(str(e)) + if __name__ == "__main__": - exp = SLURMCluster(job_name="xorbits",num_nodes=2,output_dir="/shared_space/output.out",time="00:30:00") - adress = exp.run() - print(adress) + exp = SLURMCluster( + job_name="xorbits", + num_nodes=2, + output_dir="/shared_space/output.out", + time="00:30:00", + ) + address = exp.run() + print(address) time.sleep(5) - adress = "http://c1:16379" - xorbits.init(adress) - print(np.random.rand(100, 100).mean()) \ No newline at end of file + address = "http://c1:16379" + xorbits.init(address) + print(np.random.rand(100, 100).mean()) diff --git a/python/xorbits/cluster/__init__.py b/python/xorbits/cluster/__init__.py index c319e6f0d..61215ec3f 100644 --- a/python/xorbits/cluster/__init__.py +++ b/python/xorbits/cluster/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .Slurm import SLURMCluster \ No newline at end of file +from .Slurm import SLURMCluster diff --git a/python/xorbits/cluster/slurm.sh b/python/xorbits/cluster/slurm.sh index 756d11137..4e44a9715 100755 --- a/python/xorbits/cluster/slurm.sh +++ b/python/xorbits/cluster/slurm.sh @@ -22,4 +22,4 @@ for ((i = 1; i <= worker_num; i++)); do xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"& done sleep 30 -address=http://"${head_node}":"${web_port}" \ No newline at end of file +address=http://"${head_node}":"${web_port}" diff --git a/python/xorbits/cluster/test.py b/python/xorbits/cluster/test.py index 3623018e1..77db33eea 100644 --- a/python/xorbits/cluster/test.py +++ b/python/xorbits/cluster/test.py @@ -1,10 +1,9 @@ import xorbits import xorbits.numpy as np -import time -#exp = SLURMCluster() -#adress = exp.run() -#print(adress) -#time.sleep(5) +# exp = SLURMCluster() +# address = exp.run() +# print(address) +# time.sleep(5) xorbits.init("http://c1:16379") -print(np.random.rand(5, 5).mean()) \ No newline at end of file +print(np.random.rand(5, 5).mean()) From dc237e1240ef7427e6a16dd7624d20207277bac9 Mon Sep 17 00:00:00 2001 From: Aprilies <767760161@qq.com> Date: Tue, 26 Sep 2023 02:59:35 +0800 Subject: [PATCH 06/59] support for slurm --- .pre-commit-config.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 910052563..a7239dfcb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,11 @@ repos: additional_dependencies: [tokenize-rt==3.2.0] exclude: _mars args: [--ignore-missing-imports, --follow-imports, skip] - + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.0.0 # Use the sha or tag you want to point at + hooks: + - id: prettier + types_or: [html, javascript] - repo: https://github.com/codespell-project/codespell rev: v2.2.5 hooks: From a8f04469f5108b7e4785362f1f40b8716580edd3 Mon Sep 17 00:00:00 2001 From: Aprilies <767760161@qq.com> Date: Tue, 26 Sep 2023 03:04:55 +0800 Subject: [PATCH 07/59] remove the cluster --- .gitignore | 2 +- .pre-commit-config.yaml | 5 ----- CI/slurm.sh | 2 +- python/xorbits/{ => deploy}/cluster/Slurm.py | 0 python/xorbits/{ => deploy}/cluster/__init__.py | 0 python/xorbits/{ => deploy}/cluster/slurm-template.sh | 0 python/xorbits/{ => deploy}/cluster/slurm.sh | 0 python/xorbits/{ => deploy}/cluster/test.py | 0 8 files changed, 2 insertions(+), 7 deletions(-) rename python/xorbits/{ => deploy}/cluster/Slurm.py (100%) rename python/xorbits/{ => deploy}/cluster/__init__.py (100%) rename python/xorbits/{ => deploy}/cluster/slurm-template.sh (100%) rename python/xorbits/{ => deploy}/cluster/slurm.sh (100%) rename python/xorbits/{ => deploy}/cluster/test.py (100%) diff --git a/.gitignore b/.gitignore index 602cc2a19..37e6ca30f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,7 @@ __pycache__/ # C extensions *.so - +.pre-commit-config.yaml # Distribution / packaging .Python build/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a7239dfcb..a750e6668 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,11 +26,6 @@ repos: additional_dependencies: [tokenize-rt==3.2.0] exclude: _mars args: [--ignore-missing-imports, --follow-imports, skip] - - repo: https://github.com/pre-commit/mirrors-prettier - rev: v3.0.0 # Use the sha or tag you want to point at - hooks: - - id: prettier - types_or: [html, javascript] - repo: https://github.com/codespell-project/codespell rev: v2.2.5 hooks: diff 
--git a/CI/slurm.sh b/CI/slurm.sh index f2dc86af8..c6ff3628a 100644 --- a/CI/slurm.sh +++ b/CI/slurm.sh @@ -34,7 +34,7 @@ function jobqueue_install { function jobqueue_script { docker exec c1 /bin/bash -c "pip install xorbits" docker exec c2 /bin/bash -c "pip install xorbits" - docker exec slurmctld /bin/bash -c "python /xorbits/python/xorbits/cluster/Slurm.py" + docker exec slurmctld /bin/bash -c "python /xorbits/python/xorbits/deploy/cluster/Slurm.py" docker exec slurmctld /bin/bash -c "cat /shared_space/output.out" } diff --git a/python/xorbits/cluster/Slurm.py b/python/xorbits/deploy/cluster/Slurm.py similarity index 100% rename from python/xorbits/cluster/Slurm.py rename to python/xorbits/deploy/cluster/Slurm.py diff --git a/python/xorbits/cluster/__init__.py b/python/xorbits/deploy/cluster/__init__.py similarity index 100% rename from python/xorbits/cluster/__init__.py rename to python/xorbits/deploy/cluster/__init__.py diff --git a/python/xorbits/cluster/slurm-template.sh b/python/xorbits/deploy/cluster/slurm-template.sh similarity index 100% rename from python/xorbits/cluster/slurm-template.sh rename to python/xorbits/deploy/cluster/slurm-template.sh diff --git a/python/xorbits/cluster/slurm.sh b/python/xorbits/deploy/cluster/slurm.sh similarity index 100% rename from python/xorbits/cluster/slurm.sh rename to python/xorbits/deploy/cluster/slurm.sh diff --git a/python/xorbits/cluster/test.py b/python/xorbits/deploy/cluster/test.py similarity index 100% rename from python/xorbits/cluster/test.py rename to python/xorbits/deploy/cluster/test.py From b375fefcf9c1400423805146967de3656633b3ab Mon Sep 17 00:00:00 2001 From: Aprilies <767760161@qq.com> Date: Tue, 26 Sep 2023 07:55:07 +0800 Subject: [PATCH 08/59] logging instead of print --- python/xorbits/deploy/cluster/Slurm.py | 39 +++++++++++++++----------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/python/xorbits/deploy/cluster/Slurm.py b/python/xorbits/deploy/cluster/Slurm.py index 9bd58ae81..aa5551bd6 100644 --- a/python/xorbits/deploy/cluster/Slurm.py +++ b/python/xorbits/deploy/cluster/Slurm.py @@ -1,4 +1,5 @@ import atexit +import logging import os import re import subprocess @@ -12,6 +13,10 @@ DEFAULT_PARTITION_OPTION = "batch" DEFAULT_LOAD_ENV = "LOAD_ENV" +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + class SLURMCluster: def __init__( @@ -99,18 +104,18 @@ def run(self): result = subprocess.run(["sbatch", "slurm.sh"], capture_output=True, text=True) if result.returncode == 0: - print("Job submitted successfully.") + logger.info("Job submitted successfully.") self.job_id = self.get_job_id(result.stdout) if self.job_id: - print(f"Job ID is {self.job_id}.") + logger.info(f"Job ID is {self.job_id}.") atexit.register(self.cancel_job) else: - print("Could not get job ID. Cleanup upon exit is not possible.") + logger.error("Could not get job ID. 
Cleanup upon exit is not possible.") return self.get_job_address() else: - print("Job submission failed.") - print("Output:", result.stdout) - print("Errors:", result.stderr) + logger.error("Job submission failed.") + logger.error("Output:", result.stdout) + logger.error("Errors:", result.stderr) return None def get_job_id(self, sbatch_output): @@ -122,7 +127,7 @@ def get_job_id(self, sbatch_output): def cancel_job(self): if self.job_id: - print(f"Cancelling job {self.job_id}") + logger.info(f"Cancelling job {self.job_id}") subprocess.run(["scancel", self.job_id]) def update_head_node(self): @@ -139,7 +144,7 @@ def update_head_node(self): if matches: node_list = matches.group(1) - print("NodeList:", node_list) + logger.info("NodeList:", node_list) if node_list is None: raise ValueError( f"Job {self.job_id} not found or NodeList information not available." @@ -154,17 +159,17 @@ def update_head_node(self): self.head_node = head_node return head_node else: - print("NodeList not found in the string.") + logger.warning("NodeList not found in the string.") except subprocess.CalledProcessError as e: - print(f"Error executing scontrol: {e}") + logger.error(f"Error executing scontrol: {e}") except ValueError as e: - print(f"Error: {e}") + logger.error(f"Error: {e}") except Exception as e: - print(f"An unexpected error occurred: {e}") + logger.error(f"An unexpected error occurred: {e}") self.head_node = None - print("Failed to retrieve head node.") + logger.warning("Failed to retrieve head node.") def get_job_address(self, retry_attempts=10, sleep_interval=30): # We retry several times to get job data @@ -175,12 +180,12 @@ def get_job_address(self, retry_attempts=10, sleep_interval=30): address = f"http://{self.head_node}:{self.web_port}" return address else: - print( + logger.warning( f"Attempt {attempt + 1} failed, retrying after {sleep_interval}s..." 
) time.sleep(sleep_interval) except Exception as e: - print(str(e)) + logger.error(str(e)) if __name__ == "__main__": @@ -191,8 +196,8 @@ def get_job_address(self, retry_attempts=10, sleep_interval=30): time="00:30:00", ) address = exp.run() - print(address) + logger.info(address) time.sleep(5) address = "http://c1:16379" xorbits.init(address) - print(np.random.rand(100, 100).mean()) + logger.info(np.random.rand(100, 100).mean()) From 6488731e8adfbe8fe43e3270290071b1c96bf69a Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Tue, 26 Sep 2023 08:16:32 +0800 Subject: [PATCH 09/59] change for default --- python/xorbits/deploy/cluster/Slurm.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/xorbits/deploy/cluster/Slurm.py b/python/xorbits/deploy/cluster/Slurm.py index aa5551bd6..ac37d8f47 100644 --- a/python/xorbits/deploy/cluster/Slurm.py +++ b/python/xorbits/deploy/cluster/Slurm.py @@ -8,11 +8,6 @@ import xorbits import xorbits.numpy as np -DEFAULT_JOB_NAME = "default_job" -DEFAULT_NUMBER_NODES = 2 -DEFAULT_PARTITION_OPTION = "batch" -DEFAULT_LOAD_ENV = "LOAD_ENV" - # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) From 253640591c36acef66e5aebe50ae082e645fac7e Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Tue, 26 Sep 2023 09:18:28 +0800 Subject: [PATCH 10/59] support for slurm pytest --- python/xorbits/deploy/cluster/Slurm.py | 21 ++++++++++++++------- python/xorbits/tests/test_slurm.py | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 python/xorbits/tests/test_slurm.py diff --git a/python/xorbits/deploy/cluster/Slurm.py b/python/xorbits/deploy/cluster/Slurm.py index ac37d8f47..806313c99 100644 --- a/python/xorbits/deploy/cluster/Slurm.py +++ b/python/xorbits/deploy/cluster/Slurm.py @@ -24,6 +24,7 @@ def __init__( error_dir=None, work_dir=None, time=None, + walltime=None, processes=None, cores=None, memory=None, @@ -37,16 +38,16 @@ def __init__( self.partition_option = partition_option self.output_dir = output_dir self.work_dir = work_dir - self.walltime = time + self.walltime = walltime + self.time = time self.processes = processes self.cores = cores self.memory = memory - self.account = account self.load_env = load_env - self.commands = None - self.web_port = webport + self.error_dir = error_dir + self.account = account slurm_params = { - "job-name": self.job_name, + "J": self.job_name, "nodes": self.num_nodes, "partition": self.partition_option, "output": self.output_dir, @@ -55,11 +56,17 @@ def __init__( "ntasks": self.processes, "cpus-per-task": self.cores, "mem": self.memory, + "t": self.time, + "A": self.account, } - + self.commands = None + self.web_port = webport for param, value in slurm_params.items(): if value is not None: - commands.append(f"#SBATCH --{param}={value}") + if len(value) > 1: + commands.append(f"#SBATCH --{param}={value}") + else: + commands.append(f"#SBATCH -{param}={value}") if self.load_env: commands.append(f"source activate {self.load_env}") diff --git a/python/xorbits/tests/test_slurm.py b/python/xorbits/tests/test_slurm.py new file mode 100644 index 000000000..bd1ea1bc9 --- /dev/null +++ b/python/xorbits/tests/test_slurm.py @@ -0,0 +1,25 @@ +from ..deploy.cluster import SLURMCluster + + +def test_header(): + with SLURMCluster(time="00:02:00", processes=4, cores=8, memory="28G") as cluster: + assert "#SBATCH" in cluster.commands + assert "#SBATCH -n 1" in cluster.commands + assert "#SBATCH --cpus-per-task=8" in cluster.commands 
+ assert "#SBATCH --mem28G" in cluster.commands + assert "#SBATCH -t 00:02:00" in cluster.commands + assert "#SBATCH -p" not in cluster.commands + # assert "#SBATCH -A" not in cluster.commands + + with SLURMCluster( + queue="regular", + account="XorbitsOnSlurm", + processes=4, + cores=8, + memory="28G", + ) as cluster: + assert "#SBATCH --cpus-per-task=8" in cluster.commands + assert "#SBATCH --mem=28G" in cluster.commands + assert "#SBATCH -t " in cluster.commands + assert "#SBATCH -A XorbitsOnSlurm" in cluster.commands + assert "#SBATCH --partion regular" in cluster.commands From 8c18a61b49e95400a26d2daf63724ff08cb609b3 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Tue, 26 Sep 2023 09:38:40 +0800 Subject: [PATCH 11/59] support for slurm pytestgi --- python/xorbits/deploy/cluster/Slurm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/xorbits/deploy/cluster/Slurm.py b/python/xorbits/deploy/cluster/Slurm.py index 806313c99..2211192f3 100644 --- a/python/xorbits/deploy/cluster/Slurm.py +++ b/python/xorbits/deploy/cluster/Slurm.py @@ -63,7 +63,7 @@ def __init__( self.web_port = webport for param, value in slurm_params.items(): if value is not None: - if len(value) > 1: + if len(str(value)) > 1: commands.append(f"#SBATCH --{param}={value}") else: commands.append(f"#SBATCH -{param}={value}") From 8961ad1b436ed201baded378849b58cdda7c98a9 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Tue, 26 Sep 2023 10:07:10 +0800 Subject: [PATCH 12/59] support for slurm --- python/xorbits/deploy/cluster/Slurm.py | 16 +++++++--------- python/xorbits/deploy/cluster/slurm.sh | 4 ++-- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/python/xorbits/deploy/cluster/Slurm.py b/python/xorbits/deploy/cluster/Slurm.py index 2211192f3..1f08d092f 100644 --- a/python/xorbits/deploy/cluster/Slurm.py +++ b/python/xorbits/deploy/cluster/Slurm.py @@ -52,18 +52,17 @@ def __init__( "partition": self.partition_option, "output": self.output_dir, "workdir": self.work_dir, - "time": self.walltime, + "time": self.time, "ntasks": self.processes, "cpus-per-task": self.cores, "mem": self.memory, - "t": self.time, "A": self.account, } self.commands = None self.web_port = webport for param, value in slurm_params.items(): if value is not None: - if len(str(value)) > 1: + if len(str(param)) > 1: commands.append(f"#SBATCH --{param}={value}") else: commands.append(f"#SBATCH -{param}={value}") @@ -90,7 +89,7 @@ def __init__( ' srun --nodes=1 --ntasks=1 -w "${node_i}" \\', ' xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"&', "done", - "sleep 30", + "sleep 100", 'address=http://"${head_node}":"${web_port}"', ] @@ -116,8 +115,9 @@ def run(self): return self.get_job_address() else: logger.error("Job submission failed.") - logger.error("Output:", result.stdout) - logger.error("Errors:", result.stderr) + logger.error("Output: {}".format(result.stdout)) + logger.error("Errors: {}".format(result.stderr)) + return None def get_job_id(self, sbatch_output): @@ -146,7 +146,7 @@ def update_head_node(self): if matches: node_list = matches.group(1) - logger.info("NodeList:", node_list) + logger.info(f"NodeList:{node_list}") if node_list is None: raise ValueError( f"Job {self.job_id} not found or NodeList information not available." 
@@ -199,7 +199,5 @@ def get_job_address(self, retry_attempts=10, sleep_interval=30): ) address = exp.run() logger.info(address) - time.sleep(5) - address = "http://c1:16379" xorbits.init(address) logger.info(np.random.rand(100, 100).mean()) diff --git a/python/xorbits/deploy/cluster/slurm.sh b/python/xorbits/deploy/cluster/slurm.sh index 4e44a9715..c20699f49 100755 --- a/python/xorbits/deploy/cluster/slurm.sh +++ b/python/xorbits/deploy/cluster/slurm.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=xorbits +#SBATCH -J=xorbits #SBATCH --nodes=2 #SBATCH --output=/shared_space/output.out #SBATCH --time=00:30:00 @@ -21,5 +21,5 @@ for ((i = 1; i <= worker_num; i++)); do srun --nodes=1 --ntasks=1 -w "${node_i}" \ xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"& done -sleep 30 +sleep 100 address=http://"${head_node}":"${web_port}" From c63232c2f24289c247319c08f5f572688c9c1d48 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Tue, 26 Sep 2023 10:31:14 +0800 Subject: [PATCH 13/59] modified: slurm.sh modified: ../python/xorbits/deploy/cluster/Slurm.py modified: ../python/xorbits/deploy/cluster/slurm.sh --- CI/slurm.sh | 2 +- python/xorbits/deploy/cluster/Slurm.py | 6 +++--- python/xorbits/deploy/cluster/slurm.sh | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CI/slurm.sh b/CI/slurm.sh index c6ff3628a..962123d07 100644 --- a/CI/slurm.sh +++ b/CI/slurm.sh @@ -42,4 +42,4 @@ function jobqueue_after_script { docker exec slurmctld bash -c 'sinfo' docker exec slurmctld bash -c 'squeue' docker exec slurmctld bash -c 'sacct -l' -} +} \ No newline at end of file diff --git a/python/xorbits/deploy/cluster/Slurm.py b/python/xorbits/deploy/cluster/Slurm.py index 1f08d092f..dfb3a4586 100644 --- a/python/xorbits/deploy/cluster/Slurm.py +++ b/python/xorbits/deploy/cluster/Slurm.py @@ -65,7 +65,7 @@ def __init__( if len(str(param)) > 1: commands.append(f"#SBATCH --{param}={value}") else: - commands.append(f"#SBATCH -{param}={value}") + commands.append(f"#SBATCH -{param} {value}") if self.load_env: commands.append(f"source activate {self.load_env}") @@ -89,7 +89,7 @@ def __init__( ' srun --nodes=1 --ntasks=1 -w "${node_i}" \\', ' xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"&', "done", - "sleep 100", + "sleep 300", 'address=http://"${head_node}":"${web_port}"', ] @@ -200,4 +200,4 @@ def get_job_address(self, retry_attempts=10, sleep_interval=30): address = exp.run() logger.info(address) xorbits.init(address) - logger.info(np.random.rand(100, 100).mean()) + test = np.random.rand(100, 100).mean() diff --git a/python/xorbits/deploy/cluster/slurm.sh b/python/xorbits/deploy/cluster/slurm.sh index c20699f49..b41bcdc5a 100755 --- a/python/xorbits/deploy/cluster/slurm.sh +++ b/python/xorbits/deploy/cluster/slurm.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH -J=xorbits +#SBATCH -J xorbits #SBATCH --nodes=2 #SBATCH --output=/shared_space/output.out #SBATCH --time=00:30:00 @@ -21,5 +21,5 @@ for ((i = 1; i <= worker_num; i++)); do srun --nodes=1 --ntasks=1 -w "${node_i}" \ xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"& done -sleep 100 +sleep 300 address=http://"${head_node}":"${web_port}" From 391b6faa153f5e215d4d19302d6e9b86a10bb91c Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Tue, 26 Sep 2023 10:46:39 +0800 Subject: [PATCH 14/59] modified: ../python/xorbits/deploy/cluster/Slurm.py --- python/xorbits/deploy/cluster/Slurm.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/python/xorbits/deploy/cluster/Slurm.py b/python/xorbits/deploy/cluster/Slurm.py index dfb3a4586..f8d23124d 100644 --- a/python/xorbits/deploy/cluster/Slurm.py +++ b/python/xorbits/deploy/cluster/Slurm.py @@ -201,3 +201,4 @@ def get_job_address(self, retry_attempts=10, sleep_interval=30): logger.info(address) xorbits.init(address) test = np.random.rand(100, 100).mean() + logging.info(f"test_result_{test}") From 10ccff24d39720d95ce9c211ff575a3077eea39a Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Fri, 29 Sep 2023 10:16:05 +0800 Subject: [PATCH 15/59] support for slurm --- .gitignore | 1 - python/xorbits/deploy/cluster/__init__.py | 2 +- .../xorbits/deploy/cluster/slurm-template.sh | 42 --------------- .../deploy/cluster/{Slurm.py => slurm.py} | 30 +++++------ python/xorbits/deploy/cluster/slurm.sh | 25 --------- python/xorbits/deploy/cluster/test.py | 9 ---- .../xorbits/deploy/cluster/tests/__init__.py | 13 +++++ .../deploy/cluster/tests/test_slurm.py | 54 +++++++++++++++++++ python/xorbits/tests/test_slurm.py | 25 --------- 9 files changed, 81 insertions(+), 120 deletions(-) delete mode 100644 python/xorbits/deploy/cluster/slurm-template.sh rename python/xorbits/deploy/cluster/{Slurm.py => slurm.py} (91%) delete mode 100755 python/xorbits/deploy/cluster/slurm.sh delete mode 100644 python/xorbits/deploy/cluster/test.py create mode 100644 python/xorbits/deploy/cluster/tests/__init__.py create mode 100644 python/xorbits/deploy/cluster/tests/test_slurm.py delete mode 100644 python/xorbits/tests/test_slurm.py diff --git a/.gitignore b/.gitignore index 37e6ca30f..b0b6eac07 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ __pycache__/ # C extensions *.so -.pre-commit-config.yaml # Distribution / packaging .Python build/ diff --git a/python/xorbits/deploy/cluster/__init__.py b/python/xorbits/deploy/cluster/__init__.py index 61215ec3f..13bda3f82 100644 --- a/python/xorbits/deploy/cluster/__init__.py +++ b/python/xorbits/deploy/cluster/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .Slurm import SLURMCluster +from .slurm import SLURMCluster diff --git a/python/xorbits/deploy/cluster/slurm-template.sh b/python/xorbits/deploy/cluster/slurm-template.sh deleted file mode 100644 index 4cb90b5b3..000000000 --- a/python/xorbits/deploy/cluster/slurm-template.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=xorbits -#SBATCH --nodes=2 -#SBATCH --cpus-per-task=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --time=00:30:00 -#SBATCH --output=/shared_space/output.out - - -source activate xorbits-dev - -### Use the debug mode to see if the shell commands are correct. -### If you do not want the shell command logs, delete the following line. 
-set -x - -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} -port=16380 -web_port=16379 - -echo "Starting SUPERVISOR at ${head_node}" -srun --nodes=1 --ntasks=1 -w "${head_node}" \ - xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}" & -sleep 30 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - port_i=$((port + i)) - - echo "Starting WORKER $i at ${node_i}" - srun --nodes=1 --ntasks=1 -w "${node_i}" \ - xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}" & -done -sleep 30 - -address=http://"${head_node}":"${web_port}" diff --git a/python/xorbits/deploy/cluster/Slurm.py b/python/xorbits/deploy/cluster/slurm.py similarity index 91% rename from python/xorbits/deploy/cluster/Slurm.py rename to python/xorbits/deploy/cluster/slurm.py index f8d23124d..756a56d0f 100644 --- a/python/xorbits/deploy/cluster/Slurm.py +++ b/python/xorbits/deploy/cluster/slurm.py @@ -1,3 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import atexit import logging import os @@ -5,9 +18,6 @@ import subprocess import time -import xorbits -import xorbits.numpy as np - # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -188,17 +198,3 @@ def get_job_address(self, retry_attempts=10, sleep_interval=30): time.sleep(sleep_interval) except Exception as e: logger.error(str(e)) - - -if __name__ == "__main__": - exp = SLURMCluster( - job_name="xorbits", - num_nodes=2, - output_dir="/shared_space/output.out", - time="00:30:00", - ) - address = exp.run() - logger.info(address) - xorbits.init(address) - test = np.random.rand(100, 100).mean() - logging.info(f"test_result_{test}") diff --git a/python/xorbits/deploy/cluster/slurm.sh b/python/xorbits/deploy/cluster/slurm.sh deleted file mode 100755 index b41bcdc5a..000000000 --- a/python/xorbits/deploy/cluster/slurm.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -#SBATCH -J xorbits -#SBATCH --nodes=2 -#SBATCH --output=/shared_space/output.out -#SBATCH --time=00:30:00 -set -x -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) -head_node=${nodes_array[0]} -port=16380 -web_port=16379 -echo "Starting SUPERVISOR at ${head_node}" -srun --nodes=1 --ntasks=1 -w "${head_node}" \ - xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}"& -sleep 30 -worker_num=$((SLURM_JOB_NUM_NODES - 1)) -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - port_i=$((port + i)) - echo "Starting WORKER $i at ${node_i}" - srun --nodes=1 --ntasks=1 -w "${node_i}" \ - xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"& -done -sleep 300 -address=http://"${head_node}":"${web_port}" diff --git a/python/xorbits/deploy/cluster/test.py b/python/xorbits/deploy/cluster/test.py deleted file mode 100644 index 77db33eea..000000000 --- 
a/python/xorbits/deploy/cluster/test.py +++ /dev/null @@ -1,9 +0,0 @@ -import xorbits -import xorbits.numpy as np - -# exp = SLURMCluster() -# address = exp.run() -# print(address) -# time.sleep(5) -xorbits.init("http://c1:16379") -print(np.random.rand(5, 5).mean()) diff --git a/python/xorbits/deploy/cluster/tests/__init__.py b/python/xorbits/deploy/cluster/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/deploy/cluster/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/deploy/cluster/tests/test_slurm.py b/python/xorbits/deploy/cluster/tests/test_slurm.py new file mode 100644 index 000000000..49ef730dc --- /dev/null +++ b/python/xorbits/deploy/cluster/tests/test_slurm.py @@ -0,0 +1,54 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import init +from ... import pandas as pd +from .. 
import SLURMCluster + + +def test_header(): + with SLURMCluster(time="00:02:00", processes=4, cores=8, memory="28G") as cluster: + assert "#SBATCH" in cluster.commands + assert "#SBATCH -n 1" in cluster.commands + assert "#SBATCH --cpus-per-task=8" in cluster.commands + assert "#SBATCH --mem28G" in cluster.commands + assert "#SBATCH -t 00:02:00" in cluster.commands + assert "#SBATCH -p" not in cluster.commands + # assert "#SBATCH -A" not in cluster.commands + + with SLURMCluster( + queue="regular", + account="XorbitsOnSlurm", + processes=4, + cores=8, + memory="28G", + ) as cluster: + assert "#SBATCH --cpus-per-task=8" in cluster.commands + assert "#SBATCH --mem=28G" in cluster.commands + assert "#SBATCH -t " in cluster.commands + assert "#SBATCH -A XorbitsOnSlurm" in cluster.commands + assert "#SBATCH --partion regular" in cluster.commands + + +def test_jobscript(): + exp = SLURMCluster( + job_name="xorbits", + num_nodes=2, + output_dir="/shared_space/output.out", + time="00:30:00", + ) + address = exp.run() + assert address == "http://c1:16379" + init(address) + assert repr(pd.Series([1, 2, 3]).sum()) == "6" diff --git a/python/xorbits/tests/test_slurm.py b/python/xorbits/tests/test_slurm.py deleted file mode 100644 index bd1ea1bc9..000000000 --- a/python/xorbits/tests/test_slurm.py +++ /dev/null @@ -1,25 +0,0 @@ -from ..deploy.cluster import SLURMCluster - - -def test_header(): - with SLURMCluster(time="00:02:00", processes=4, cores=8, memory="28G") as cluster: - assert "#SBATCH" in cluster.commands - assert "#SBATCH -n 1" in cluster.commands - assert "#SBATCH --cpus-per-task=8" in cluster.commands - assert "#SBATCH --mem28G" in cluster.commands - assert "#SBATCH -t 00:02:00" in cluster.commands - assert "#SBATCH -p" not in cluster.commands - # assert "#SBATCH -A" not in cluster.commands - - with SLURMCluster( - queue="regular", - account="XorbitsOnSlurm", - processes=4, - cores=8, - memory="28G", - ) as cluster: - assert "#SBATCH --cpus-per-task=8" in cluster.commands - assert "#SBATCH --mem=28G" in cluster.commands - assert "#SBATCH -t " in cluster.commands - assert "#SBATCH -A XorbitsOnSlurm" in cluster.commands - assert "#SBATCH --partion regular" in cluster.commands From abb344fd2218f182e5bad31a5058ca05f19df983 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Fri, 29 Sep 2023 10:45:35 +0800 Subject: [PATCH 16/59] change for workflow --- .github/workflows/python.yaml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index a508a6f9a..c83b95787 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -96,6 +96,7 @@ jobs: - { os: self-hosted, module: gpu, python-version: 3.9} - { os: ubuntu-latest, module: jax, python-version: 3.9 } - { os: juicefs-ci, module: kubernetes-juicefs, python-version: 3.9 } + - { os: slurm-ci, module: slurm, python-version: 3.9 } - { os: ubuntu-latest, module: datasets, python-version: 3.9 } steps: - name: Check out code @@ -247,6 +248,18 @@ jobs: python setup.py build_ext -i working-directory: ./python + - name: Slurm Setup Job queuing system + if: ${{ matrix.module == 'slurm' }} + run: | + source CI/${{ matrix.jobqueue }}.sh + jobqueue_before_install + + - name: Slurm Install xorbits + if: ${{ matrix.module == 'slurm' }} + run: | + source CI/${{ matrix.jobqueue }}.sh + jobqueue_install + - name: Install on GPU if: ${{ matrix.module == 'gpu' }} run: | @@ -285,6 +298,10 @@ jobs: pytest --ignore xorbits/_mars/ --timeout=1500 \ -W 
ignore::PendingDeprecationWarning \ --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/kubernetes/external_storage/juicefs + elif [[ "$MODULE" == "slurm" ]]; then + pytest --ignore xorbits/_mars/ --timeout=1500 \ + -W ignore::PendingDeprecationWarning \ + --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/cluster elif [[ "$MODULE" == "hadoop" ]]; then export WITH_HADOOP="1" export HADOOP_HOME="/usr/local/hadoop" @@ -376,6 +393,13 @@ jobs: fi working-directory: ./python + + - name: Cleanup on slurm + if: ${{ matrix.module == 'slurm' }} + run: | + source CI/${{ matrix.jobqueue }}.sh + jobqueue_after_script + - name: Report coverage data uses: codecov/codecov-action@v3 with: From 4470dcf09dac7cbccdf7046fd8f14207baae10fb Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Fri, 29 Sep 2023 10:51:00 +0800 Subject: [PATCH 17/59] change for workflow --- .github/workflows/python.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index c83b95787..7ff4f1d01 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -299,9 +299,12 @@ jobs: -W ignore::PendingDeprecationWarning \ --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/kubernetes/external_storage/juicefs elif [[ "$MODULE" == "slurm" ]]; then - pytest --ignore xorbits/_mars/ --timeout=1500 \ + docker exec c1 /bin/bash -c "pip install xorbits" + docker exec c2 /bin/bash -c "pip install xorbits" + docker exec slurmctld /bin/bash -c \ + "pytest --ignore xorbits/_mars/ --timeout=1500 \ -W ignore::PendingDeprecationWarning \ - --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/cluster + --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/cluster" elif [[ "$MODULE" == "hadoop" ]]; then export WITH_HADOOP="1" export HADOOP_HOME="/usr/local/hadoop" From d52c6cc0386730117f543a904fe95185aa609ef3 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Fri, 29 Sep 2023 10:54:45 +0800 Subject: [PATCH 18/59] change for workflow --- .github/workflows/python.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 7ff4f1d01..e55a491b5 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -251,13 +251,13 @@ jobs: - name: Slurm Setup Job queuing system if: ${{ matrix.module == 'slurm' }} run: | - source CI/${{ matrix.jobqueue }}.sh + source CI/${{ matrix.module }}.sh jobqueue_before_install - name: Slurm Install xorbits if: ${{ matrix.module == 'slurm' }} run: | - source CI/${{ matrix.jobqueue }}.sh + source CI/${{ matrix.module }}.sh jobqueue_install - name: Install on GPU @@ -400,7 +400,7 @@ jobs: - name: Cleanup on slurm if: ${{ matrix.module == 'slurm' }} run: | - source CI/${{ matrix.jobqueue }}.sh + source CI/${{ matrix.module }}.sh jobqueue_after_script - name: Report coverage data From a61defd7a22a244f8fed4b56d6d4131accf83b2e Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Fri, 29 Sep 2023 11:08:20 +0800 Subject: [PATCH 19/59] support for slurm --- python/xorbits/deploy/{cluster => slurm}/__init__.py | 0 python/xorbits/deploy/{cluster => slurm}/slurm.py | 0 python/xorbits/deploy/{cluster => slurm}/tests/__init__.py | 0 python/xorbits/deploy/{cluster => slurm}/tests/test_slurm.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename python/xorbits/deploy/{cluster => slurm}/__init__.py (100%) rename 
python/xorbits/deploy/{cluster => slurm}/slurm.py (100%) rename python/xorbits/deploy/{cluster => slurm}/tests/__init__.py (100%) rename python/xorbits/deploy/{cluster => slurm}/tests/test_slurm.py (100%) diff --git a/python/xorbits/deploy/cluster/__init__.py b/python/xorbits/deploy/slurm/__init__.py similarity index 100% rename from python/xorbits/deploy/cluster/__init__.py rename to python/xorbits/deploy/slurm/__init__.py diff --git a/python/xorbits/deploy/cluster/slurm.py b/python/xorbits/deploy/slurm/slurm.py similarity index 100% rename from python/xorbits/deploy/cluster/slurm.py rename to python/xorbits/deploy/slurm/slurm.py diff --git a/python/xorbits/deploy/cluster/tests/__init__.py b/python/xorbits/deploy/slurm/tests/__init__.py similarity index 100% rename from python/xorbits/deploy/cluster/tests/__init__.py rename to python/xorbits/deploy/slurm/tests/__init__.py diff --git a/python/xorbits/deploy/cluster/tests/test_slurm.py b/python/xorbits/deploy/slurm/tests/test_slurm.py similarity index 100% rename from python/xorbits/deploy/cluster/tests/test_slurm.py rename to python/xorbits/deploy/slurm/tests/test_slurm.py From 9d24d65296b7a9fd8c42f98ed0584f648c12bb56 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Sat, 30 Sep 2023 03:43:15 +0800 Subject: [PATCH 20/59] pytest add for xorbits slurm deploy --- python/xorbits/deploy/slurm/slurm.py | 13 +++-- python/xorbits/deploy/slurm/tests/slurm.sh | 25 ++++++++ .../xorbits/deploy/slurm/tests/test_slurm.py | 57 ++++++++++++------- 3 files changed, 70 insertions(+), 25 deletions(-) create mode 100755 python/xorbits/deploy/slurm/tests/slurm.sh diff --git a/python/xorbits/deploy/slurm/slurm.py b/python/xorbits/deploy/slurm/slurm.py index 756a56d0f..bd521cadc 100644 --- a/python/xorbits/deploy/slurm/slurm.py +++ b/python/xorbits/deploy/slurm/slurm.py @@ -30,8 +30,8 @@ def __init__( num_nodes=None, partition_option=None, load_env=None, - output_dir=None, - error_dir=None, + output_path=None, + error_path=None, work_dir=None, time=None, walltime=None, @@ -46,7 +46,7 @@ def __init__( self.job_name = job_name self.num_nodes = num_nodes self.partition_option = partition_option - self.output_dir = output_dir + self.output_path = output_path self.work_dir = work_dir self.walltime = walltime self.time = time @@ -54,14 +54,15 @@ def __init__( self.cores = cores self.memory = memory self.load_env = load_env - self.error_dir = error_dir + self.error_path = error_path self.account = account slurm_params = { "J": self.job_name, "nodes": self.num_nodes, "partition": self.partition_option, - "output": self.output_dir, - "workdir": self.work_dir, + "error": self.error_path, + "output": self.output_path, + "chdir": self.work_dir, "time": self.time, "ntasks": self.processes, "cpus-per-task": self.cores, diff --git a/python/xorbits/deploy/slurm/tests/slurm.sh b/python/xorbits/deploy/slurm/tests/slurm.sh new file mode 100755 index 000000000..b41bcdc5a --- /dev/null +++ b/python/xorbits/deploy/slurm/tests/slurm.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH -J xorbits +#SBATCH --nodes=2 +#SBATCH --output=/shared_space/output.out +#SBATCH --time=00:30:00 +set -x +nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +nodes_array=($nodes) +head_node=${nodes_array[0]} +port=16380 +web_port=16379 +echo "Starting SUPERVISOR at ${head_node}" +srun --nodes=1 --ntasks=1 -w "${head_node}" \ + xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}"& +sleep 30 +worker_num=$((SLURM_JOB_NUM_NODES - 1)) +for ((i = 1; i <= worker_num; i++)); do + 
node_i=${nodes_array[$i]} + port_i=$((port + i)) + echo "Starting WORKER $i at ${node_i}" + srun --nodes=1 --ntasks=1 -w "${node_i}" \ + xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"& +done +sleep 300 +address=http://"${head_node}":"${web_port}" diff --git a/python/xorbits/deploy/slurm/tests/test_slurm.py b/python/xorbits/deploy/slurm/tests/test_slurm.py index 49ef730dc..9e6562af5 100644 --- a/python/xorbits/deploy/slurm/tests/test_slurm.py +++ b/python/xorbits/deploy/slurm/tests/test_slurm.py @@ -13,39 +13,58 @@ # limitations under the License. from .... import init -from ... import pandas as pd +from .... import pandas as pd from .. import SLURMCluster -def test_header(): - with SLURMCluster(time="00:02:00", processes=4, cores=8, memory="28G") as cluster: - assert "#SBATCH" in cluster.commands - assert "#SBATCH -n 1" in cluster.commands - assert "#SBATCH --cpus-per-task=8" in cluster.commands - assert "#SBATCH --mem28G" in cluster.commands - assert "#SBATCH -t 00:02:00" in cluster.commands - assert "#SBATCH -p" not in cluster.commands - # assert "#SBATCH -A" not in cluster.commands +def test_header_core_process_memory(): + cluster = SLURMCluster(time="00:02:00", processes=4, cores=8, memory="28G") + assert "#SBATCH" in cluster.commands + assert "#SBATCH --cpus-per-task=8" in cluster.commands + assert "#SBATCH --mem=28G" in cluster.commands + assert "#SBATCH --time=00:02:00" in cluster.commands + assert "#SBATCH -A" not in cluster.commands - with SLURMCluster( - queue="regular", + +def test_header_partition_account(): + cluster = SLURMCluster( + partition_option="regular", account="XorbitsOnSlurm", processes=4, cores=8, memory="28G", - ) as cluster: - assert "#SBATCH --cpus-per-task=8" in cluster.commands - assert "#SBATCH --mem=28G" in cluster.commands - assert "#SBATCH -t " in cluster.commands - assert "#SBATCH -A XorbitsOnSlurm" in cluster.commands - assert "#SBATCH --partion regular" in cluster.commands + ) + assert "#SBATCH --cpus-per-task=8" in cluster.commands + assert "#SBATCH --mem=28G" in cluster.commands + assert "#SBATCH -A XorbitsOnSlurm" in cluster.commands + assert "#SBATCH --partition=regular" in cluster.commands + + +def test_header_work_outputdir_web(): + # Test additional parameters + cluster = SLURMCluster( + job_name="my_job", + num_nodes=10, + output_path="/path/to/output", + work_dir="/path/to/work", + error_path="/path/to/error", + webport=8080, + load_env="xorbits", + ) + assert "#SBATCH -J my_job" in cluster.commands + assert "#SBATCH --nodes=10" in cluster.commands + assert "#SBATCH --output=/path/to/output" in cluster.commands + assert "#SBATCH --chdir=/path/to/work" in cluster.commands + assert "#SBATCH --error=/path/to/error" in cluster.commands + assert "web_port=8080" in cluster.commands + assert "source activate xorbits" in cluster.commands def test_jobscript(): exp = SLURMCluster( job_name="xorbits", num_nodes=2, - output_dir="/shared_space/output.out", + output_path="/shared_space/output.out", time="00:30:00", ) address = exp.run() From 50e3d80c361d70dfed77b0ceb79eba75cb958a07 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Sat, 30 Sep 2023 03:45:35 +0800 Subject: [PATCH 21/59] pytest add for xorbits slurm deploy --- CI/slurm.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CI/slurm.sh b/CI/slurm.sh index 962123d07..0fc6c6840 100644 --- a/CI/slurm.sh +++ b/CI/slurm.sh @@ -34,8 +34,10 @@ function jobqueue_install { function jobqueue_script { docker exec c1 /bin/bash -c "pip install xorbits" 
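    # Both compute containers (c1 and c2) need their own xorbits install so the
    # srun-launched workers can import it; the test run itself is driven from the
    # controller container (slurmctld) below.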
docker exec c2 /bin/bash -c "pip install xorbits" - docker exec slurmctld /bin/bash -c "python /xorbits/python/xorbits/deploy/cluster/Slurm.py" - docker exec slurmctld /bin/bash -c "cat /shared_space/output.out" + docker exec slurmctld /bin/bash -c \ + "pytest --ignore xorbits/_mars/ --timeout=1500 \ + -W ignore::PendingDeprecationWarning \ + --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/cluster" } function jobqueue_after_script { From 19f255d8f0795441f71772bdb5191266ccfcc9fb Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Mon, 16 Oct 2023 06:54:33 +0800 Subject: [PATCH 22/59] t p modified: .github/workflows/python.yaml modified: .gitignore modified: python/setup.cfg --- .github/workflows/python.yaml | 2 +- .gitignore | 1 + python/setup.cfg | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index e55a491b5..6d252d62b 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -304,7 +304,7 @@ jobs: docker exec slurmctld /bin/bash -c \ "pytest --ignore xorbits/_mars/ --timeout=1500 \ -W ignore::PendingDeprecationWarning \ - --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/cluster" + --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/slurm" elif [[ "$MODULE" == "hadoop" ]]; then export WITH_HADOOP="1" export HADOOP_HOME="/usr/local/hadoop" diff --git a/.gitignore b/.gitignore index b0b6eac07..602cc2a19 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ __pycache__/ # C extensions *.so + # Distribution / packaging .Python build/ diff --git a/python/setup.cfg b/python/setup.cfg index ec6576c65..fb7755738 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -113,6 +113,7 @@ omit = xorbits/_version.py *.pxd */tests/* + xorbits/deploy/slurm/slurm.py xorbits/deploy/kubernetes/core.py xorbits/deploy/kubernetes/supervisor.py xorbits/deploy/kubernetes/worker.py From 5900cadf94e0d9566f47b59285b5fcbf168ce5b9 Mon Sep 17 00:00:00 2001 From: liddle rain <57928993+fengsxy@users.noreply.github.com> Date: Sun, 15 Oct 2023 15:58:49 -0700 Subject: [PATCH 23/59] Delete CI/slurm.sh Signed-off-by: liddle rain <57928993+fengsxy@users.noreply.github.com> --- CI/slurm.sh | 47 ----------------------------------------------- 1 file changed, 47 deletions(-) delete mode 100644 CI/slurm.sh diff --git a/CI/slurm.sh b/CI/slurm.sh deleted file mode 100644 index 0fc6c6840..000000000 --- a/CI/slurm.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash - -function jobqueue_before_install { - docker version - docker-compose version - - # start slurm cluster - cd ./CI/slurm - docker-compose pull - ./start-slurm.sh - cd - - - #Set shared space permissions - docker exec slurmctld /bin/bash -c "chmod -R 777 /shared_space" - - docker ps -a - docker images - show_network_interfaces -} - -function show_network_interfaces { - for c in slurmctld c1 c2; do - echo '------------------------------------------------------------' - echo docker container: $c - docker exec $c python -c 'import psutil; print(psutil.net_if_addrs().keys())' - echo '------------------------------------------------------------' - done -} - -function jobqueue_install { - docker exec slurmctld /bin/bash -c "cd xorbits/python/; pip install -e ." 
-} - -function jobqueue_script { - docker exec c1 /bin/bash -c "pip install xorbits" - docker exec c2 /bin/bash -c "pip install xorbits" - docker exec slurmctld /bin/bash -c \ - "pytest --ignore xorbits/_mars/ --timeout=1500 \ - -W ignore::PendingDeprecationWarning \ - --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/cluster" -} - -function jobqueue_after_script { - docker exec slurmctld bash -c 'sinfo' - docker exec slurmctld bash -c 'squeue' - docker exec slurmctld bash -c 'sacct -l' -} \ No newline at end of file From 86ed567ec21cd659ff17882758d2021577c3ab49 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Mon, 16 Oct 2023 07:00:48 +0800 Subject: [PATCH 24/59] modified: .gitignore deleted: python/xorbits/deploy/slurm/tests/slurm.sh --- .gitignore | 5 ++++- python/xorbits/deploy/slurm/tests/slurm.sh | 25 ---------------------- 2 files changed, 4 insertions(+), 26 deletions(-) delete mode 100755 python/xorbits/deploy/slurm/tests/slurm.sh diff --git a/.gitignore b/.gitignore index 602cc2a19..d2ba47490 100644 --- a/.gitignore +++ b/.gitignore @@ -151,4 +151,7 @@ doc/source/savefig/ asv/results -.DS_Store \ No newline at end of file +.DS_Store + +#slrm.sh generated sh +python/xorbits/deploy/slurm/tests/slurm.sh \ No newline at end of file diff --git a/python/xorbits/deploy/slurm/tests/slurm.sh b/python/xorbits/deploy/slurm/tests/slurm.sh deleted file mode 100755 index b41bcdc5a..000000000 --- a/python/xorbits/deploy/slurm/tests/slurm.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -#SBATCH -J xorbits -#SBATCH --nodes=2 -#SBATCH --output=/shared_space/output.out -#SBATCH --time=00:30:00 -set -x -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) -head_node=${nodes_array[0]} -port=16380 -web_port=16379 -echo "Starting SUPERVISOR at ${head_node}" -srun --nodes=1 --ntasks=1 -w "${head_node}" \ - xorbits-supervisor -H "${head_node}" -p "${port}" -w "${web_port}"& -sleep 30 -worker_num=$((SLURM_JOB_NUM_NODES - 1)) -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - port_i=$((port + i)) - echo "Starting WORKER $i at ${node_i}" - srun --nodes=1 --ntasks=1 -w "${node_i}" \ - xorbits-worker -H "${node_i}" -p "${port_i}" -s "${head_node}":"${port}"& -done -sleep 300 -address=http://"${head_node}":"${web_port}" From 8a69102789b94607b54ab1b34f6432c8b5f1a352 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Mon, 16 Oct 2023 07:06:05 +0800 Subject: [PATCH 25/59] new file: CI/slurm.sh --- CI/slurm.sh | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 CI/slurm.sh diff --git a/CI/slurm.sh b/CI/slurm.sh new file mode 100644 index 000000000..3b919c41b --- /dev/null +++ b/CI/slurm.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +function jobqueue_before_install { + docker version + docker-compose version + + # start slurm cluster + cd ./CI/slurm + docker-compose pull + ./start-slurm.sh + cd - + + #Set shared space permissions + docker exec slurmctld /bin/bash -c "chmod -R 777 /shared_space" + + docker ps -a + docker images + show_network_interfaces +} + +function show_network_interfaces { + for c in slurmctld c1 c2; do + echo '------------------------------------------------------------' + echo docker container: $c + docker exec $c python -c 'import psutil; print(psutil.net_if_addrs().keys())' + echo '------------------------------------------------------------' + done +} + +function jobqueue_install { + docker exec slurmctld /bin/bash -c "cd xorbits/python/; pip install -e ." 
+} + +function jobqueue_script { + docker exec c1 /bin/bash -c "pip install xorbits" + docker exec c2 /bin/bash -c "pip install xorbits" + docker exec slurmctld /bin/bash -c \ + "pytest --ignore xorbits/_mars/ --timeout=1500 \ + -W ignore::PendingDeprecationWarning \ + --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/slurm" +} + +function jobqueue_after_script { + docker exec slurmctld bash -c 'sinfo' + docker exec slurmctld bash -c 'squeue' + docker exec slurmctld bash -c 'sacct -l' +} \ No newline at end of file From 8cabb9fa524299a089c3f571b468587b492f438e Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Mon, 16 Oct 2023 07:11:52 +0800 Subject: [PATCH 26/59] t modified: ../CI/slurm.sh modified: ../CI/slurm/start-slurm.sh --- CI/slurm.sh | 13 +++++++++++++ CI/slurm/start-slurm.sh | 14 +++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/CI/slurm.sh b/CI/slurm.sh index 3b919c41b..7386f6fa8 100644 --- a/CI/slurm.sh +++ b/CI/slurm.sh @@ -1,4 +1,17 @@ #!/usr/bin/env bash +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. function jobqueue_before_install { docker version diff --git a/CI/slurm/start-slurm.sh b/CI/slurm/start-slurm.sh index 6cdce2db1..f1936b58a 100755 --- a/CI/slurm/start-slurm.sh +++ b/CI/slurm/start-slurm.sh @@ -1,5 +1,17 @@ #!/bin/bash - +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
docker-compose up -d --no-build while [ `./register_cluster.sh 2>&1 | grep "sacctmgr: error" | wc -l` -ne 0 ] From fb6630b3f55980ab7795f413ce6d35df06aa9aca Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Mon, 16 Oct 2023 07:30:13 +0800 Subject: [PATCH 27/59] deleted: .github/workflows/cluster.yaml modified: python/xorbits/deploy/slurm/slurm.py modified: python/xorbits/deploy/slurm/tests/test_slurm.py --- .github/workflows/cluster.yaml | 60 ------------------- python/xorbits/deploy/slurm/slurm.py | 1 - .../xorbits/deploy/slurm/tests/test_slurm.py | 6 ++ 3 files changed, 6 insertions(+), 61 deletions(-) delete mode 100644 .github/workflows/cluster.yaml diff --git a/.github/workflows/cluster.yaml b/.github/workflows/cluster.yaml deleted file mode 100644 index a0b7295ad..000000000 --- a/.github/workflows/cluster.yaml +++ /dev/null @@ -1,60 +0,0 @@ -name: CI - -on: [push, pull_request] - -jobs: - build: - runs-on: ubuntu-latest - timeout-minutes: 30 - defaults: - run: - shell: bash -l {0} - strategy: - fail-fast: false - matrix: - jobqueue: ["slurm"] - - steps: - - name: Cancel previous runs - uses: styfle/cancel-workflow-action@0.7.0 - with: - access_token: ${{ github.token }} - - name: Checkout source - uses: actions/checkout@v2 - - - name: Setup Empty Conda Environment with Mamba - if: matrix.jobqueue == 'none' - uses: conda-incubator/setup-miniconda@v2 - with: - channels: conda-forge - mamba-version: "*" - activate-environment: xorbits - auto-activate-base: false - - - name: Setup xorbits conda environment - if: matrix.jobqueue == 'none' - run: | - mamba env update -f CI/requirements-wheel.txt - mamba list - - - name: Setup Job queuing system - if: matrix.jobqueue != 'none' - run: | - source CI/${{ matrix.jobqueue }}.sh - jobqueue_before_install - - - name: Install xorbits - run: | - source CI/${{ matrix.jobqueue }}.sh - jobqueue_install - - - name: Test - run: | - source CI/${{ matrix.jobqueue }}.sh - jobqueue_script - - - name: Cleanup - if: always() - run: | - source CI/${{ matrix.jobqueue }}.sh - jobqueue_after_script \ No newline at end of file diff --git a/python/xorbits/deploy/slurm/slurm.py b/python/xorbits/deploy/slurm/slurm.py index bd521cadc..145dc34e7 100644 --- a/python/xorbits/deploy/slurm/slurm.py +++ b/python/xorbits/deploy/slurm/slurm.py @@ -19,7 +19,6 @@ import time # Configure logging -logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/python/xorbits/deploy/slurm/tests/test_slurm.py b/python/xorbits/deploy/slurm/tests/test_slurm.py index 9e6562af5..3b7c70529 100644 --- a/python/xorbits/deploy/slurm/tests/test_slurm.py +++ b/python/xorbits/deploy/slurm/tests/test_slurm.py @@ -11,11 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from distutils.spawn import find_executable + +import pytest from .... import init from .... import pandas as pd from .. 
import SLURMCluster +slurm_available = find_executable("sbatch") is not None + def test_header_core_process_memory(): cluster = SLURMCluster(time="00:02:00", processes=4, cores=8, memory="28G") @@ -60,6 +65,7 @@ def test_header_work_outputdir_web(): assert "source activate xorbits" in cluster.commands +@pytest.mark.skipif(not slurm_available, reason="Cannot run without slurm cluster") def test_jobscript(): exp = SLURMCluster( job_name="xorbits", From f7539eebc58a659f70488a33b1a149879cac6c68 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Mon, 16 Oct 2023 07:34:22 +0800 Subject: [PATCH 28/59] t renamed: slurm.sh -> slurm/slurm.sh --- CI/{ => slurm}/slurm.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename CI/{ => slurm}/slurm.sh (100%) diff --git a/CI/slurm.sh b/CI/slurm/slurm.sh similarity index 100% rename from CI/slurm.sh rename to CI/slurm/slurm.sh From 49541bb97b5b900078f8f6c0c8797a2ea08bec08 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Mon, 16 Oct 2023 07:37:55 +0800 Subject: [PATCH 29/59] modified: ../python/xorbits/deploy/slurm/tests/test_slurm.py --- python/xorbits/deploy/slurm/tests/test_slurm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/xorbits/deploy/slurm/tests/test_slurm.py b/python/xorbits/deploy/slurm/tests/test_slurm.py index 3b7c70529..51eb791b9 100644 --- a/python/xorbits/deploy/slurm/tests/test_slurm.py +++ b/python/xorbits/deploy/slurm/tests/test_slurm.py @@ -65,6 +65,7 @@ def test_header_work_outputdir_web(): assert "source activate xorbits" in cluster.commands +# Construct slurm in a docker environment, so this test could only be exec when there is sbatch command supported @pytest.mark.skipif(not slurm_available, reason="Cannot run without slurm cluster") def test_jobscript(): exp = SLURMCluster( From ecc9dcf766b9e536beefa79adc7bf10a97d9c3b5 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Mon, 16 Oct 2023 11:08:53 +0800 Subject: [PATCH 30/59] modified: .github/workflows/python.yaml modified: python/xorbits/deploy/slurm/slurm.py --- .github/workflows/python.yaml | 6 ++--- python/xorbits/deploy/slurm/slurm.py | 40 +++++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 6d252d62b..c9bb110bc 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -251,13 +251,13 @@ jobs: - name: Slurm Setup Job queuing system if: ${{ matrix.module == 'slurm' }} run: | - source CI/${{ matrix.module }}.sh + source CI/slurm/${{ matrix.module }}.sh jobqueue_before_install - name: Slurm Install xorbits if: ${{ matrix.module == 'slurm' }} run: | - source CI/${{ matrix.module }}.sh + source CI/slurm/${{ matrix.module }}.sh jobqueue_install - name: Install on GPU @@ -400,7 +400,7 @@ jobs: - name: Cleanup on slurm if: ${{ matrix.module == 'slurm' }} run: | - source CI/${{ matrix.module }}.sh + source CI/slurm/${{ matrix.module }}.sh jobqueue_after_script - name: Report coverage data diff --git a/python/xorbits/deploy/slurm/slurm.py b/python/xorbits/deploy/slurm/slurm.py index 145dc34e7..3dc333570 100644 --- a/python/xorbits/deploy/slurm/slurm.py +++ b/python/xorbits/deploy/slurm/slurm.py @@ -33,13 +33,46 @@ def __init__( error_path=None, work_dir=None, time=None, - walltime=None, processes=None, cores=None, memory=None, account=None, webport=16379, + **kwargs, ): + """ + The entrance of deploying a SLURM cluster. 
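+        The assembled sbatch script is stored in ``commands`` and submitted when ``run()`` is called.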
+ + Parameters + ---------- + job_name : str, optional + Name of the Slurm job, by default None + num_nodes : int, optional + Number of nodes in the Slurm cluster, by default None + partition_option : str, optional + Request a specific partition for the resource allocation, by default None + load_env : str, optional + Conda Environment to load, by default None + output_path : str, optional + Path for Log output, by default None + error_path : str, optional + Path for Log error, by default None + work_dir : str, optional + Slurm‘s Working directory,the default place to receive the logs and result, by default None + time : str, optional + Minimum time limit on the job allocation, by default None + processes : int, optional + Number of processes, by default None + cores : int, optional + Number of cores, by default None + memory : str, optional + Specify the real memory required per node. Default units are megabytes, by default None + account : str, optional + Charge resources used by this job to specified account, by default None + webport : int, optional + Xorbits' Web port, by default 16379 + If user have some specifics needing for can just follow the slurm interface we add it at the end automatically + """ commands = ["#!/bin/bash"] self.job_name = job_name @@ -47,7 +80,6 @@ def __init__( self.partition_option = partition_option self.output_path = output_path self.work_dir = work_dir - self.walltime = walltime self.time = time self.processes = processes self.cores = cores @@ -67,11 +99,13 @@ def __init__( "cpus-per-task": self.cores, "mem": self.memory, "A": self.account, + **kwargs, } self.commands = None self.web_port = webport for param, value in slurm_params.items(): if value is not None: + # there are two modes of sbatch, one is like --time, the other one is like -A,so i just judge it by using len if len(str(param)) > 1: commands.append(f"#SBATCH --{param}={value}") else: @@ -102,7 +136,7 @@ def __init__( "sleep 300", 'address=http://"${head_node}":"${web_port}"', ] - + # here I give a very long sleep time to avoid when supervisor nodes don't start, and the other node can't find the supervisor node self.commands = "\n".join(commands) def run(self): From c24f086e49d44b367f9ad74d58c2328159ffb1e2 Mon Sep 17 00:00:00 2001 From: xiaoyu <4797136@qq.com> Date: Mon, 16 Oct 2023 11:34:57 +0800 Subject: [PATCH 31/59] modified: doc/source/user_guide/deployment_slurm.rst modified: python/xorbits/deploy/slurm/slurm.py --- doc/source/user_guide/deployment_slurm.rst | 103 +++++++++++++++++++++ python/xorbits/deploy/slurm/slurm.py | 11 ++- 2 files changed, 112 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index 47aa97b41..25d21df58 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -219,3 +219,106 @@ The SLURM script looks like this: python -u test.py --endpoint "${address}" +## Initialization + +To create a `SLURMCluster` instance, you can use the following parameters: + +- `job_name` (str, optional): Name of the Slurm job. +- `num_nodes` (int, optional): Number of nodes in the Slurm cluster. +- `partition_option` (str, optional): Request a specific partition for the resource allocation. +- `load_env` (str, optional): Conda Environment to load. +- `output_path` (str, optional): Path for Log output. +- `error_path` (str, optional): Path for Log error. +- `work_dir` (str, optional): Slurm's Working directory, the default place to receive the logs and results. 
+- `time` (str, optional): Minimum time limit on the job allocation. +- `processes` (int, optional): Number of processes. +- `cores` (int, optional): Number of cores. +- `memory` (str, optional): Specify the real memory required per node. Default units are megabytes. +- `account` (str, optional): Charge resources used by this job to the specified account. +- `webport` (int, optional): Xorbits' Web port. +- `**kwargs`: Additional parameters that can be added using the slurm interface. + +Example usage: + +```python +cluster = SLURMCluster( + job_name="my_job", + num_nodes=4, + partition_option="compute", + load_env="my_env", + output_path="logs/output.log", + error_path="logs/error.log", + work_dir="/path/to/work_dir", + time="1:00:00", + processes=8, + cores=2, + memory="8G", + account="my_account", + webport=16379, + custom_param1="value1", + custom_param2="value2" +) +``` + +## Running the Job + +To submit the job to SLURM, use the `run()` method. Then it will return the address. + +Example usage: + +```python +address = cluster.run() +``` + +## Getting Job Information + +- `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. + +Example usage: + +```python +job_id = cluster.get_job_id() +``` + +- `cancel_job()`: This method cancels the job using the `scancel` command. We have designed a hook so that while the programming cancel, the slurm task will also cancel. + +Example usage: + +```python +cluster.cancel_job(job_id) +``` + +- `update_head_node()`: This method retrieves the head node information from the SLURM job. + +Example usage: + +```python +cluster.update_head_node() +``` + +- `get_job_address(retry_attempts=10, sleep_interval=30)`: This method retrieves the job address after deployment. It retries several times to get the job data. 
+ +Example usage: + +```python +job_address = cluster.get_job_address() +``` + +## Example + +Here's an example of how to use the `SLURMCluster` class: + +```python +import Xorbits +from xorbits.deploy.slurm import SLURMCluster + +test_cluster = SLURMCluster( + job_name="xorbits", + num_nodes=2, + output_path="/shared_space/output.out", + time="00:30:00", + ) +address = test_cluster.run() +xorbits.init(address) +assert (pd.Series([1, 2, 3]).sum()) == "6" +``` \ No newline at end of file diff --git a/python/xorbits/deploy/slurm/slurm.py b/python/xorbits/deploy/slurm/slurm.py index 3dc333570..26c185d0c 100644 --- a/python/xorbits/deploy/slurm/slurm.py +++ b/python/xorbits/deploy/slurm/slurm.py @@ -138,6 +138,7 @@ def __init__( ] # here I give a very long sleep time to avoid when supervisor nodes don't start, and the other node can't find the supervisor node self.commands = "\n".join(commands) + self.sbatch_out = "" def run(self): shell_commands = self.commands @@ -150,7 +151,8 @@ def run(self): if result.returncode == 0: logger.info("Job submitted successfully.") - self.job_id = self.get_job_id(result.stdout) + self.sbatch_out = result.stdout + self.job_id = self.get_job_id() if self.job_id: logger.info(f"Job ID is {self.job_id}.") atexit.register(self.cancel_job) @@ -164,13 +166,18 @@ def run(self): return None - def get_job_id(self, sbatch_output): + def get_job_id(self): + sbatch_output = self.sbatch_out job_id = None for line in sbatch_output.split("\n"): if "Submitted batch job" in line: job_id = line.split(" ")[-1] return job_id + def get_sbatch_out(self): + logging.info(f"getting batch_out:{self.sbatch_out}") + return self.sbatch_out + def cancel_job(self): if self.job_id: logger.info(f"Cancelling job {self.job_id}") From 4797b4d60e0e76d7ed7997605ab4c2d54ff91132 Mon Sep 17 00:00:00 2001 From: liddle rain <57928993+fengsxy@users.noreply.github.com> Date: Wed, 25 Oct 2023 14:44:36 -0700 Subject: [PATCH 32/59] Update python.yaml Signed-off-by: liddle rain <57928993+fengsxy@users.noreply.github.com> --- .github/workflows/python.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index c9bb110bc..58ea20da3 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -96,7 +96,7 @@ jobs: - { os: self-hosted, module: gpu, python-version: 3.9} - { os: ubuntu-latest, module: jax, python-version: 3.9 } - { os: juicefs-ci, module: kubernetes-juicefs, python-version: 3.9 } - - { os: slurm-ci, module: slurm, python-version: 3.9 } + - { os: ubuntu-latest, module: slurm, python-version: 3.9 } - { os: ubuntu-latest, module: datasets, python-version: 3.9 } steps: - name: Check out code From 1a66e1e7c16c4d562094835aad2dd5e86da21c26 Mon Sep 17 00:00:00 2001 From: liddle rain <57928993+fengsxy@users.noreply.github.com> Date: Mon, 30 Oct 2023 00:30:43 -0700 Subject: [PATCH 33/59] Update python.yaml Signed-off-by: liddle rain <57928993+fengsxy@users.noreply.github.com> --- .github/workflows/python.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 58ea20da3..feb75e3d0 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -302,9 +302,7 @@ jobs: docker exec c1 /bin/bash -c "pip install xorbits" docker exec c2 /bin/bash -c "pip install xorbits" docker exec slurmctld /bin/bash -c \ - "pytest --ignore xorbits/_mars/ --timeout=1500 \ - -W ignore::PendingDeprecationWarning \ - 
--cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/slurm" + "pytest /xorbits/python/xorbits/deploy/slurm/tests/test_slurm.py " elif [[ "$MODULE" == "hadoop" ]]; then export WITH_HADOOP="1" export HADOOP_HOME="/usr/local/hadoop" From 6fdd0ac12f14c4fbca40a18680775bd39707ebfa Mon Sep 17 00:00:00 2001 From: liddle rain <57928993+fengsxy@users.noreply.github.com> Date: Mon, 30 Oct 2023 00:34:36 -0700 Subject: [PATCH 34/59] Update .pre-commit-config.yaml Signed-off-by: liddle rain <57928993+fengsxy@users.noreply.github.com> --- .pre-commit-config.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a503d42dd..8bb0192af 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,3 +32,8 @@ repos: - id: codespell exclude: _mars/lib args: [--config, python/setup.cfg] + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.0.0 # Use the sha or tag you want to point at + hooks: + - id: prettier + types_or: [html, javascript] From d2f815df473c076ed34493be90df869c19a0254d Mon Sep 17 00:00:00 2001 From: liddle rain <57928993+fengsxy@users.noreply.github.com> Date: Mon, 30 Oct 2023 00:36:09 -0700 Subject: [PATCH 35/59] Update .pre-commit-config.yaml Signed-off-by: liddle rain <57928993+fengsxy@users.noreply.github.com> --- .pre-commit-config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8bb0192af..c1ed9e8c5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,14 +26,14 @@ repos: additional_dependencies: [tokenize-rt==3.2.0] exclude: _mars args: [--ignore-missing-imports, --follow-imports, skip] + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.0.0 # Use the sha or tag you want to point at + hooks: + - id: prettier + types_or: [html, javascript] - repo: https://github.com/codespell-project/codespell rev: v2.2.6 hooks: - id: codespell exclude: _mars/lib args: [--config, python/setup.cfg] - - repo: https://github.com/pre-commit/mirrors-prettier - rev: v3.0.0 # Use the sha or tag you want to point at - hooks: - - id: prettier - types_or: [html, javascript] From ee3b1a59f69ab3894c4fae895745921096af03b8 Mon Sep 17 00:00:00 2001 From: liddle rain <57928993+fengsxy@users.noreply.github.com> Date: Mon, 30 Oct 2023 00:40:15 -0700 Subject: [PATCH 36/59] Update deployment_slurm.rst Signed-off-by: liddle rain <57928993+fengsxy@users.noreply.github.com> --- doc/source/user_guide/deployment_slurm.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index 25d21df58..181e7c701 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -3,6 +3,7 @@ ================== SLURM deployment ================== +# Script Method If you have access to a SLURM cluster, you can refer to the following guide to run an Xorbits job. Other HPC job schedulers like Torque or LSF are similar. You are recommended to read the :ref:`cluster deployment ` first to know some basic knowledge of a Xorbits cluster. 
@@ -219,6 +220,8 @@ The SLURM script looks like this: python -u test.py --endpoint "${address}" +# Code Method +--- ## Initialization To create a `SLURMCluster` instance, you can use the following parameters: @@ -321,4 +324,4 @@ test_cluster = SLURMCluster( address = test_cluster.run() xorbits.init(address) assert (pd.Series([1, 2, 3]).sum()) == "6" -``` \ No newline at end of file +``` From a26ea0882e7bb5fee5fbf70f1a15ea0ad757faa5 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 08:41:28 -0700 Subject: [PATCH 37/59] Update .gitignore Signed-off-by: liddle rain --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index d2ba47490..27fc759e2 100644 --- a/.gitignore +++ b/.gitignore @@ -150,8 +150,7 @@ static/ doc/source/savefig/ asv/results - .DS_Store #slrm.sh generated sh -python/xorbits/deploy/slurm/tests/slurm.sh \ No newline at end of file +python/xorbits/deploy/slurm/tests/slurm.sh From a12d13605b67fa1ae719fff13afcea7b263564d1 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 08:43:43 -0700 Subject: [PATCH 38/59] Update .gitignore Signed-off-by: liddle rain --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 27fc759e2..a27ce3f69 100644 --- a/.gitignore +++ b/.gitignore @@ -150,6 +150,7 @@ static/ doc/source/savefig/ asv/results + .DS_Store #slrm.sh generated sh From 09515645c6bb0090d5c78722c28c69891d664a22 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 20:47:40 -0700 Subject: [PATCH 39/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 207 ++++++++++----------- 1 file changed, 102 insertions(+), 105 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index 181e7c701..cafa4c394 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -3,7 +3,9 @@ ================== SLURM deployment ================== -# Script Method + +Script Method +********** If you have access to a SLURM cluster, you can refer to the following guide to run an Xorbits job. Other HPC job schedulers like Torque or LSF are similar. You are recommended to read the :ref:`cluster deployment ` first to know some basic knowledge of a Xorbits cluster. @@ -220,108 +222,103 @@ The SLURM script looks like this: python -u test.py --endpoint "${address}" -# Code Method ---- -## Initialization - -To create a `SLURMCluster` instance, you can use the following parameters: - -- `job_name` (str, optional): Name of the Slurm job. -- `num_nodes` (int, optional): Number of nodes in the Slurm cluster. -- `partition_option` (str, optional): Request a specific partition for the resource allocation. -- `load_env` (str, optional): Conda Environment to load. -- `output_path` (str, optional): Path for Log output. -- `error_path` (str, optional): Path for Log error. -- `work_dir` (str, optional): Slurm's Working directory, the default place to receive the logs and results. -- `time` (str, optional): Minimum time limit on the job allocation. -- `processes` (int, optional): Number of processes. -- `cores` (int, optional): Number of cores. -- `memory` (str, optional): Specify the real memory required per node. Default units are megabytes. -- `account` (str, optional): Charge resources used by this job to the specified account. -- `webport` (int, optional): Xorbits' Web port. 
-- `**kwargs`: Additional parameters that can be added using the slurm interface. - -Example usage: - -```python -cluster = SLURMCluster( - job_name="my_job", - num_nodes=4, - partition_option="compute", - load_env="my_env", - output_path="logs/output.log", - error_path="logs/error.log", - work_dir="/path/to/work_dir", - time="1:00:00", - processes=8, - cores=2, - memory="8G", - account="my_account", - webport=16379, - custom_param1="value1", - custom_param2="value2" -) -``` - -## Running the Job - -To submit the job to SLURM, use the `run()` method. Then it will return the address. - -Example usage: - -```python -address = cluster.run() -``` - -## Getting Job Information - -- `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. - -Example usage: - -```python -job_id = cluster.get_job_id() -``` +.. code:: python + + Initialization + ----------------- + + To create a `SLURMCluster` instance, you can use the following parameters: + + - `job_name` (str, optional): Name of the Slurm job. + - `num_nodes` (int, optional): Number of nodes in the Slurm cluster. + - `partition_option` (str, optional): Request a specific partition for the resource allocation. + - `load_env` (str, optional): Conda Environment to load. + - `output_path` (str, optional): Path for Log output. + - `error_path` (str, optional): Path for Log error. + - `work_dir` (str, optional): Slurm's Working directory, the default place to receive the logs and results. + - `time` (str, optional): Minimum time limit on the job allocation. + - `processes` (int, optional): Number of processes. + - `cores` (int, optional): Number of cores. + - `memory` (str, optional): Specify the real memory required per node. Default units are megabytes. + - `account` (str, optional): Charge resources used by this job to the specified account. + - `webport` (int, optional): Xorbits' Web port. + - `**kwargs`: Additional parameters that can be added using the slurm interface. + + Example usage:: + .. code-block:: python + cluster = SLURMCluster( + job_name="my_job", + num_nodes=4, + partition_option="compute", + load_env="my_env", + output_path="logs/output.log", + error_path="logs/error.log", + work_dir="/path/to/work_dir", + time="1:00:00", + processes=8, + cores=2, + memory="8G", + account="my_account", + webport=16379, + custom_param1="value1", + custom_param2="value2" + ) + + .. note:: + Modify the parameters as needed for your specific use case. + + Running the Job + --------------- + + To submit the job to SLURM, use the `run()` method. Then it will return the address. + + Example usage:: + + address = cluster.run() + + Getting Job Information + ----------------------- + + - `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. + + Example usage:: + + job_id = cluster.get_job_id() + + - `cancel_job()`: This method cancels the job using the `scancel` command. We have designed a hook so that while the programming cancel, the slurm task will also cancel. + + Example usage:: + + cluster.cancel_job(job_id) + + - `update_head_node()`: This method retrieves the head node information from the SLURM job. + + Example usage:: + + cluster.update_head_node() + + - `get_job_address(retry_attempts=10, sleep_interval=30)`: This method retrieves the job address after deployment. It retries several times to get the job data. + + Example usage:: + + job_address = cluster.get_job_address() + + Example + ------- + + Here's an example of how to use the `SLURMCluster` class:: + + .. 
code-block:: python + import Xorbits + from xorbits.deploy.slurm import SLURMCluster + + test_cluster = SLURMCluster( + job_name="xorbits", + num_nodes=2, + output_path="/shared_space/output.out", + time="00:30:00", + ) + address = test_cluster.run() + xorbits.init(address) + assert (pd.Series([1, 2, 3]).sum()) == "6" -- `cancel_job()`: This method cancels the job using the `scancel` command. We have designed a hook so that while the programming cancel, the slurm task will also cancel. - -Example usage: - -```python -cluster.cancel_job(job_id) -``` - -- `update_head_node()`: This method retrieves the head node information from the SLURM job. - -Example usage: - -```python -cluster.update_head_node() -``` - -- `get_job_address(retry_attempts=10, sleep_interval=30)`: This method retrieves the job address after deployment. It retries several times to get the job data. - -Example usage: - -```python -job_address = cluster.get_job_address() -``` - -## Example - -Here's an example of how to use the `SLURMCluster` class: - -```python -import Xorbits -from xorbits.deploy.slurm import SLURMCluster - -test_cluster = SLURMCluster( - job_name="xorbits", - num_nodes=2, - output_path="/shared_space/output.out", - time="00:30:00", - ) -address = test_cluster.run() -xorbits.init(address) -assert (pd.Series([1, 2, 3]).sum()) == "6" -``` From 7729af6dc7b1d61eac6b74798782bfb706138d91 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 20:49:07 -0700 Subject: [PATCH 40/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index cafa4c394..d18f154dd 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -222,11 +222,11 @@ The SLURM script looks like this: python -u test.py --endpoint "${address}" -.. code:: python + Initialization ----------------- - + .. code:: python To create a `SLURMCluster` instance, you can use the following parameters: - `job_name` (str, optional): Name of the Slurm job. From df98447b0c1e475baff1be1e59db58e84f5ef303 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 20:52:39 -0700 Subject: [PATCH 41/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 199 +++++++++++---------- 1 file changed, 102 insertions(+), 97 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index d18f154dd..4c93b45d8 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -224,101 +224,106 @@ The SLURM script looks like this: - Initialization - ----------------- - .. code:: python - To create a `SLURMCluster` instance, you can use the following parameters: - - - `job_name` (str, optional): Name of the Slurm job. - - `num_nodes` (int, optional): Number of nodes in the Slurm cluster. - - `partition_option` (str, optional): Request a specific partition for the resource allocation. - - `load_env` (str, optional): Conda Environment to load. - - `output_path` (str, optional): Path for Log output. - - `error_path` (str, optional): Path for Log error. - - `work_dir` (str, optional): Slurm's Working directory, the default place to receive the logs and results. - - `time` (str, optional): Minimum time limit on the job allocation. 
- - `processes` (int, optional): Number of processes. - - `cores` (int, optional): Number of cores. - - `memory` (str, optional): Specify the real memory required per node. Default units are megabytes. - - `account` (str, optional): Charge resources used by this job to the specified account. - - `webport` (int, optional): Xorbits' Web port. - - `**kwargs`: Additional parameters that can be added using the slurm interface. - - Example usage:: - .. code-block:: python - cluster = SLURMCluster( - job_name="my_job", - num_nodes=4, - partition_option="compute", - load_env="my_env", - output_path="logs/output.log", - error_path="logs/error.log", - work_dir="/path/to/work_dir", - time="1:00:00", - processes=8, - cores=2, - memory="8G", - account="my_account", - webport=16379, - custom_param1="value1", - custom_param2="value2" - ) - - .. note:: - Modify the parameters as needed for your specific use case. - - Running the Job - --------------- - - To submit the job to SLURM, use the `run()` method. Then it will return the address. - - Example usage:: - - address = cluster.run() - - Getting Job Information - ----------------------- - - - `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. - - Example usage:: - - job_id = cluster.get_job_id() - - - `cancel_job()`: This method cancels the job using the `scancel` command. We have designed a hook so that while the programming cancel, the slurm task will also cancel. - - Example usage:: - - cluster.cancel_job(job_id) - - - `update_head_node()`: This method retrieves the head node information from the SLURM job. - - Example usage:: - - cluster.update_head_node() - - - `get_job_address(retry_attempts=10, sleep_interval=30)`: This method retrieves the job address after deployment. It retries several times to get the job data. - - Example usage:: - - job_address = cluster.get_job_address() - - Example - ------- - - Here's an example of how to use the `SLURMCluster` class:: - - .. code-block:: python - import Xorbits - from xorbits.deploy.slurm import SLURMCluster - - test_cluster = SLURMCluster( - job_name="xorbits", - num_nodes=2, - output_path="/shared_space/output.out", - time="00:30:00", - ) - address = test_cluster.run() - xorbits.init(address) - assert (pd.Series([1, 2, 3]).sum()) == "6" +.. code:: python + + Initialization + -------------- + + To create an instance of the `SLURMCluster` class, you can use the following parameters: + + - `job_name` (str, optional): Name of the Slurm job. + - `num_nodes` (int, optional): Number of nodes in the Slurm cluster. + - `partition_option` (str, optional): Request a specific partition for resource allocation. + - `load_env` (str, optional): Conda Environment to load. + - `output_path` (str, optional): Path for log output. + - `error_path` (str, optional): Path for log errors. + - `work_dir` (str, optional): Slurm's working directory, the default location for logs and results. + - `time` (str, optional): Minimum time limit for job allocation. + - `processes` (int, optional): Number of processes. + - `cores` (int, optional): Number of cores. + - `memory` (str, optional): Specify the real memory required per node. Default units are megabytes. + - `account` (str, optional): Charge resources used by this job to the specified account. + - `webport` (int, optional): Xorbits' web port. + - `**kwargs`: Additional parameters that can be added using the Slurm interface. 
+ + Example usage:: + + from xorbits.deploy.slurm import SLURMCluster + + cluster = SLURMCluster( + job_name="my_job", + num_nodes=4, + partition_option="compute", + load_env="my_env", + output_path="logs/output.log", + error_path="logs/error.log", + work_dir="/path/to/work_dir", + time="1:00:00", + processes=8, + cores=2, + memory="8G", + account="my_account", + webport=16379, + custom_param1="value1", + custom_param2="value2" + ) + + .. note:: + Modify the parameters as needed for your specific use case. + + Running the Job + --------------- + + To submit the job to SLURM, use the `run()` method. It will return the job's address. + + Example usage:: + + address = cluster.run() + + Getting Job Information + ----------------------- + + - `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. + + Example usage:: + + job_id = cluster.get_job_id() + + - `cancel_job()`: This method cancels the job using the `scancel` command. A hook is designed so that while canceling the program, the Slurm task will also be canceled. + + Example usage:: + + cluster.cancel_job(job_id) + + - `update_head_node()`: This method retrieves the head node information from the SLURM job. + + Example usage:: + + cluster.update_head_node() + + - `get_job_address(retry_attempts=10, sleep_interval=30)`: This method retrieves the job address after deployment. It retries several times to get the job data. + + Example usage:: + + job_address = cluster.get_job_address() + + Example + ------- + + Here's an example of how to use the `SLURMCluster` class:: + + .. code-block:: python + + import pandas as pd + from xorbits.deploy.slurm import SLURMCluster + + test_cluster = SLURMCluster( + job_name="xorbits", + num_nodes=2, + output_path="/shared_space/output.out", + time="00:30:00", + ) + address = test_cluster.run() + xorbits.init(address) + assert pd.Series([1, 2, 3]).sum() == 6 From 8f80e7191df6d8b5a4cea8b37e7059d2c41207f8 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 21:30:36 -0700 Subject: [PATCH 42/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index 4c93b45d8..da8a51da1 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -2,10 +2,12 @@ ================== SLURM deployment -================== +********** Script Method -********** +================== + + If you have access to a SLURM cluster, you can refer to the following guide to run an Xorbits job. Other HPC job schedulers like Torque or LSF are similar. You are recommended to read the :ref:`cluster deployment ` first to know some basic knowledge of a Xorbits cluster. @@ -222,10 +224,8 @@ The SLURM script looks like this: python -u test.py --endpoint "${address}" - - -.. 
code:: python - +Code Method +================== Initialization -------------- From 678a317ae6169de257b825b7aaffd3ab21c665d2 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 21:32:09 -0700 Subject: [PATCH 43/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index da8a51da1..acb31e7b8 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -1,9 +1,10 @@ .. _deployment_slurm: -================== + SLURM deployment ********** +================== Script Method ================== @@ -224,9 +225,12 @@ The SLURM script looks like this: python -u test.py --endpoint "${address}" +================== Code Method ================== - Initialization + + +Initialization -------------- To create an instance of the `SLURMCluster` class, you can use the following parameters: From 5247784667bfecdf913a6df087231e72a2598d63 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 21:35:42 -0700 Subject: [PATCH 44/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index acb31e7b8..152820016 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -250,7 +250,7 @@ Initialization - `webport` (int, optional): Xorbits' web port. - `**kwargs`: Additional parameters that can be added using the Slurm interface. - Example usage:: + .. code-block:: python from xorbits.deploy.slurm import SLURMCluster @@ -289,25 +289,25 @@ Initialization - `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. - Example usage:: + .. code-block:: python job_id = cluster.get_job_id() - `cancel_job()`: This method cancels the job using the `scancel` command. A hook is designed so that while canceling the program, the Slurm task will also be canceled. - Example usage:: + .. code-block:: python cluster.cancel_job(job_id) - `update_head_node()`: This method retrieves the head node information from the SLURM job. - Example usage:: + .. code-block:: python cluster.update_head_node() - `get_job_address(retry_attempts=10, sleep_interval=30)`: This method retrieves the job address after deployment. It retries several times to get the job data. - Example usage:: + .. code-block:: python job_address = cluster.get_job_address() From 57b51b19bb9dd97503eda25150fd34a9ec39dbfe Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 21:41:12 -0700 Subject: [PATCH 45/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index 152820016..847ded8be 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -231,7 +231,7 @@ Code Method Initialization - -------------- +-------------- To create an instance of the `SLURMCluster` class, you can use the following parameters: @@ -275,17 +275,17 @@ Initialization .. note:: Modify the parameters as needed for your specific use case. 
- Running the Job - --------------- +Running the Job +--------------- To submit the job to SLURM, use the `run()` method. It will return the job's address. - Example usage:: + .. code-block:: python address = cluster.run() - Getting Job Information - ----------------------- +Getting Job Information +----------------------- - `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. @@ -311,8 +311,8 @@ Initialization job_address = cluster.get_job_address() - Example - ------- +Example +------- Here's an example of how to use the `SLURMCluster` class:: From 91419c968f21805d7b2887da0c905a460ab0a547 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 21:43:30 -0700 Subject: [PATCH 46/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index 847ded8be..d14c3a94c 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -231,7 +231,7 @@ Code Method Initialization --------------- +~~~~~~~~~~~~~~ To create an instance of the `SLURMCluster` class, you can use the following parameters: @@ -276,7 +276,7 @@ Initialization Modify the parameters as needed for your specific use case. Running the Job ---------------- +~~~~~~~~~~~~~~~ To submit the job to SLURM, use the `run()` method. It will return the job's address. @@ -285,7 +285,7 @@ Running the Job address = cluster.run() Getting Job Information ------------------------ +~~~~~~~~~~~~~~~~~~~~~~~ - `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. @@ -312,7 +312,7 @@ Getting Job Information job_address = cluster.get_job_address() Example -------- +~~~~~~~ Here's an example of how to use the `SLURMCluster` class:: From a0d063366f8d7a52bc799387484da2426712e632 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 22:01:18 -0700 Subject: [PATCH 47/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index d14c3a94c..6ab664a51 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -2,11 +2,11 @@ SLURM deployment -********** +**************** -================== +============= Script Method -================== +============= @@ -225,9 +225,9 @@ The SLURM script looks like this: python -u test.py --endpoint "${address}" -================== +=========== Code Method -================== +=========== Initialization From deabf527451320afd9dc0a7c8b8c8dfb2f4c42d2 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 22:02:43 -0700 Subject: [PATCH 48/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index 6ab664a51..9383a3df0 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -4,7 +4,6 @@ SLURM deployment **************** -============= Script Method ============= @@ -225,7 +224,6 @@ The SLURM script looks like this: python -u test.py --endpoint "${address}" -=========== Code Method 
=========== From e70d2031eddf575315fe90523515e952068c428d Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 22:52:00 -0700 Subject: [PATCH 49/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index 9383a3df0..f546ae545 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -229,7 +229,7 @@ Code Method Initialization -~~~~~~~~~~~~~~ +---------------- To create an instance of the `SLURMCluster` class, you can use the following parameters: @@ -274,7 +274,7 @@ Initialization Modify the parameters as needed for your specific use case. Running the Job -~~~~~~~~~~~~~~~ +---------------- To submit the job to SLURM, use the `run()` method. It will return the job's address. @@ -283,7 +283,7 @@ Running the Job address = cluster.run() Getting Job Information -~~~~~~~~~~~~~~~~~~~~~~~ +---------------- - `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. @@ -310,7 +310,7 @@ Getting Job Information job_address = cluster.get_job_address() Example -~~~~~~~ +---------------- Here's an example of how to use the `SLURMCluster` class:: From 940f331160f5e25e08bbb83985f465cf436b4e4e Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 22:54:56 -0700 Subject: [PATCH 50/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index f546ae545..3a51581db 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -2,13 +2,9 @@ SLURM deployment -**************** - -Script Method ============= - If you have access to a SLURM cluster, you can refer to the following guide to run an Xorbits job. Other HPC job schedulers like Torque or LSF are similar. You are recommended to read the :ref:`cluster deployment ` first to know some basic knowledge of a Xorbits cluster. @@ -38,6 +34,10 @@ The below walkthrough will do the following: 6. After the underlying Xorbits cluster is ready, submit the user-specified task. + +Script Method +-------------- + SLURM script file ~~~~~~~~~~~~~~~~~ @@ -224,12 +224,13 @@ The SLURM script looks like this: python -u test.py --endpoint "${address}" + Code Method -=========== +----------- Initialization ----------------- +~~~~~~~~~~~~~~ To create an instance of the `SLURMCluster` class, you can use the following parameters: @@ -274,7 +275,7 @@ Initialization Modify the parameters as needed for your specific use case. Running the Job ----------------- +~~~~~~~~~~~~~~~ To submit the job to SLURM, use the `run()` method. It will return the job's address. 
@@ -310,7 +311,7 @@ Getting Job Information job_address = cluster.get_job_address() Example ----------------- +~~~~~~~ Here's an example of how to use the `SLURMCluster` class:: From 6a931e5692eb91b3df9c6d969d673ab577a72518 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 23:05:34 -0700 Subject: [PATCH 51/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 45 +++++++++++----------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index 3a51581db..f47a59110 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -173,7 +173,7 @@ Name this SLURM script file as ``xorbits_slurm.sh``. Submit the job via: Put all together ----------------- +~~~~~~~~~~~~~~~~~~~~~~ The SLURM script looks like this: @@ -249,8 +249,8 @@ Initialization - `webport` (int, optional): Xorbits' web port. - `**kwargs`: Additional parameters that can be added using the Slurm interface. - .. code-block:: python +.. code-block:: python from xorbits.deploy.slurm import SLURMCluster cluster = SLURMCluster( @@ -271,51 +271,52 @@ Initialization custom_param2="value2" ) - .. note:: - Modify the parameters as needed for your specific use case. +.. note:: + Modify the parameters as needed for your specific use case. Running the Job ~~~~~~~~~~~~~~~ - To submit the job to SLURM, use the `run()` method. It will return the job's address. +To submit the job to SLURM, use the `run()` method. It will return the job's address. - .. code-block:: python +.. code-block:: python - address = cluster.run() + address = cluster.run() Getting Job Information ----------------- +---------------------- - - `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. +- `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. - .. code-block:: python +.. code-block:: python - job_id = cluster.get_job_id() +job_id = cluster.get_job_id() - - `cancel_job()`: This method cancels the job using the `scancel` command. A hook is designed so that while canceling the program, the Slurm task will also be canceled. +- `cancel_job()`: This method cancels the job using the `scancel` command. A hook is designed so that while canceling the program, the Slurm task will also be canceled. - .. code-block:: python +.. code-block:: python - cluster.cancel_job(job_id) +cluster.cancel_job(job_id) - - `update_head_node()`: This method retrieves the head node information from the SLURM job. +- `update_head_node()`: This method retrieves the head node information from the SLURM job. - .. code-block:: python +.. code-block:: python + + cluster.update_head_node() - cluster.update_head_node() +- `get_job_address(retry_attempts=10, sleep_interval=30)`: This method retrieves the job address after deployment. It retries several times to get the job data. - - `get_job_address(retry_attempts=10, sleep_interval=30)`: This method retrieves the job address after deployment. It retries several times to get the job data. +.. code-block:: python - .. code-block:: python + job_address = cluster.get_job_address() - job_address = cluster.get_job_address() Example ~~~~~~~ - Here's an example of how to use the `SLURMCluster` class:: +Here's an example of how to use the `SLURMCluster` class:: - .. code-block:: python +.. 
code-block:: python import pandas as pd from xorbits.deploy.slurm import SLURMCluster From e89053d8a670a2f862a24a115283a53a155c0cb2 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 23:07:48 -0700 Subject: [PATCH 52/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index f47a59110..f827b8363 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -284,7 +284,8 @@ To submit the job to SLURM, use the `run()` method. It will return the job's add address = cluster.run() Getting Job Information ----------------------- +~~~~~~~~~~~~~~~~~~~~~~~~ + - `get_job_id()`: This method extracts the job ID from the output of the `sbatch` command. From fbd4e02a90f576d795c4c57afa92e067543c6c06 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 23:10:37 -0700 Subject: [PATCH 53/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 23 +++++++++++----------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index f827b8363..ea753f31c 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -232,7 +232,7 @@ Code Method Initialization ~~~~~~~~~~~~~~ - To create an instance of the `SLURMCluster` class, you can use the following parameters: +To create an instance of the `SLURMCluster` class, you can use the following parameters: - `job_name` (str, optional): Name of the Slurm job. - `num_nodes` (int, optional): Number of nodes in the Slurm cluster. @@ -251,9 +251,8 @@ Initialization .. code-block:: python - from xorbits.deploy.slurm import SLURMCluster - - cluster = SLURMCluster( + from xorbits.deploy.slurm import SLURMCluster + cluster = SLURMCluster( job_name="my_job", num_nodes=4, partition_option="compute", @@ -269,7 +268,8 @@ Initialization webport=16379, custom_param1="value1", custom_param2="value2" - ) + ) + .. note:: Modify the parameters as needed for your specific use case. @@ -318,17 +318,16 @@ Example Here's an example of how to use the `SLURMCluster` class:: .. code-block:: python + import pandas as pd + from xorbits.deploy.slurm import SLURMCluster - import pandas as pd - from xorbits.deploy.slurm import SLURMCluster - - test_cluster = SLURMCluster( + test_cluster = SLURMCluster( job_name="xorbits", num_nodes=2, output_path="/shared_space/output.out", time="00:30:00", ) - address = test_cluster.run() - xorbits.init(address) - assert pd.Series([1, 2, 3]).sum() == 6 + address = test_cluster.run() + xorbits.init(address) + assert pd.Series([1, 2, 3]).sum() == 6 From b89d7b2b0049c9aac7d9e2bfb9b65440e93054d2 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 23:14:39 -0700 Subject: [PATCH 54/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index ea753f31c..963b71c96 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -291,13 +291,13 @@ Getting Job Information .. 
code-block:: python -job_id = cluster.get_job_id() + job_id = cluster.get_job_id() - `cancel_job()`: This method cancels the job using the `scancel` command. A hook is designed so that while canceling the program, the Slurm task will also be canceled. .. code-block:: python -cluster.cancel_job(job_id) + cluster.cancel_job(job_id) - `update_head_node()`: This method retrieves the head node information from the SLURM job. @@ -318,6 +318,7 @@ Example Here's an example of how to use the `SLURMCluster` class:: .. code-block:: python + import pandas as pd from xorbits.deploy.slurm import SLURMCluster From 2bb54b3975b218ca892174e692eb71ab886bba63 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 23:19:22 -0700 Subject: [PATCH 55/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index 963b71c96..7b6111047 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -251,6 +251,7 @@ To create an instance of the `SLURMCluster` class, you can use the following par .. code-block:: python + from xorbits.deploy.slurm import SLURMCluster cluster = SLURMCluster( job_name="my_job", From d0bc341ca14e6e06b001a0eab3cbf01fc506fe22 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 23:23:48 -0700 Subject: [PATCH 56/59] Update deployment_slurm.rst Signed-off-by: liddle rain From 52dc1240c041066917f850514f17d29838dd1b56 Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 23:24:59 -0700 Subject: [PATCH 57/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index 7b6111047..dc2d5fe79 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -316,7 +316,7 @@ Getting Job Information Example ~~~~~~~ -Here's an example of how to use the `SLURMCluster` class:: +Here's an example of how to use the `SLURMCluster` class .. code-block:: python From e390033cc40995612380b19fed36764a7e5a420f Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 23:26:28 -0700 Subject: [PATCH 58/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index dc2d5fe79..f1ada3b9c 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -316,6 +316,7 @@ Getting Job Information Example ~~~~~~~ + Here's an example of how to use the `SLURMCluster` class .. code-block:: python From a0da96245fd61c8af142198e63c77bb0edf6a19b Mon Sep 17 00:00:00 2001 From: liddle rain Date: Mon, 30 Oct 2023 23:34:15 -0700 Subject: [PATCH 59/59] Update deployment_slurm.rst Signed-off-by: liddle rain --- doc/source/user_guide/deployment_slurm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/deployment_slurm.rst b/doc/source/user_guide/deployment_slurm.rst index f1ada3b9c..491c0e22c 100644 --- a/doc/source/user_guide/deployment_slurm.rst +++ b/doc/source/user_guide/deployment_slurm.rst @@ -310,7 +310,7 @@ Getting Job Information .. 
code-block:: python - job_address = cluster.get_job_address() + job_address = cluster.get_job_address(retry_attempts=10, sleep_interval=30) Example
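
The API walked through in the patches above (``SLURMCluster``, ``run()``, ``get_job_id()``, ``cancel_job()``, ``get_job_address()``) lends itself to one end-to-end snippet. The sketch below restates the final documented example with two details made explicit rather than assumed: ``xorbits`` itself is imported before ``xorbits.init(address)`` is called, and the allocation is released through the documented ``get_job_id()``/``cancel_job()`` pair once the check has run. Treat it as a sketch against the API as documented in this series, not a verbatim excerpt of the merged code, so exact defaults and return types may differ.

.. code-block:: python

    import pandas as pd
    import xorbits  # needed for xorbits.init(); the documented example omits this import
    from xorbits.deploy.slurm import SLURMCluster

    # Ask SLURM for two nodes for half an hour and write logs to the shared filesystem.
    cluster = SLURMCluster(
        job_name="xorbits",
        num_nodes=2,
        output_path="/shared_space/output.out",
        time="00:30:00",
    )

    address = cluster.run()   # submits the generated sbatch script and returns the cluster address
    xorbits.init(address)     # connect this client to the freshly deployed cluster

    try:
        # pandas' sum() yields the integer 6, so compare against an int, not the string "6".
        assert pd.Series([1, 2, 3]).sum() == 6
    finally:
        # The documented hook also cancels the SLURM job when the program exits;
        # an explicit cancel keeps the walkthrough self-contained.
        cluster.cancel_job(cluster.get_job_id())

Swapping ``import pandas as pd`` for ``import xorbits.pandas as pd`` would make the same assertion run against the deployed cluster rather than the local pandas installation, which is likely the more representative smoke test.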