From f61ec690651f4283b6c871701415e783e43ab557 Mon Sep 17 00:00:00 2001
From: kaushik
Date: Thu, 26 Sep 2024 10:53:35 -0500
Subject: [PATCH] more examples and launch_copper.sh for aurora

---
 CMakeLists.txt                               |  3 +-
 README.md                                    | 22 ++--
 example/withcopper_aurora_job_script.sh      | 93 -------------------
 example/withoutcopper_aurora_job_script.sh   | 33 -------
 examples/example1/simple_with_copper.sh      | 58 ++++++++++++
 examples/example1/simple_without_copper.sh   | 28 ++++++
 {example => examples/example2}/real_app.py   |  3 -
 .../example2/withcopper_aurora_job_script.sh | 57 ++++++++++++
 .../withoutcopper_aurora_job_script.sh       | 26 ++++++
 examples/example3/launch_copper.sh           | 31 +++++++
 examples/example3/simple_with_copper.sh      | 31 +++++++
 examples/example3/simple_without_copper.sh   | 28 ++++++
 scripts/launch_copper.sh                     | 31 +++++++
 13 files changed, 299 insertions(+), 145 deletions(-)
 delete mode 100644 example/withcopper_aurora_job_script.sh
 delete mode 100644 example/withoutcopper_aurora_job_script.sh
 create mode 100644 examples/example1/simple_with_copper.sh
 create mode 100644 examples/example1/simple_without_copper.sh
 rename {example => examples/example2}/real_app.py (92%)
 create mode 100644 examples/example2/withcopper_aurora_job_script.sh
 create mode 100644 examples/example2/withoutcopper_aurora_job_script.sh
 create mode 100644 examples/example3/launch_copper.sh
 create mode 100644 examples/example3/simple_with_copper.sh
 create mode 100644 examples/example3/simple_without_copper.sh
 create mode 100644 scripts/launch_copper.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb907bef..e64ff402 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,4 +75,5 @@ set(SHUTDOWN_SOURCES src/copper/rpc_shutdown.cpp)
 add_executable(${SHUTDOWN_PROJECT_NAME} ${SHUTDOWN_SOURCES})
 target_link_libraries(${SHUTDOWN_PROJECT_NAME} PRIVATE PkgConfig::MARGO PkgConfig::THALLIUM)
 
-install(TARGETS ${PROJECT_NAME} DESTINATION bin)
\ No newline at end of file
+install(TARGETS ${PROJECT_NAME} DESTINATION bin)
+install(FILES scripts/launch_copper.sh DESTINATION bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
diff --git a/README.md b/README.md
index cdca2047..d9a59163 100644
--- a/README.md
+++ b/README.md
@@ -28,19 +28,17 @@ More documentation can be found here: [readthedocs](https://alcf-copper-docs.rea
 
 ### How to load the copper package on Aurora
 
 ```bash
-module load spack-pe-oneapi copper
+module load copper
 ```
 
 ### How to start the copper service
 
 ```bash
-CUPATH=/lus/flare/projects/Aurora_deployment/kaushik/copper-spack-recipe/gitrepos/copper/build
+CUPATH=$COPPER_ROOT/bin/cu_fuse # If you are building copper on your own, set this path to your cu_fuse binary
 LOGDIR=~/copper-logs/${PBS_JOBID}
 CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
-rm -rf ~/copper_logs*
 mkdir -p ${LOGDIR}
-clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}"
 clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}"
 
 read -r -d '' CMD << EOM
@@ -49,7 +47,7 @@ read -r -d '' CMD << EOM
     -tpath / # / will be mounted under CU_FUSE_MNT_VIEWDIR
     -vpath ${CU_FUSE_MNT_VIEWDIR} # To provide the fuse mounted location
     -log_output_dir ${LOGDIR} # To provide where the copper logs will be stored
-    -log_level 6 # To provide the level of copper logging
+    -log_level 6 # To provide the level of copper logging (6 = most verbose, 0 = least)
     -log_type file # To direct logging to file / stdout / both
     -net_type cxi # To provide the network protocol
     -nf ${PBS_NODEFILE} # To provide the hostlist where cu_fuse will be mounted
@@ -68,16 +66,10 @@ clush --hostfile ${PBS_NODEFILE} $CMD # To start copper on all the com
 ### How to run your app with copper
 
 ```bash
-RANKS_PER_NODE=12
-NRANKS=$(( NNODES * RANKS_PER_NODE ))
-echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"
-module use /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024
-module load frameworks/2024.1
-conda deactivate
-conda activate ${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 #Start conda with the full copper path instead of the standard path
-which python
-CPU_BINDING=list:4:9:14:19:20:25:56:61:66:71:74:79
-time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=${CPU_BINDING} --genvall --genv=PYTHONPATH=${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 python3 real_app.py
+
+time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=${CPU_BINDING} --genvall \
+    --genv=PYTHONPATH=${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env \
+    python3 -c "import torch; print(torch.__file__)"
 ```
 
 ### How to stop the copper service
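The README hunk above cuts off right at the "How to stop the copper service" heading. For reference, a minimal teardown sketch that mirrors the cleanup commands used by the job scripts in this patch (it reuses the same ${PBS_NODEFILE}, clush, and ${CU_FUSE_MNT_VIEWDIR} as above; illustrative only, not part of the patch):

```bash
# Unmount the cu_fuse view and remove the per-node mount directory on every compute node.
CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
clush --hostfile ${PBS_NODEFILE} "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}"
clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}"
```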
diff --git a/example/withcopper_aurora_job_script.sh b/example/withcopper_aurora_job_script.sh
deleted file mode 100644
index fe79fdb9..00000000
--- a/example/withcopper_aurora_job_script.sh
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/bin/bash -x
-#PBS -l select=512
-#PBS -l walltime=02:00:00
-#PBS -A Aurora_deployment
-#PBS -q lustre_scaling
-#PBS -k doe
-
-# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./pbs-script.sh or - I
-
-# This example shows loading python modules from a lustre directory with using copper.
-
-export TZ='/usr/share/zoneinfo/US/Central'
-date
-cd $PBS_O_WORKDIR
-echo Jobid: $PBS_JOBID
-echo Running on nodes `cat $PBS_NODEFILE`
-
-
-# starting copper section
-module load copper
-CUPATH=$COPPER_ROOT/bin/cu_fuse
-# CUPATH=/lus/flare/projects/Aurora_deployment/kaushik/copper-spack-recipe/gitrepos/copper/build/cu_fuse
-
-NNODES=`wc -l < $PBS_NODEFILE`
-RANKS_PER_NODE=1
-NRANKS=$(( NNODES * RANKS_PER_NODE ))
-echo "Copper running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"
-LOGDIR=~/copper-logs/${PBS_JOBID}
-rm -rf ~/copper_logs*
-mkdir -p ${LOGDIR}
-CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
-clush --hostfile ${PBS_NODEFILE} "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}"
-clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}"
-clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}"
-
-export FI_MR_ZE_CACHE_MONITOR_ENABLED=0
-export FI_MR_CACHE_MONITOR=disabled
-export FI_CXI_RX_MATCH_MODE=hybrid
-export FI_CXI_DEFAULT_CQ_SIZE=1048576
-export FI_CXI_CQ_FILL_PERCENT=30
-export MPI_PROVIDER=$FI_PROVIDER
-unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
-unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
-export PALS_PING_PERIOD=240
-export PALS_RPC_TIMEOUT=240
-
-
-
-read -r -d '' CMD << EOM
-    numactl --physcpubind="0-3"
-    $CUPATH
-    -tpath /
-    -vpath ${CU_FUSE_MNT_VIEWDIR}
-    -log_level 6
-    -log_type file
-    -log_output_dir ${LOGDIR}
-    -net_type cxi
-    -trees 1
-    -nf ${PBS_NODEFILE}
-    -max_cacheable_byte_size $((10*1024*1024))
-    -s ${CU_FUSE_MNT_VIEWDIR}
-EOM
-# check with and without -f - running cu fuse on foreground vs background
-clush --hostfile ${PBS_NODEFILE} $CMD
-sleep 120s # Preferred to give sometime for copper service to be started on all nodes.
-ls ${CU_FUSE_MNT_VIEWDIR}
-
-
-
-# App section
-RANKS_PER_NODE=12
-NRANKS=$(( NNODES * RANKS_PER_NODE ))
-echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"
-date
-module use /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024
-date
-module load frameworks/2024.1
-date
-conda deactivate
-conda activate ${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024
-which python
-CPU_BINDING=list:4:9:14:19:20:25:56:61:66:71:74:79
-
-date
-time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=${CPU_BINDING} --genvall --genv=PYTHONPATH=${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 python3 real_app.py
-date
-
-# clean up copper
-conda deactivate
-clush --hostfile ${PBS_NODEFILE} "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}"
-clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}"
-export UID=$(id -u $USER)
-clush --hostfile ${PBS_NODEFILE} "pkill -U $UID"
diff --git a/example/withoutcopper_aurora_job_script.sh b/example/withoutcopper_aurora_job_script.sh
deleted file mode 100644
index 18125c6f..00000000
--- a/example/withoutcopper_aurora_job_script.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash -x
-#PBS -l select=512
-#PBS -l walltime=02:00:00
-#PBS -A Aurora_deployment
-#PBS -q lustre_scaling
-#PBS -k doe
-
-# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./pbs-script.sh or - I
-
-# This example shows loading python modules from a lustre directory the standard way without using copper.
-
-export TZ='/usr/share/zoneinfo/US/Central'
-date
-cd $PBS_O_WORKDIR
-echo Jobid: $PBS_JOBID
-echo Running on nodes `cat $PBS_NODEFILE`
-
-NNODES=`wc -l < $PBS_NODEFILE`
-RANKS_PER_NODE=12
-NRANKS=$(( NNODES * RANKS_PER_NODE ))
-echo "NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"
-date
-module use /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024
-date
-module load frameworks/2024.1
-date
-conda deactivate
-conda activate /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024
-which python
-CPU_BINDING=list:4:9:14:19:20:25:56:61:66:71:74:79
-date
-time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=${CPU_BINDING} python3 real_app.py
-date
diff --git a/examples/example1/simple_with_copper.sh b/examples/example1/simple_with_copper.sh
new file mode 100644
index 00000000..cb06ae3d
--- /dev/null
+++ b/examples/example1/simple_with_copper.sh
@@ -0,0 +1,58 @@
+#!/bin/bash -x
+#PBS -l select=512
+#PBS -l walltime=02:00:00
+#PBS -A Aurora_deployment
+#PBS -q lustre_scaling
+#PBS -k doe
+
+# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withcopper_aurora_job_script.sh # or -I
+
+# This example shows loading python modules from a lustre directory using copper.
+
+cd $PBS_O_WORKDIR
+echo Jobid: $PBS_JOBID
+echo Running on nodes `cat $PBS_NODEFILE`
+
+# starting copper section
+
+module load copper
+CUPATH=$COPPER_ROOT/bin/cu_fuse # If you are building copper on your own, set this path to your cu_fuse binary
+LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov}
+mkdir -p ${LOGDIR} # only on the head node
+CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
+clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes
+
+read -r -d '' CMD << EOM
+    numactl --physcpubind="0-3"
+    $CUPATH
+    -tpath /
+    -vpath ${CU_FUSE_MNT_VIEWDIR}
+    -log_level 6
+    -log_type file
+    -log_output_dir ${LOGDIR}
+    -net_type cxi
+    -trees 1
+    -nf ${PBS_NODEFILE}
+    -max_cacheable_byte_size $((10*1024*1024))
+    -s ${CU_FUSE_MNT_VIEWDIR}
+EOM
+
+clush --hostfile ${PBS_NODEFILE} $CMD
+sleep 20s # add 60s if you are running on more than 2k nodes
+
+# end copper section
+
+
+NNODES=`wc -l < $PBS_NODEFILE`
+RANKS_PER_NODE=12
+NRANKS=$(( NNODES * RANKS_PER_NODE ))
+echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"
+
+# The below 2 lines are only for first-time setup, to install a package into a custom dir. Do not run them in this job script.
+# module load python
+# pip install --target=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env numpy
+
+
+time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \
+    --genv=PYTHONPATH=${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env \
+    python3 -c "import torch; print(torch.__file__)"
diff --git a/examples/example1/simple_without_copper.sh b/examples/example1/simple_without_copper.sh
new file mode 100644
index 00000000..11d32c86
--- /dev/null
+++ b/examples/example1/simple_without_copper.sh
@@ -0,0 +1,28 @@
+#!/bin/bash -x
+#PBS -l select=512
+#PBS -l walltime=02:00:00
+#PBS -A Aurora_deployment
+#PBS -q lustre_scaling
+#PBS -k doe
+
+# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withcopper_aurora_job_script.sh # or -I
+
+# This example shows loading python modules from a lustre directory without using copper.
+
+cd $PBS_O_WORKDIR
+echo Jobid: $PBS_JOBID
+echo Running on nodes `cat $PBS_NODEFILE`
+
+NNODES=`wc -l < $PBS_NODEFILE`
+RANKS_PER_NODE=12
+NRANKS=$(( NNODES * RANKS_PER_NODE ))
+echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"
+
+# The below 2 lines are only for first-time setup, to install a package into a custom dir. Do not run them in this job script.
+# module load python
+# pip install --target=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env numpy
+
+
+time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \
+    --genv=PYTHONPATH=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env \
+    python3 -c "import torch; print(torch.__file__)"
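The example1 pair differs only in where PYTHONPATH points: the same pip --target directory on Lustre, reached either directly or through the copper FUSE view. Because cu_fuse mounts / under ${CU_FUSE_MNT_VIEWDIR}, the general rule is to prepend the view directory to any absolute path whose I/O should go through copper. A minimal sketch of that prefixing (illustrative, not part of the patch; reuses the paths from the scripts above):

```bash
CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
ORIG_PKG_DIR=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env

# Any absolute path becomes reachable under the copper view by simple prefixing.
COPPER_PKG_DIR=${CU_FUSE_MNT_VIEWDIR}${ORIG_PKG_DIR}
export PYTHONPATH=${COPPER_PKG_DIR}:${PYTHONPATH}
```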
diff --git a/example/real_app.py b/examples/example2/real_app.py
similarity index 92%
rename from example/real_app.py
rename to examples/example2/real_app.py
index dd0f2351..511331fb 100644
--- a/example/real_app.py
+++ b/examples/example2/real_app.py
@@ -26,6 +26,3 @@ import tensorflow.config.experimental
 from pathlib import Path
 
 # print(MPI.__file__)
-# print(np.__file__)
-# print(K.__file__)
-# print(pd.__file__)
diff --git a/examples/example2/withcopper_aurora_job_script.sh b/examples/example2/withcopper_aurora_job_script.sh
new file mode 100644
index 00000000..0e3522cc
--- /dev/null
+++ b/examples/example2/withcopper_aurora_job_script.sh
@@ -0,0 +1,57 @@
+#!/bin/bash -x
+#PBS -l select=512
+#PBS -l walltime=02:00:00
+#PBS -A Aurora_deployment
+#PBS -q lustre_scaling
+#PBS -k doe
+
+# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withcopper_aurora_job_script.sh # or -I
+
+# This example shows loading python modules from a lustre directory using copper.
+
+cd $PBS_O_WORKDIR
+echo Jobid: $PBS_JOBID
+echo Running on nodes `cat $PBS_NODEFILE`
+
+# starting copper section
+module load copper
+CUPATH=$COPPER_ROOT/bin/cu_fuse # If you are building copper on your own, set this path to your cu_fuse binary
+LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov}
+mkdir -p ${LOGDIR} # only on the head node
+CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
+clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes
+
+read -r -d '' CMD << EOM
+    numactl --physcpubind="0-3"
+    $CUPATH
+    -tpath /
+    -vpath ${CU_FUSE_MNT_VIEWDIR}
+    -log_level 6
+    -log_type file
+    -log_output_dir ${LOGDIR}
+    -net_type cxi
+    -trees 1
+    -nf ${PBS_NODEFILE}
+    -max_cacheable_byte_size $((10*1024*1024))
+    -s ${CU_FUSE_MNT_VIEWDIR}
+EOM
+
+clush --hostfile ${PBS_NODEFILE} $CMD
+sleep 20s # add 60s if you are running on more than 2k nodes
+# end copper section
+
+# App section
+NNODES=`wc -l < $PBS_NODEFILE`
+RANKS_PER_NODE=12
+NRANKS=$(( NNODES * RANKS_PER_NODE ))
+echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"
+
+module use /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024
+module load frameworks/2024.1 # This will start your conda environment at /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024
+conda deactivate
+conda activate ${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 # start conda through the copper path
+which python
+
+time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \
+    --genv=PYTHONPATH=${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 \
+    python3 real_app.py
diff --git a/examples/example2/withoutcopper_aurora_job_script.sh b/examples/example2/withoutcopper_aurora_job_script.sh
new file mode 100644
index 00000000..3217a2fd
--- /dev/null
+++ b/examples/example2/withoutcopper_aurora_job_script.sh
@@ -0,0 +1,26 @@
+#!/bin/bash -x
+#PBS -l select=512
+#PBS -l walltime=02:00:00
+#PBS -A Aurora_deployment
+#PBS -q lustre_scaling
+#PBS -k doe
+
+# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withoutcopper_aurora_job_script.sh # or -I
+
+# This example shows loading python modules from a lustre directory (the standard way) without using copper.
+
+cd $PBS_O_WORKDIR
+echo Jobid: $PBS_JOBID
+echo Running on nodes `cat $PBS_NODEFILE`
+NNODES=`wc -l < $PBS_NODEFILE`
+RANKS_PER_NODE=12
+NRANKS=$(( NNODES * RANKS_PER_NODE ))
+echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"
+
+module use /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 # This is the location of the cloned copy of your custom conda environment on lustre
+module load frameworks/2024.1 # This will start your conda environment at /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024
+which python
+
+time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \
+    --genv=PYTHONPATH=/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 \
+    python3 real_app.py
diff --git a/examples/example3/launch_copper.sh b/examples/example3/launch_copper.sh
new file mode 100644
index 00000000..c0b322aa
--- /dev/null
+++ b/examples/example3/launch_copper.sh
@@ -0,0 +1,31 @@
+#!/bin/bash -x
+
+module load copper
+CUPATH=$COPPER_ROOT/bin/cu_fuse
+
+rm -rf ~/copper_logs*
+LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov}
+mkdir -p ${LOGDIR} # only on the head node
+
+CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
+clush --hostfile ${PBS_NODEFILE} "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}"
+clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}"
+clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes
+
+read -r -d '' CMD << EOM
+    numactl --physcpubind="0-3"
+    $CUPATH
+    -tpath /
+    -vpath ${CU_FUSE_MNT_VIEWDIR}
+    -log_level 6
+    -log_type file
+    -log_output_dir ${LOGDIR}
+    -net_type cxi
+    -trees 1
+    -nf ${PBS_NODEFILE}
+    -max_cacheable_byte_size $((10*1024*1024))
+    -s ${CU_FUSE_MNT_VIEWDIR}
+EOM
+
+clush --hostfile ${PBS_NODEFILE} $CMD
+sleep 20s # add 60s if you are running on more than 2k nodes
diff --git a/examples/example3/simple_with_copper.sh b/examples/example3/simple_with_copper.sh
new file mode 100644
index 00000000..61286dff
--- /dev/null
+++ b/examples/example3/simple_with_copper.sh
@@ -0,0 +1,31 @@
+#!/bin/bash -x
+#PBS -l select=512
+#PBS -l walltime=02:00:00
+#PBS -A Aurora_deployment
+#PBS -q lustre_scaling
+#PBS -k doe
+
+# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withcopper_aurora_job_script.sh # or -I
+
+# This example shows loading python modules from a lustre directory using copper.
+
+cd $PBS_O_WORKDIR
+echo Jobid: $PBS_JOBID
+echo Running on nodes `cat $PBS_NODEFILE`
+
+launch_copper.sh
+# Prepend /tmp/${USER}/copper/ to all your absolute paths if you want your I/O to go through copper (including PYTHONPATH, CONDA_PREFIX, CONDA_ROOT and PATH)
+
+NNODES=`wc -l < $PBS_NODEFILE`
+RANKS_PER_NODE=12
+NRANKS=$(( NNODES * RANKS_PER_NODE ))
+echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"
+
+# The below 2 lines are only for first-time setup, to install a package into a custom dir. Do not run them in this job script.
+# module load python
+# pip install --target=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env numpy
+
+
+time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \
+    --genv=PYTHONPATH=/tmp/${USER}/copper/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env \
+    python3 -c "import torch; print(torch.__file__)"
diff --git a/examples/example3/simple_without_copper.sh b/examples/example3/simple_without_copper.sh
new file mode 100644
index 00000000..02f89bab
--- /dev/null
+++ b/examples/example3/simple_without_copper.sh
@@ -0,0 +1,28 @@
+#!/bin/bash -x
+#PBS -l select=512
+#PBS -l walltime=02:00:00
+#PBS -A Aurora_deployment
+#PBS -q lustre_scaling
+#PBS -k doe
+
+# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withcopper_aurora_job_script.sh # or -I
+
+# This example shows loading python modules from a lustre directory without using copper.
+
+cd $PBS_O_WORKDIR
+echo Jobid: $PBS_JOBID
+echo Running on nodes `cat $PBS_NODEFILE`
+
+NNODES=`wc -l < $PBS_NODEFILE`
+RANKS_PER_NODE=12
+NRANKS=$(( NNODES * RANKS_PER_NODE ))
+echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"
+
+# The below 2 lines are only for first-time setup, to install a package into a custom dir. Do not run them in this job script.
+# module load python
+# pip install --target=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env numpy
+
+
+time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \
+    --genv=PYTHONPATH=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env \
+    python3 -c "import torch; print(torch.__file__)"
diff --git a/scripts/launch_copper.sh b/scripts/launch_copper.sh
new file mode 100644
index 00000000..c0b322aa
--- /dev/null
+++ b/scripts/launch_copper.sh
@@ -0,0 +1,31 @@
+#!/bin/bash -x
+
+module load copper
+CUPATH=$COPPER_ROOT/bin/cu_fuse
+
+rm -rf ~/copper_logs*
+LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov}
+mkdir -p ${LOGDIR} # only on the head node
+
+CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
+clush --hostfile ${PBS_NODEFILE} "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}"
+clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}"
+clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes
+
+read -r -d '' CMD << EOM
+    numactl --physcpubind="0-3"
+    $CUPATH
+    -tpath /
+    -vpath ${CU_FUSE_MNT_VIEWDIR}
+    -log_level 6
+    -log_type file
+    -log_output_dir ${LOGDIR}
+    -net_type cxi
+    -trees 1
+    -nf ${PBS_NODEFILE}
+    -max_cacheable_byte_size $((10*1024*1024))
+    -s ${CU_FUSE_MNT_VIEWDIR}
+EOM
+
+clush --hostfile ${PBS_NODEFILE} $CMD
+sleep 20s # add 60s if you are running on more than 2k nodes
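Both copies of launch_copper.sh end right after starting the service on every node. Before launching the app, it can help to confirm the view is actually mounted everywhere, along the lines of the `ls ${CU_FUSE_MNT_VIEWDIR}` step in the older example script this patch removes. A minimal check (illustrative, not part of the patch; assumes the same clush, ${PBS_NODEFILE}, and mount/log paths as the scripts above):

```bash
# Confirm the cu_fuse view responds on every compute node before starting the app.
CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
clush --hostfile ${PBS_NODEFILE} "ls ${CU_FUSE_MNT_VIEWDIR} > /dev/null && echo copper mount OK || echo copper mount MISSING"

# Copper log output goes under the LOGDIR set in launch_copper.sh.
ls ~/copper-logs/
```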