
more examples and launch_copper.sh for aurora
kaushikvelusamy committed Sep 26, 2024
1 parent 9849b09 commit f61ec69
Showing 13 changed files with 299 additions and 145 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -75,4 +75,5 @@ set(SHUTDOWN_SOURCES src/copper/rpc_shutdown.cpp)
add_executable(${SHUTDOWN_PROJECT_NAME} ${SHUTDOWN_SOURCES})
target_link_libraries(${SHUTDOWN_PROJECT_NAME} PRIVATE PkgConfig::MARGO PkgConfig::THALLIUM)

install(TARGETS ${PROJECT_NAME} DESTINATION bin)
install(FILES scripts/launch_copper.sh ${PROJECT_NAME} DESTINATION bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
22 changes: 7 additions & 15 deletions README.md
@@ -28,19 +28,17 @@ More documentation can be found here: [readthedocs](https://alcf-copper-docs.readthedocs.io/)
### How to load the copper package on Aurora

```bash
module load copper
```
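
To confirm the package loaded correctly, a quick hedged check; this assumes the copper module exports `COPPER_ROOT`, which the launch steps below rely on:

```bash
module list 2>&1 | grep -i copper   # copper should appear among the loaded modules
echo ${COPPER_ROOT}                 # the cu_fuse binary lives under ${COPPER_ROOT}/bin
```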


### How to start the copper service

```bash
CUPATH=$COPPER_ROOT/bin/cu_fuse # If you are building copper on your own, set this path to your cu_fuse binary
LOGDIR=~/copper-logs/${PBS_JOBID}
CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
rm -rf ~/copper-logs* # clean up logs from previous jobs
mkdir -p ${LOGDIR}
clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}"
clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}"

read -r -d '' CMD << EOM
@@ -49,7 +47,7 @@ read -r -d '' CMD << EOM
-tpath / # / will be mounted under CU_FUSE_MNT_VIEWDIR
-vpath ${CU_FUSE_MNT_VIEWDIR} # To provide the fuse mounted location
-log_output_dir ${LOGDIR} # To provide where the copper logs will be stored
-log_level 6 # To set the log verbosity, from 0 (least) to 6 (most verbose)
-log_type file # To direct logging to file / stdout / both
-net_type cxi # To provide the network protocol
-nf ${PBS_NODEFILE} # To provide the hostlist where cu_fuse will be mounted
@@ -68,16 +66,10 @@ clush --hostfile ${PBS_NODEFILE} $CMD # To start copper on all the compute nodes
### How to run your app with copper

```bash
RANKS_PER_NODE=12
NRANKS=$(( NNODES * RANKS_PER_NODE ))
echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"
CPU_BINDING=list:4:9:14:19:20:25:56:61:66:71:74:79

time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=${CPU_BINDING} --genvall \
--genv=PYTHONPATH=${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env \
python3 -c "import torch; print(torch.__file__)"
```
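
When reads are actually served through copper, the torch path printed above should begin with the fuse view directory rather than /lus. An illustrative sketch of the expected output; the exact prefix depends on your `CU_FUSE_MNT_VIEWDIR`:

```bash
# Illustrative output shape (hypothetical path):
# /tmp/<user>/copper/lus/flare/.../copper_conda_env/torch/__init__.py
```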

### How to stop the copper service
93 changes: 0 additions & 93 deletions example/withcopper_aurora_job_script.sh

This file was deleted.

33 changes: 0 additions & 33 deletions example/withoutcopper_aurora_job_script.sh

This file was deleted.

58 changes: 58 additions & 0 deletions examples/example1/simple_with_copper.sh
@@ -0,0 +1,58 @@
#!/bin/bash -x
#PBS -l select=512
#PBS -l walltime=02:00:00
#PBS -A Aurora_deployment
#PBS -q lustre_scaling
#PBS -k doe

# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./simple_with_copper.sh # or use qsub -I for an interactive job

# This example shows loading Python modules from a Lustre directory using copper.

cd $PBS_O_WORKDIR
echo Jobid: $PBS_JOBID
echo Running on nodes `cat $PBS_NODEFILE`

# starting copper section

module load copper
CUPATH=$COPPER_ROOT/bin/cu_fuse # If you are building copper on your own, set this path to your cu_fuse binary
LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov} # strip the PBS server suffix from the job id
mkdir -p ${LOGDIR} #only on head node
CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes

read -r -d '' CMD << EOM
numactl --physcpubind="0-3"
$CUPATH
-tpath /
-vpath ${CU_FUSE_MNT_VIEWDIR}
-log_level 6
-log_type file
-log_output_dir ${LOGDIR}
-net_type cxi
-trees 1
-nf ${PBS_NODEFILE}
-max_cacheable_byte_size $((10*1024*1024))
-s ${CU_FUSE_MNT_VIEWDIR}
EOM

clush --hostfile ${PBS_NODEFILE} $CMD
sleep 20s # increase to 60s if you are running on more than 2k nodes

# end copper section


NNODES=`wc -l < $PBS_NODEFILE`
RANKS_PER_NODE=12
NRANKS=$(( NNODES * RANKS_PER_NODE ))
echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"

# The two lines below are only for one-time setup, to install a package into a custom directory. Do not run them in this job script.
# module load python
# pip install --target=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env numpy


time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \
--genv=PYTHONPATH=${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env \
python3 -c "import torch; print(torch.__file__)"
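
Two optional steps are sketched here rather than baked into the script above: a post-launch check that the fuse view is live, and an end-of-job teardown. Both are hedged sketches; the unmount and cleanup lines mirror examples/example3/launch_copper.sh further down.

```bash
# After the clush launch and sleep: since -tpath is /, the root tree should be visible in the view
clush --hostfile ${PBS_NODEFILE} "ls ${CU_FUSE_MNT_VIEWDIR}/lus"

# After the app finishes: unmount and remove the node-local view on every compute node
clush --hostfile ${PBS_NODEFILE} "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}"
clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}"
```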
28 changes: 28 additions & 0 deletions examples/example1/simple_without_copper.sh
@@ -0,0 +1,28 @@
#!/bin/bash -x
#PBS -l select=512
#PBS -l walltime=02:00:00
#PBS -A Aurora_deployment
#PBS -q lustre_scaling
#PBS -k doe

# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./simple_without_copper.sh # or use qsub -I for an interactive job

# This example shows loading Python modules from a Lustre directory without using copper.

cd $PBS_O_WORKDIR
echo Jobid: $PBS_JOBID
echo Running on nodes `cat $PBS_NODEFILE`

NNODES=`wc -l < $PBS_NODEFILE`
RANKS_PER_NODE=12
NRANKS=$(( NNODES * RANKS_PER_NODE ))
echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"

# The two lines below are only for one-time setup, to install a package into a custom directory. Do not run them in this job script.
# module load python
# pip install --target=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env numpy


time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \
--genv=PYTHONPATH=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env \
python3 -c "import torch; print(torch.__file__)"
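
The only functional difference from simple_with_copper.sh is the PYTHONPATH prefix, shown side by side below (both paths taken from the two scripts):

```bash
# Without copper: every rank reads the packages directly from Lustre
PYTHONPATH=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env

# With copper: ranks read through the node-local fuse view, which caches the underlying Lustre data
PYTHONPATH=${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env
```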
3 changes: 0 additions & 3 deletions example/real_app.py → examples/example2/real_app.py
@@ -26,6 +26,3 @@
import tensorflow.config.experimental
from pathlib import Path
# print(MPI.__file__)
57 changes: 57 additions & 0 deletions examples/example2/withcopper_aurora_job_script.sh
@@ -0,0 +1,57 @@
#!/bin/bash -x
#PBS -l select=512
#PBS -l walltime=02:00:00
#PBS -A Aurora_deployment
#PBS -q lustre_scaling
#PBS -k doe

# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withcopper_aurora_job_script.sh # or use qsub -I for an interactive job

# This example shows loading Python modules from a Lustre directory using copper.

cd $PBS_O_WORKDIR
echo Jobid: $PBS_JOBID
echo Running on nodes `cat $PBS_NODEFILE`

# starting copper section
module load copper
CUPATH=$COPPER_ROOT/bin/cu_fuse # If you are building copper on your own, set this path to your cu_fuse binary
LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov} # strip the PBS server suffix from the job id
mkdir -p ${LOGDIR} #only on head node
CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes

read -r -d '' CMD << EOM
numactl --physcpubind="0-3"
$CUPATH
-tpath /
-vpath ${CU_FUSE_MNT_VIEWDIR}
-log_level 6
-log_type file
-log_output_dir ${LOGDIR}
-net_type cxi
-trees 1
-nf ${PBS_NODEFILE}
-max_cacheable_byte_size $((10*1024*1024))
-s ${CU_FUSE_MNT_VIEWDIR}
EOM

clush --hostfile ${PBS_NODEFILE} $CMD
sleep 20s # increase to 60s if you are running on more than 2k nodes
# end copper section

# App section
NNODES=`wc -l < $PBS_NODEFILE`
RANKS_PER_NODE=12
NRANKS=$(( NNODES * RANKS_PER_NODE ))
echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"

module use /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024
module load frameworks/2024.1 # This will start your conda environment at /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024
conda deactivate
conda activate ${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 # Activate the conda env through the copper view instead of the direct Lustre path
which python

time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \
--genv=PYTHONPATH=${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 \
python3 real_app.py
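
A hedged sanity check can follow the `which python` above: the interpreter should resolve under the copper view, not directly under /lus. A minimal sketch:

```bash
# Sketch: warn if the activation did not go through the copper-mounted path
case "$(which python)" in
  ${CU_FUSE_MNT_VIEWDIR}*) echo "python is served through copper" ;;
  *) echo "WARNING: python is not under ${CU_FUSE_MNT_VIEWDIR}" ;;
esac
```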
26 changes: 26 additions & 0 deletions examples/example2/withoutcopper_aurora_job_script.sh
@@ -0,0 +1,26 @@
#!/bin/bash -x
#PBS -l select=512
#PBS -l walltime=02:00:00
#PBS -A Aurora_deployment
#PBS -q lustre_scaling
#PBS -k doe

# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withoutcopper_aurora_job_script.sh # or use qsub -I for an interactive job

# This example shows loading Python modules from a Lustre directory (the standard way), without using copper.

cd $PBS_O_WORKDIR
echo Jobid: $PBS_JOBID
echo Running on nodes `cat $PBS_NODEFILE`
NNODES=`wc -l < $PBS_NODEFILE`
RANKS_PER_NODE=12
NRANKS=$(( NNODES * RANKS_PER_NODE ))
echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}"

module use /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 # This is the location of the cloned copy of your custom conda environment on lustre
module load frameworks/2024.1 # This will start your conda environment at /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024
which python

time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \
--genv=PYTHONPATH=/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 \
python3 real_app.py
31 changes: 31 additions & 0 deletions examples/example3/launch_copper.sh
@@ -0,0 +1,31 @@
#!/bin/bash -x

module load copper
CUPATH=$COPPER_ROOT/bin/cu_fuse

rm -rf ~/copper-logs* # clean up logs from previous jobs
LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov} # strip the PBS server suffix from the job id
mkdir -p ${LOGDIR} #only on head node

CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper
clush --hostfile ${PBS_NODEFILE} "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}"
clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}"
clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes

read -r -d '' CMD << EOM
numactl --physcpubind="0-3"
$CUPATH
-tpath /
-vpath ${CU_FUSE_MNT_VIEWDIR}
-log_level 6
-log_type file
-log_output_dir ${LOGDIR}
-net_type cxi
-trees 1
-nf ${PBS_NODEFILE}
-max_cacheable_byte_size $((10*1024*1024))
-s ${CU_FUSE_MNT_VIEWDIR}
EOM

clush --hostfile ${PBS_NODEFILE} $CMD
sleep 20s # increase to 60s if you are running on more than 2k nodes
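
Because the CMakeLists.txt change above installs this script into bin, a job script can start copper with a single call instead of inlining the whole section. A minimal sketch, assuming the copper module exports COPPER_ROOT as in the examples above:

```bash
module load copper
bash ${COPPER_ROOT}/bin/launch_copper.sh   # mounts cu_fuse on every node in ${PBS_NODEFILE}
```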