-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
more examples and launch_copper.sh for aurora
- Loading branch information
1 parent
9849b09
commit f61ec69
Showing
13 changed files
with
299 additions
and
145 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#!/bin/bash -x | ||
#PBS -l select=512 | ||
#PBS -l walltime=02:00:00 | ||
#PBS -A Aurora_deployment | ||
#PBS -q lustre_scaling | ||
#PBS -k doe | ||
|
||
# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withcopper_aurora_job_script.sh # or -I | ||
|
||
# This example shows loading python modules from a lustre directory using copper. | ||
# Flow: start cu_fuse on every host in the node file, then run python under mpirun with PYTHONPATH routed through the copper view directory. | ||
|
||
# Switch to $PBS_O_WORKDIR and log the job id and the contents of the node file. | ||
cd $PBS_O_WORKDIR | ||
echo Jobid: $PBS_JOBID | ||
echo Running on nodes `cat $PBS_NODEFILE` | ||
|
||
# starting copper section | ||
|
||
module load copper | ||
CUPATH=$COPPER_ROOT/bin/cu_fuse # If you are building copper on your own, set this path to your cu_fuse binary | ||
# Per-job log directory: the job id with the PBS server suffix stripped. | ||
LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov} | ||
mkdir -p ${LOGDIR} #only on head node | ||
# Directory passed to cu_fuse below as both -vpath and -s; created on every host in the node file. | ||
CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper | ||
clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes | ||
|
||
# Collect the whole here-document into CMD. The EOM delimiter is unquoted, so the variables and the arithmetic expansion are substituted here. | ||
# read -d '' reads up to end of input and returns non-zero at EOF; nothing in this script checks that status. | ||
read -r -d '' CMD << EOM | ||
numactl --physcpubind="0-3" | ||
$CUPATH | ||
-tpath / | ||
-vpath ${CU_FUSE_MNT_VIEWDIR} | ||
-log_level 6 | ||
-log_type file | ||
-log_output_dir ${LOGDIR} | ||
-net_type cxi | ||
-trees 1 | ||
-nf ${PBS_NODEFILE} | ||
-max_cacheable_byte_size $((10*1024*1024)) | ||
-s ${CU_FUSE_MNT_VIEWDIR} | ||
EOM | ||
|
||
# $CMD is left unquoted, so the multi-line string is word-split into separate clush arguments. | ||
clush --hostfile ${PBS_NODEFILE} $CMD | ||
# NOTE(review): presumably this pause gives cu_fuse time to come up on all nodes before the app starts - confirm. | ||
sleep 20s # add 60s if you are running on more than 2k nodes | ||
|
||
# end copper section | ||
|
||
|
||
# Total rank count = number of lines in the node file times ranks per node. | ||
NNODES=`wc -l < $PBS_NODEFILE` | ||
RANKS_PER_NODE=12 | ||
NRANKS=$(( NNODES * RANKS_PER_NODE )) | ||
echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}" | ||
|
||
# The 2 lines below are only for first-time setup, to install a package into a custom directory. Do not run them in this job script. | ||
# module load python | ||
# pip install --target=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env numpy | ||
# NOTE(review): the setup example above installs numpy while the command below imports torch - confirm which package is intended. | ||
|
||
|
||
# PYTHONPATH is the lustre package directory prefixed with ${CU_FUSE_MNT_VIEWDIR}, i.e. the path as seen through the copper view directory. | ||
time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \ | ||
--genv=PYTHONPATH=${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env \ | ||
python3 -c "import torch; print(torch.__file__)" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/bin/bash -x | ||
#PBS -l select=512 | ||
#PBS -l walltime=02:00:00 | ||
#PBS -A Aurora_deployment | ||
#PBS -q lustre_scaling | ||
#PBS -k doe | ||
|
||
# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withcopper_aurora_job_script.sh # or -I | ||
# NOTE(review): the example above names the with-copper script although this script does not use copper - confirm the intended file name. | ||
|
||
# This example shows loading python modules from a lustre directory without using copper. | ||
|
||
# Switch to $PBS_O_WORKDIR and log the job id and the contents of the node file. | ||
cd $PBS_O_WORKDIR | ||
echo Jobid: $PBS_JOBID | ||
echo Running on nodes `cat $PBS_NODEFILE` | ||
|
||
# Total rank count = number of lines in the node file times ranks per node. | ||
NNODES=`wc -l < $PBS_NODEFILE` | ||
RANKS_PER_NODE=12 | ||
NRANKS=$(( NNODES * RANKS_PER_NODE )) | ||
echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}" | ||
|
||
# The 2 lines below are only for first-time setup, to install a package into a custom directory. Do not run them in this job script. | ||
# module load python | ||
# pip install --target=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env numpy | ||
# NOTE(review): the setup example above installs numpy while the command below imports torch - confirm which package is intended. | ||
|
||
|
||
# PYTHONPATH points directly at the lustre package directory (no copper view directory prefix). | ||
time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \ | ||
--genv=PYTHONPATH=/lus/flare/projects/Aurora_deployment/kaushik/copper/july12/copper/run/copper_conda_env \ | ||
python3 -c "import torch; print(torch.__file__)" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#!/bin/bash -x | ||
#PBS -l select=512 | ||
#PBS -l walltime=02:00:00 | ||
#PBS -A Aurora_deployment | ||
#PBS -q lustre_scaling | ||
#PBS -k doe | ||
|
||
# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withcopper_aurora_job_script.sh # or -I | ||
|
||
# This example shows loading python modules from a lustre directory using copper. | ||
# Flow: start cu_fuse on every host in the node file, re-activate the conda environment through the copper view directory, then run real_app.py under mpirun. | ||
|
||
# Switch to $PBS_O_WORKDIR and log the job id and the contents of the node file. | ||
cd $PBS_O_WORKDIR | ||
echo Jobid: $PBS_JOBID | ||
echo Running on nodes `cat $PBS_NODEFILE` | ||
|
||
# starting copper section | ||
module load copper | ||
CUPATH=$COPPER_ROOT/bin/cu_fuse # If you are building copper on your own, set this path to your cu_fuse binary | ||
# Per-job log directory: the job id with the PBS server suffix stripped. | ||
LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov} | ||
mkdir -p ${LOGDIR} #only on head node | ||
# Directory passed to cu_fuse below as both -vpath and -s; created on every host in the node file. | ||
CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper | ||
clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes | ||
|
||
# Collect the whole here-document into CMD. The EOM delimiter is unquoted, so the variables and the arithmetic expansion are substituted here. | ||
# read -d '' reads up to end of input and returns non-zero at EOF; nothing in this script checks that status. | ||
read -r -d '' CMD << EOM | ||
numactl --physcpubind="0-3" | ||
$CUPATH | ||
-tpath / | ||
-vpath ${CU_FUSE_MNT_VIEWDIR} | ||
-log_level 6 | ||
-log_type file | ||
-log_output_dir ${LOGDIR} | ||
-net_type cxi | ||
-trees 1 | ||
-nf ${PBS_NODEFILE} | ||
-max_cacheable_byte_size $((10*1024*1024)) | ||
-s ${CU_FUSE_MNT_VIEWDIR} | ||
EOM | ||
|
||
# $CMD is left unquoted, so the multi-line string is word-split into separate clush arguments. | ||
clush --hostfile ${PBS_NODEFILE} $CMD | ||
# NOTE(review): presumably this pause gives cu_fuse time to come up on all nodes before the app starts - confirm. | ||
sleep 20s # add 60s if you are running on more than 2k nodes | ||
# end copper section | ||
|
||
# App section | ||
# Total rank count = number of lines in the node file times ranks per node. | ||
NNODES=`wc -l < $PBS_NODEFILE` | ||
RANKS_PER_NODE=12 | ||
NRANKS=$(( NNODES * RANKS_PER_NODE )) | ||
echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}" | ||
|
||
module use /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 | ||
module load frameworks/2024.1 # This will start your conda environment at /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 | ||
# Leave the environment started by the module and activate the same directory again via its path under ${CU_FUSE_MNT_VIEWDIR}. | ||
conda deactivate | ||
conda activate ${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 #start conda through the copper path | ||
# Print which python executable is now first on PATH. | ||
which python | ||
|
||
# PYTHONPATH is the lustre environment directory prefixed with ${CU_FUSE_MNT_VIEWDIR}, i.e. the path as seen through the copper view directory. | ||
time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \ | ||
--genv=PYTHONPATH=${CU_FUSE_MNT_VIEWDIR}/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 \ | ||
python3 real_app.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#!/bin/bash -x | ||
#PBS -l select=512 | ||
#PBS -l walltime=02:00:00 | ||
#PBS -A Aurora_deployment | ||
#PBS -q lustre_scaling | ||
#PBS -k doe | ||
|
||
# qsub -l select=512:ncpus=208 -l walltime=02:00:00 -A Aurora_deployment -l filesystems=flare -q lustre_scaling ./withoutcopper_aurora_job_script.sh # or -I | ||
|
||
# This example shows loading python modules from a lustre directory (the standard way) without using copper. | ||
|
||
# Switch to $PBS_O_WORKDIR and log the job id and the contents of the node file. | ||
cd $PBS_O_WORKDIR | ||
echo Jobid: $PBS_JOBID | ||
echo Running on nodes `cat $PBS_NODEFILE` | ||
# Total rank count = number of lines in the node file times ranks per node. | ||
NNODES=`wc -l < $PBS_NODEFILE` | ||
RANKS_PER_NODE=12 | ||
NRANKS=$(( NNODES * RANKS_PER_NODE )) | ||
echo "App running on NUM_OF_NODES=${NNODES} TOTAL_NUM_RANKS=${NRANKS} RANKS_PER_NODE=${RANKS_PER_NODE}" | ||
|
||
module use /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 # This is the location of the cloned copy of your custom conda environment on lustre | ||
module load frameworks/2024.1 # This will start your conda environment at /lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 | ||
# Print which python executable is now first on PATH. | ||
which python | ||
|
||
# PYTHONPATH points directly at the lustre environment directory (no copper view directory prefix). | ||
time mpirun --np ${NRANKS} --ppn ${RANKS_PER_NODE} --cpu-bind=list:4:9:14:19:20:25:56:61:66:71:74:79 --genvall \ | ||
--genv=PYTHONPATH=/lus/flare/projects/Aurora_deployment/copper-software-module/example_app/app-dependencies/sst_2024 \ | ||
python3 real_app.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/bin/bash -x | ||
# Starts cu_fuse on every host listed in $PBS_NODEFILE, after resetting the per-node view directory. | ||
# Reads PBS_JOBID, PBS_NODEFILE and USER from the environment, and COPPER_ROOT after "module load copper". | ||
|
||
module load copper | ||
CUPATH=$COPPER_ROOT/bin/cu_fuse | ||
|
||
# NOTE(review): this glob is spelled copper_logs (underscore) while LOGDIR below uses copper-logs (hyphen), so it does not match the directory created here - confirm which is intended. | ||
rm -rf ~/copper_logs* | ||
# Per-job log directory: the job id with the PBS server suffix stripped. | ||
LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov} | ||
mkdir -p ${LOGDIR} #only on head node | ||
|
||
# On every host: unmount the view directory with fusermount3, remove it, then recreate it empty. | ||
CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper | ||
clush --hostfile ${PBS_NODEFILE} "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}" | ||
clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}" | ||
clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes | ||
|
||
# Collect the whole here-document into CMD. The EOM delimiter is unquoted, so the variables and the arithmetic expansion are substituted here. | ||
# read -d '' reads up to end of input and returns non-zero at EOF; nothing in this script checks that status. | ||
read -r -d '' CMD << EOM | ||
numactl --physcpubind="0-3" | ||
$CUPATH | ||
-tpath / | ||
-vpath ${CU_FUSE_MNT_VIEWDIR} | ||
-log_level 6 | ||
-log_type file | ||
-log_output_dir ${LOGDIR} | ||
-net_type cxi | ||
-trees 1 | ||
-nf ${PBS_NODEFILE} | ||
-max_cacheable_byte_size $((10*1024*1024)) | ||
-s ${CU_FUSE_MNT_VIEWDIR} | ||
EOM | ||
|
||
# $CMD is left unquoted, so the multi-line string is word-split into separate clush arguments. | ||
clush --hostfile ${PBS_NODEFILE} $CMD | ||
# NOTE(review): presumably this pause gives cu_fuse time to come up on all nodes - confirm. | ||
sleep 20s # add 60s if you are running on more than 2k nodes |
Oops, something went wrong.