diff --git a/2024_PDSW_Copper_paper.pdf b/2024_PDSW_Copper_paper.pdf new file mode 100644 index 00000000..6a544a43 Binary files /dev/null and b/2024_PDSW_Copper_paper.pdf differ diff --git a/CMakeLists.txt b/CMakeLists.txt index e64ff402..0a81d881 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,16 +64,14 @@ set(SOURCES src/cache/path_status_cache.cpp ) - add_executable(${PROJECT_NAME} ${SOURCES}) target_include_directories(${PROJECT_NAME} PRIVATE "${FUSE3_INCLUDE_DIR}" "${MPI_INCLUDE_PATH}" "${MARGO_INCLUDE_DIRS}" "${THALLIUM_INCLUDE_DIRS}") target_link_libraries(${PROJECT_NAME} PRIVATE ${MPI_CXX_LIBRARIES} PkgConfig::MARGO PkgConfig::THALLIUM fuse3 MPI::MPI_CXX) - set(SHUTDOWN_PROJECT_NAME cu_fuse_shutdown) set(SHUTDOWN_SOURCES src/copper/rpc_shutdown.cpp) add_executable(${SHUTDOWN_PROJECT_NAME} ${SHUTDOWN_SOURCES}) target_link_libraries(${SHUTDOWN_PROJECT_NAME} PRIVATE PkgConfig::MARGO PkgConfig::THALLIUM) install(TARGETS ${PROJECT_NAME} DESTINATION bin) -install(FILES scripts/launch_copper.sh ${PROJECT_NAME} DESTINATION bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) +install(PROGRAMS scripts/launch_copper.sh DESTINATION bin) diff --git a/examples/example3/launch_copper.sh b/examples/example3/launch_copper.sh index c0b322aa..a672ebb8 100644 --- a/examples/example3/launch_copper.sh +++ b/examples/example3/launch_copper.sh @@ -1,31 +1,60 @@ #!/bin/bash -x -module load copper +log_level=6 +log_type="file" +trees=1 +max_cacheable_byte_size=$((10*1024*1024)) +sleeptime=20 +LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov} +rm -rf ~/copper_logs* CUPATH=$COPPER_ROOT/bin/cu_fuse +CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper +physcpubind="48-51" -rm -rf ~/copper_logs* -LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov} -mkdir -p ${LOGDIR} #only on head node -CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper -clush --hostfile ${PBS_NODEFILE} "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}" -clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}" -clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes +while getopts "l:t:T:M:s:b:" opt; do + case ${opt} in + l ) log_level=$OPTARG ;; + t ) log_type=$OPTARG ;; + T ) trees=$OPTARG ;; + M ) max_cacheable_byte_size=$OPTARG ;; + s ) sleeptime=$OPTARG ;; + b ) physcpubind=$OPTARG ;; + \? ) echo "Usage: cmd [-l] [-t] [-T] [-M] [-s] [-b]" ;; + esac +done + +echo "log_level : ${log_level}" +echo "log_type : ${log_type}" +echo "trees : ${trees}" +echo "max_cacheable_byte_size : ${max_cacheable_byte_size}" +echo "sleeptime : ${sleeptime}" +echo "CU_FUSE_MNT_VIEWDIR : ${CU_FUSE_MNT_VIEWDIR}" +echo "LOGDIR : ${LOGDIR}" +echo "PBS_NODEFILE : ${PBS_NODEFILE}" +echo "physcpubind : ${physcpubind}" + + + +mkdir -p "${LOGDIR}" #only on head node +clush --hostfile "${PBS_NODEFILE}" "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}" +clush --hostfile "${PBS_NODEFILE}" "rm -rf ${CU_FUSE_MNT_VIEWDIR}" +clush --hostfile "${PBS_NODEFILE}" "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes read -r -d '' CMD << EOM - numactl --physcpubind="0-3" + numactl --physcpubind=${physcpubind} $CUPATH -tpath / -vpath ${CU_FUSE_MNT_VIEWDIR} - -log_level 6 - -log_type file + -log_level ${log_level} + -log_type ${log_type} -log_output_dir ${LOGDIR} -net_type cxi - -trees 1 + -trees ${trees} -nf ${PBS_NODEFILE} - -max_cacheable_byte_size $((10*1024*1024)) + -max_cacheable_byte_size ${max_cacheable_byte_size} -s ${CU_FUSE_MNT_VIEWDIR} EOM -clush --hostfile ${PBS_NODEFILE} $CMD -sleep 20s # add 60s if you are running on more than 2k nodes +clush --hostfile "${PBS_NODEFILE}" "$CMD" +sleep "${sleeptime}"s # add 60s if you are running on more than 2k nodes \ No newline at end of file diff --git a/examples/example3/simple_with_copper.sh b/examples/example3/simple_with_copper.sh index 61286dff..ae0e2272 100644 --- a/examples/example3/simple_with_copper.sh +++ b/examples/example3/simple_with_copper.sh @@ -13,6 +13,7 @@ cd $PBS_O_WORKDIR echo Jobid: $PBS_JOBID echo Running on nodes `cat $PBS_NODEFILE` +module load copper launch_copper.sh # Prepend /tmp/${USER}/copper/ to all your absolute paths if you want your I/O to go through copper (including PYTHON_PATH, CONDA_PREFIX, CONDA_ROOT and PATH) diff --git a/scripts/build_helper/build.sh b/scripts/build_helper/build.sh index 2b604aae..aa8288ec 100755 --- a/scripts/build_helper/build.sh +++ b/scripts/build_helper/build.sh @@ -35,5 +35,5 @@ cp build/compile_commands.json . || { echo "Failed to copy compile commands"; ex cd build || { echo "Failed to move to build dir"; exit 1; } make || { echo "Failed to build cu_fuse"; exit 1; } -# make install +make install diff --git a/scripts/launch_copper.sh b/scripts/launch_copper.sh index c0b322aa..a672ebb8 100644 --- a/scripts/launch_copper.sh +++ b/scripts/launch_copper.sh @@ -1,31 +1,60 @@ #!/bin/bash -x -module load copper +log_level=6 +log_type="file" +trees=1 +max_cacheable_byte_size=$((10*1024*1024)) +sleeptime=20 +LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov} +rm -rf ~/copper_logs* CUPATH=$COPPER_ROOT/bin/cu_fuse +CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper +physcpubind="48-51" -rm -rf ~/copper_logs* -LOGDIR=~/copper-logs/${PBS_JOBID%%.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov} -mkdir -p ${LOGDIR} #only on head node -CU_FUSE_MNT_VIEWDIR=/tmp/${USER}/copper -clush --hostfile ${PBS_NODEFILE} "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}" -clush --hostfile ${PBS_NODEFILE} "rm -rf ${CU_FUSE_MNT_VIEWDIR}" -clush --hostfile ${PBS_NODEFILE} "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes +while getopts "l:t:T:M:s:b:" opt; do + case ${opt} in + l ) log_level=$OPTARG ;; + t ) log_type=$OPTARG ;; + T ) trees=$OPTARG ;; + M ) max_cacheable_byte_size=$OPTARG ;; + s ) sleeptime=$OPTARG ;; + b ) physcpubind=$OPTARG ;; + \? ) echo "Usage: cmd [-l] [-t] [-T] [-M] [-s] [-b]" ;; + esac +done + +echo "log_level : ${log_level}" +echo "log_type : ${log_type}" +echo "trees : ${trees}" +echo "max_cacheable_byte_size : ${max_cacheable_byte_size}" +echo "sleeptime : ${sleeptime}" +echo "CU_FUSE_MNT_VIEWDIR : ${CU_FUSE_MNT_VIEWDIR}" +echo "LOGDIR : ${LOGDIR}" +echo "PBS_NODEFILE : ${PBS_NODEFILE}" +echo "physcpubind : ${physcpubind}" + + + +mkdir -p "${LOGDIR}" #only on head node +clush --hostfile "${PBS_NODEFILE}" "fusermount3 -u ${CU_FUSE_MNT_VIEWDIR}" +clush --hostfile "${PBS_NODEFILE}" "rm -rf ${CU_FUSE_MNT_VIEWDIR}" +clush --hostfile "${PBS_NODEFILE}" "mkdir -p ${CU_FUSE_MNT_VIEWDIR}" # on all compute nodes read -r -d '' CMD << EOM - numactl --physcpubind="0-3" + numactl --physcpubind=${physcpubind} $CUPATH -tpath / -vpath ${CU_FUSE_MNT_VIEWDIR} - -log_level 6 - -log_type file + -log_level ${log_level} + -log_type ${log_type} -log_output_dir ${LOGDIR} -net_type cxi - -trees 1 + -trees ${trees} -nf ${PBS_NODEFILE} - -max_cacheable_byte_size $((10*1024*1024)) + -max_cacheable_byte_size ${max_cacheable_byte_size} -s ${CU_FUSE_MNT_VIEWDIR} EOM -clush --hostfile ${PBS_NODEFILE} $CMD -sleep 20s # add 60s if you are running on more than 2k nodes +clush --hostfile "${PBS_NODEFILE}" "$CMD" +sleep "${sleeptime}"s # add 60s if you are running on more than 2k nodes \ No newline at end of file