From a905346a7695be3d93378d14cdeefce6c848c873 Mon Sep 17 00:00:00 2001 From: ragansu Date: Wed, 21 Aug 2024 05:52:36 -0700 Subject: [PATCH 1/3] optimising and reworking final touches with srun --- scripts/full_chain_post_process.sh | 104 +++++++++++++---------------- scripts/run_merger.sh | 41 ++++++++++++ 2 files changed, 86 insertions(+), 59 deletions(-) create mode 100755 scripts/run_merger.sh diff --git a/scripts/full_chain_post_process.sh b/scripts/full_chain_post_process.sh index 44b68c1..0968ff7 100755 --- a/scripts/full_chain_post_process.sh +++ b/scripts/full_chain_post_process.sh @@ -1,71 +1,57 @@ #!/bin/bash -#SBATCH --image=rootproject/root:6.28.04-ubuntu22.04 +#SBATCH --image=ragansu/fair-universe-data:test python3 #SBATCH --account=m4287 -#SBATCH --qos=shared -#SBATCH --tasks-per-node=1 +#SBATCH --qos=regular +#SBATCH -N 1 #SBATCH --constraint=cpu #SBATCH -t 4:00:00 #SBATCH -J Final_touches - now=$(date +"%Y%m%d") -working_dir=/global/cfs/cdirs/m4287/hep -merged_dir=$working_dir/Delphes_PYTHIA8_output/Merged_files -output_dir=$working_dir/Delphes_PYTHIA8_output/Full_data_files_$now +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case "$1" in + -p|--public-train-factor) + public_train_factor=$2 + shift 2 + ;; + -t|--public-test-factor) + public_test_factor=$2 + shift 2 + ;; + -f|--test-factor) + test_factor=$2 + shift 2 + ;; + -l|--luminocity) + luminocity=$2 + shift 2 + ;; + *) + echo "Invalid argument: $1" + exit 1 + ;; + esac +done + +working_dir=/global/cfs/cdirs/m4287/hep/genHEPdata +data_dir=$working_dir/NEW_DelphesPythia_data +merged_dir=$data_dir/Merged_files +output_dir=$data_dir/Full_data_files_$now +Processes=("ttbar" "ztautau" "htautau" "diboson") +# Use srun to execute the script with Shifter +srun -n 4 -c 64 shifter bash -c " + # Define the array inside the shifter environment + Processes=('ttbar' 'ztautau' 'htautau' 'diboson') -for Process in ttbar htautau diboson ztautau -do - { - - input_dir=$working_dir/Delphes_PYTHIA8_output/csv_files_$Process - # Pattern to match the ROOT files (e.g., *.root) - FILES="$input_dir/*.root" - - # Output directory for the merged files - OUTPUT_DIR="$input_dir/merged_files" - mkdir -p "$OUTPUT_DIR" - - # Number of files to merge per batch - BATCH_SIZE=1000 - - # Initialize the batch counter - batch_num=1 - - # Loop over the files in batches of BATCH_SIZE - for ((i = 0; i < $(ls -1 $FILES | wc -l); i += BATCH_SIZE)); do - # Create a list of files for the current batch - file_list=$(ls -1 $FILES | sed -n "$((i + 1)),$((i + BATCH_SIZE))p") - - # Create the output file name for the current batch - output_file="$OUTPUT_DIR/merged_${batch_num}.root" - - # Run hadd to merge the current batch of files - shifter --image=rootproject/root:6.28.04-ubuntu22.04 hadd -f "$output_file" $file_list - - # Increment the batch counter - batch_num=$((batch_num + 1)) - done - - rm $input_dir/$Process.root - shifter --image=rootproject/root:6.28.04-ubuntu22.04 hadd ${input_dir}/${Process}.root "$OUTPUT_DIR/merged_*.root" - - # This script will merge the csv files generated by the FullDataGenerator into a single file. - shifter --image=nersc/fair_universe:1298f0a8 python3 $working_dir/genHEPdata/scripts/Data_merger.py --input $input_dir --output $merged_dir/$Process -p - - shifter --image=rootproject/root:6.28.04-ubuntu22.04 python $working_dir/genHEPdata/scripts/process_counter.py $input_dir $merged_dir $Process - - } -done + # Access the correct process based on SLURM_PROCID + process=\${Processes[\$SLURM_PROCID]} + input_dir=$data_dir/csv_files_$process + # Run the test script with the appropriate arguments + $WorkDir/scripts/run_merger.sh \${process} ${input_dir} ${merged_dir} +" -shifter --image=nersc/fair_universe:1298f0a8 python3 Final_touches.py \ ---input $merged_dir \ ---output $output_dir \ ---input-format "parquet" \ ---output-format "parquet" \ ---derived-quantities \ ---test-factor 10 \ ---public-train-factor 34 \ ---public-test-factor 5 \ No newline at end of file +srun -n 1 c 256 shifter python3 Final_touches.py --input $merged_dir --output $output_dir --input-format "parquet" --output-format "parquet" --test-factor ${test_factor} --public-train-factor ${public_train_factor} --public-test-factor ${public_train_factor} --luminocity ${luminocity} diff --git a/scripts/run_merger.sh b/scripts/run_merger.sh new file mode 100755 index 0000000..8ae2cec --- /dev/null +++ b/scripts/run_merger.sh @@ -0,0 +1,41 @@ + +Process=$1 +input_dir=$2 +merged_dir=$3 + +# Pattern to match the ROOT files (e.g., *.root) +FILES="$input_dir/*.root" + +# Output directory for the merged files +OUTPUT_DIR="$input_dir/merged_files" +mkdir -p "$OUTPUT_DIR" + +# Number of files to merge per batch +BATCH_SIZE=1000 + +# Initialize the batch counter +batch_num=1 + +# Loop over the files in batches of BATCH_SIZE +for ((i = 0; i < $(ls -1 $FILES | wc -l); i += BATCH_SIZE)); do + # Create a list of files for the current batch + file_list=$(ls -1 $FILES | sed -n "$((i + 1)),$((i + BATCH_SIZE))p") + + # Create the output file name for the current batch + output_file="$OUTPUT_DIR/merged_${batch_num}.root" + + # Run hadd to merge the current batch of files + hadd -f "$output_file" $file_list + + # Increment the batch counter + batch_num=$((batch_num + 1)) +done + +rm $input_dir/$Process.root +hadd ${input_dir}/${Process}.root "$OUTPUT_DIR/merged_*.root" + +# This script will merge the csv files generated by the FullDataGenerator into a single file. +python3 $working_dir/genHEPdata/scripts/Data_merger.py --input $input_dir --output $merged_dir/$Process -p + +python3 $working_dir/genHEPdata/scripts/process_counter.py $input_dir $merged_dir $Process + \ No newline at end of file From f97b81248b0b5dcdb1b069af0b779889b639c388 Mon Sep 17 00:00:00 2001 From: ragansu Date: Wed, 21 Aug 2024 09:38:55 -0700 Subject: [PATCH 2/3] typo fixed --- scripts/full_chain_post_process.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/full_chain_post_process.sh b/scripts/full_chain_post_process.sh index 0968ff7..ba11734 100755 --- a/scripts/full_chain_post_process.sh +++ b/scripts/full_chain_post_process.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --image=ragansu/fair-universe-data:test python3 +#SBATCH --image=ragansu/fair-universe-data:test #SBATCH --account=m4287 #SBATCH --qos=regular #SBATCH -N 1 @@ -54,4 +54,4 @@ srun -n 4 -c 64 shifter bash -c " $WorkDir/scripts/run_merger.sh \${process} ${input_dir} ${merged_dir} " -srun -n 1 c 256 shifter python3 Final_touches.py --input $merged_dir --output $output_dir --input-format "parquet" --output-format "parquet" --test-factor ${test_factor} --public-train-factor ${public_train_factor} --public-test-factor ${public_train_factor} --luminocity ${luminocity} +srun -n 1 -c 256 shifter python3 Final_touches.py --input $merged_dir --output $output_dir --input-format "parquet" --output-format "parquet" --test-factor ${test_factor} --public-train-factor ${public_train_factor} --public-test-factor ${public_train_factor} --luminocity ${luminocity} From 242d19e9ee0bebefb8ed142d57488318e23e6cee Mon Sep 17 00:00:00 2001 From: ragansu Date: Tue, 27 Aug 2024 02:37:20 -0700 Subject: [PATCH 3/3] updated codes and tested --- scripts/Final_touches.py | 4 +++ scripts/full_chain_post_process.sh | 58 ++++++++++++++---------------- scripts/run_batch.sh | 4 +-- scripts/run_merger.sh | 13 ++++--- 4 files changed, 40 insertions(+), 39 deletions(-) diff --git a/scripts/Final_touches.py b/scripts/Final_touches.py index ff3a4a6..dce6479 100644 --- a/scripts/Final_touches.py +++ b/scripts/Final_touches.py @@ -249,6 +249,10 @@ def train_test_data_generator(full_data, test_factor=2, train_factor = 8): print("\n[*] -- full_data") for key in full_data.keys(): lhc_numbers = int(np.sum(full_data[key]["Weight"])) + print("lhc_numbers", lhc_numbers) + print("test_factor", test_factor) + print("train_factor", train_factor) + if key == "htautau": print(f"[*] --- {key} : {full_data[key].shape}") diff --git a/scripts/full_chain_post_process.sh b/scripts/full_chain_post_process.sh index ba11734..78a488f 100755 --- a/scripts/full_chain_post_process.sh +++ b/scripts/full_chain_post_process.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --image=ragansu/fair-universe-data:test -#SBATCH --account=m4287 +#SBATCH --account=dasrepo #SBATCH --qos=regular #SBATCH -N 1 #SBATCH --constraint=cpu @@ -9,33 +9,17 @@ now=$(date +"%Y%m%d") -# Parse command line arguments -while [[ $# -gt 0 ]]; do - case "$1" in - -p|--public-train-factor) - public_train_factor=$2 - shift 2 - ;; - -t|--public-test-factor) - public_test_factor=$2 - shift 2 - ;; - -f|--test-factor) - test_factor=$2 - shift 2 - ;; - -l|--luminocity) - luminocity=$2 - shift 2 - ;; - *) - echo "Invalid argument: $1" - exit 1 - ;; - esac -done - -working_dir=/global/cfs/cdirs/m4287/hep/genHEPdata + +# Set default values for the optional arguments +public_train_factor=100 +public_test_factor=60 +test_factor=60 +luminocity=10 # in fb^-1 + +# Define the working directory + +working_dir=/global/cfs/cdirs/m4287/hep +WorkDir=$working_dir/genHEPdata data_dir=$working_dir/NEW_DelphesPythia_data merged_dir=$data_dir/Merged_files output_dir=$data_dir/Full_data_files_$now @@ -43,15 +27,25 @@ output_dir=$data_dir/Full_data_files_$now Processes=("ttbar" "ztautau" "htautau" "diboson") # Use srun to execute the script with Shifter -srun -n 4 -c 64 shifter bash -c " +srun -n 4 -c 256 shifter bash -c " # Define the array inside the shifter environment Processes=('ttbar' 'ztautau' 'htautau' 'diboson') # Access the correct process based on SLURM_PROCID process=\${Processes[\$SLURM_PROCID]} - input_dir=$data_dir/csv_files_$process + input_dir=${data_dir}/csv_files_\${process} # Run the test script with the appropriate arguments - $WorkDir/scripts/run_merger.sh \${process} ${input_dir} ${merged_dir} + $WorkDir/scripts/run_merger.sh \${process} \${input_dir} ${merged_dir} " -srun -n 1 -c 256 shifter python3 Final_touches.py --input $merged_dir --output $output_dir --input-format "parquet" --output-format "parquet" --test-factor ${test_factor} --public-train-factor ${public_train_factor} --public-test-factor ${public_train_factor} --luminocity ${luminocity} +wait + +echo "All jobs are done!" + +# Merge the merged files into a single file +echo +echo "Merging the merged files into a single file" +echo +echo "Final touches" + +srun -n 1 -c 256 shifter python3 ${WorkDir}/scripts/Final_touches.py --input ${merged_dir} --output ${output_dir} --input-format "parquet" --output-format "parquet" --test-factor ${test_factor} --public-train-factor ${public_train_factor} --public-test-factor ${public_test_factor} --luminocity ${luminocity} diff --git a/scripts/run_batch.sh b/scripts/run_batch.sh index 160395e..d75d0b5 100644 --- a/scripts/run_batch.sh +++ b/scripts/run_batch.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --image=rootproject/root:6.28.04-ubuntu22.04 -#SBATCH --account=m4287 +#SBATCH --account=dasrepo #SBATCH --qos=regular #SBATCH -N 1 #SBATCH --constraint=cpu @@ -14,7 +14,7 @@ process=$2 label=$3 export WorkDir=/global/cfs/cdirs/m4287/hep/genHEPdata -srun -n 64 -c 4 shifter bash -c " +srun -n 128 -c 2 shifter bash -c " seed=\$((${main_seed}*64 + \$SLURM_PROCID)) $WorkDir/scripts/run_generator.sh \${seed} ${process} ${label} " diff --git a/scripts/run_merger.sh b/scripts/run_merger.sh index 8ae2cec..5e08d24 100755 --- a/scripts/run_merger.sh +++ b/scripts/run_merger.sh @@ -2,6 +2,7 @@ Process=$1 input_dir=$2 merged_dir=$3 +working_dir=/global/cfs/cdirs/m4287/hep # Pattern to match the ROOT files (e.g., *.root) FILES="$input_dir/*.root" @@ -16,11 +17,10 @@ BATCH_SIZE=1000 # Initialize the batch counter batch_num=1 +file_list=$input_dir/file_name_list.txt + # Loop over the files in batches of BATCH_SIZE -for ((i = 0; i < $(ls -1 $FILES | wc -l); i += BATCH_SIZE)); do - # Create a list of files for the current batch - file_list=$(ls -1 $FILES | sed -n "$((i + 1)),$((i + BATCH_SIZE))p") - +cat ${file_list} | xargs -n $BATCH_SIZE | while read -r file_list; do # Create the output file name for the current batch output_file="$OUTPUT_DIR/merged_${batch_num}.root" @@ -32,9 +32,12 @@ for ((i = 0; i < $(ls -1 $FILES | wc -l); i += BATCH_SIZE)); do done rm $input_dir/$Process.root -hadd ${input_dir}/${Process}.root "$OUTPUT_DIR/merged_*.root" +merged_file_list=$(ls -1 $OUTPUT_DIR/*.root) + +hadd ${input_dir}/${Process}.root $merged_file_list # This script will merge the csv files generated by the FullDataGenerator into a single file. + python3 $working_dir/genHEPdata/scripts/Data_merger.py --input $input_dir --output $merged_dir/$Process -p python3 $working_dir/genHEPdata/scripts/process_counter.py $input_dir $merged_dir $Process