diff --git a/scripts/Final_touches.py b/scripts/Final_touches.py index ff3a4a6..dce6479 100644 --- a/scripts/Final_touches.py +++ b/scripts/Final_touches.py @@ -249,6 +249,10 @@ def train_test_data_generator(full_data, test_factor=2, train_factor = 8): print("\n[*] -- full_data") for key in full_data.keys(): lhc_numbers = int(np.sum(full_data[key]["Weight"])) + print("lhc_numbers", lhc_numbers) + print("test_factor", test_factor) + print("train_factor", train_factor) + if key == "htautau": print(f"[*] --- {key} : {full_data[key].shape}") diff --git a/scripts/full_chain_post_process.sh b/scripts/full_chain_post_process.sh index 44b68c1..78a488f 100755 --- a/scripts/full_chain_post_process.sh +++ b/scripts/full_chain_post_process.sh @@ -1,71 +1,51 @@ #!/bin/bash -#SBATCH --image=rootproject/root:6.28.04-ubuntu22.04 -#SBATCH --account=m4287 -#SBATCH --qos=shared -#SBATCH --tasks-per-node=1 +#SBATCH --image=ragansu/fair-universe-data:test +#SBATCH --account=dasrepo +#SBATCH --qos=regular +#SBATCH -N 1 #SBATCH --constraint=cpu #SBATCH -t 4:00:00 #SBATCH -J Final_touches - now=$(date +"%Y%m%d") + +# Set default values for the optional arguments +public_train_factor=100 +public_test_factor=60 +test_factor=60 +luminocity=10 # in fb^-1 + +# Define the working directory + working_dir=/global/cfs/cdirs/m4287/hep -merged_dir=$working_dir/Delphes_PYTHIA8_output/Merged_files -output_dir=$working_dir/Delphes_PYTHIA8_output/Full_data_files_$now +WorkDir=$working_dir/genHEPdata +data_dir=$working_dir/NEW_DelphesPythia_data +merged_dir=$data_dir/Merged_files +output_dir=$data_dir/Full_data_files_$now + +Processes=("ttbar" "ztautau" "htautau" "diboson") + +# Use srun to execute the script with Shifter +srun -n 4 -c 256 shifter bash -c " + # Define the array inside the shifter environment + Processes=('ttbar' 'ztautau' 'htautau' 'diboson') + + # Access the correct process based on SLURM_PROCID + process=\${Processes[\$SLURM_PROCID]} + input_dir=${data_dir}/csv_files_\${process} + # 
Run the test script with the appropriate arguments + $WorkDir/scripts/run_merger.sh \${process} \${input_dir} ${merged_dir} +" +wait +echo "All jobs are done!" -for Process in ttbar htautau diboson ztautau -do - { - - input_dir=$working_dir/Delphes_PYTHIA8_output/csv_files_$Process - # Pattern to match the ROOT files (e.g., *.root) - FILES="$input_dir/*.root" - - # Output directory for the merged files - OUTPUT_DIR="$input_dir/merged_files" - mkdir -p "$OUTPUT_DIR" - - # Number of files to merge per batch - BATCH_SIZE=1000 - - # Initialize the batch counter - batch_num=1 - - # Loop over the files in batches of BATCH_SIZE - for ((i = 0; i < $(ls -1 $FILES | wc -l); i += BATCH_SIZE)); do - # Create a list of files for the current batch - file_list=$(ls -1 $FILES | sed -n "$((i + 1)),$((i + BATCH_SIZE))p") - - # Create the output file name for the current batch - output_file="$OUTPUT_DIR/merged_${batch_num}.root" - - # Run hadd to merge the current batch of files - shifter --image=rootproject/root:6.28.04-ubuntu22.04 hadd -f "$output_file" $file_list - - # Increment the batch counter - batch_num=$((batch_num + 1)) - done - - rm $input_dir/$Process.root - shifter --image=rootproject/root:6.28.04-ubuntu22.04 hadd ${input_dir}/${Process}.root "$OUTPUT_DIR/merged_*.root" - - # This script will merge the csv files generated by the FullDataGenerator into a single file. 
- shifter --image=nersc/fair_universe:1298f0a8 python3 $working_dir/genHEPdata/scripts/Data_merger.py --input $input_dir --output $merged_dir/$Process -p - - shifter --image=rootproject/root:6.28.04-ubuntu22.04 python $working_dir/genHEPdata/scripts/process_counter.py $input_dir $merged_dir $Process - - } -done +# Merge the merged files into a single file +echo +echo "Merging the merged files into a single file" +echo +echo "Final touches" -shifter --image=nersc/fair_universe:1298f0a8 python3 Final_touches.py \ ---input $merged_dir \ ---output $output_dir \ ---input-format "parquet" \ ---output-format "parquet" \ ---derived-quantities \ ---test-factor 10 \ ---public-train-factor 34 \ ---public-test-factor 5 \ No newline at end of file +srun -n 1 -c 256 shifter python3 ${WorkDir}/scripts/Final_touches.py --input ${merged_dir} --output ${output_dir} --input-format "parquet" --output-format "parquet" --test-factor ${test_factor} --public-train-factor ${public_train_factor} --public-test-factor ${public_test_factor} --luminocity ${luminocity} diff --git a/scripts/run_batch.sh b/scripts/run_batch.sh index 160395e..d75d0b5 100644 --- a/scripts/run_batch.sh +++ b/scripts/run_batch.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --image=rootproject/root:6.28.04-ubuntu22.04 -#SBATCH --account=m4287 +#SBATCH --account=dasrepo #SBATCH --qos=regular #SBATCH -N 1 #SBATCH --constraint=cpu @@ -14,7 +14,8 @@ process=$2 label=$3 export WorkDir=/global/cfs/cdirs/m4287/hep/genHEPdata -srun -n 64 -c 4 shifter bash -c " - seed=\$((${main_seed}*64 + \$SLURM_PROCID)) +srun -n 128 -c 2 shifter bash -c " + # Stride by the task count (128) so jobs with consecutive main_seed values get disjoint seeds + seed=\$((${main_seed}*128 + \$SLURM_PROCID)) $WorkDir/scripts/run_generator.sh \${seed} ${process} ${label} " diff --git a/scripts/run_merger.sh b/scripts/run_merger.sh new file mode 100755 index 0000000..5e08d24 --- /dev/null +++ b/scripts/run_merger.sh @@ -0,0 +1,44 @@ + +Process=$1 +input_dir=$2 +merged_dir=$3 +working_dir=/global/cfs/cdirs/m4287/hep + +# Pattern to match the ROOT files (e.g., *.root) +FILES="$input_dir/*.root" + +# Output 
directory for the merged files +OUTPUT_DIR="$input_dir/merged_files" +mkdir -p "$OUTPUT_DIR" + +# Number of files to merge per batch +BATCH_SIZE=1000 + +# Initialize the batch counter +batch_num=1 + +file_list=$input_dir/file_name_list.txt + +# Loop over the files in batches of BATCH_SIZE +cat ${file_list} | xargs -n $BATCH_SIZE | while read -r file_list; do + # Create the output file name for the current batch + output_file="$OUTPUT_DIR/merged_${batch_num}.root" + + # Run hadd to merge the current batch of files + hadd -f "$output_file" $file_list + + # Increment the batch counter + batch_num=$((batch_num + 1)) +done + +rm $input_dir/$Process.root +merged_file_list=$(ls -1 $OUTPUT_DIR/*.root) + +hadd ${input_dir}/${Process}.root $merged_file_list + +# This script will merge the csv files generated by the FullDataGenerator into a single file. + +python3 $working_dir/genHEPdata/scripts/Data_merger.py --input $input_dir --output $merged_dir/$Process -p + +python3 $working_dir/genHEPdata/scripts/process_counter.py $input_dir $merged_dir $Process + \ No newline at end of file