Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

optimising and reworking final touches with srun #8

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions scripts/Final_touches.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,10 @@ def train_test_data_generator(full_data, test_factor=2, train_factor = 8):
print("\n[*] -- full_data")
for key in full_data.keys():
lhc_numbers = int(np.sum(full_data[key]["Weight"]))
print("lhc_numbers", lhc_numbers)
print("test_factor", test_factor)
print("train_factor", train_factor)


if key == "htautau":
print(f"[*] --- {key} : {full_data[key].shape}")
Expand Down
98 changes: 39 additions & 59 deletions scripts/full_chain_post_process.sh
Original file line number Diff line number Diff line change
@@ -1,71 +1,51 @@
#!/bin/bash
#SBATCH --image=rootproject/root:6.28.04-ubuntu22.04
#SBATCH --account=m4287
#SBATCH --qos=shared
#SBATCH --tasks-per-node=1
#SBATCH --image=ragansu/fair-universe-data:test
#SBATCH --account=dasrepo
#SBATCH --qos=regular
#SBATCH -N 1
#SBATCH --constraint=cpu
#SBATCH -t 4:00:00
#SBATCH -J Final_touches


now=$(date +"%Y%m%d")


# Set default values for the optional arguments
public_train_factor=100
public_test_factor=60
test_factor=60
luminocity=10 # in fb^-1

# Define the working directory

working_dir=/global/cfs/cdirs/m4287/hep
merged_dir=$working_dir/Delphes_PYTHIA8_output/Merged_files
output_dir=$working_dir/Delphes_PYTHIA8_output/Full_data_files_$now
WorkDir=$working_dir/genHEPdata
data_dir=$working_dir/NEW_DelphesPythia_data
merged_dir=$data_dir/Merged_files
output_dir=$data_dir/Full_data_files_$now

Processes=("ttbar" "ztautau" "htautau" "diboson")

# Fan the per-process merge out over four srun tasks, each running inside the
# Shifter image. Every task selects its process name from the Processes array
# by its SLURM_PROCID and calls run_merger.sh with
#   <process> <data_dir>/csv_files_<process> <merged_dir>.
# Escaped expansions (\$...) are evaluated by the inner bash of each task;
# unescaped ones ($WorkDir, ${data_dir}, ${merged_dir}) are substituted by this
# script before srun is started.
# NOTE(review): "-n 4 -c 256" requests 4 x 256 = 1024 CPUs in total; together
# with "#SBATCH -N 1" this may exceed what a single node offers and be rejected
# by Slurm - confirm the intended -c value.
srun -n 4 -c 256 shifter bash -c "
# Define the array inside the shifter environment
Processes=('ttbar' 'ztautau' 'htautau' 'diboson')

# Access the correct process based on SLURM_PROCID
process=\${Processes[\$SLURM_PROCID]}
input_dir=${data_dir}/csv_files_\${process}
# Run the test script with the appropriate arguments
$WorkDir/scripts/run_merger.sh \${process} \${input_dir} ${merged_dir}
"

wait

echo "All jobs are done!"

for Process in ttbar htautau diboson ztautau
do
{

input_dir=$working_dir/Delphes_PYTHIA8_output/csv_files_$Process
# Pattern to match the ROOT files (e.g., *.root)
FILES="$input_dir/*.root"

# Output directory for the merged files
OUTPUT_DIR="$input_dir/merged_files"
mkdir -p "$OUTPUT_DIR"

# Number of files to merge per batch
BATCH_SIZE=1000

# Initialize the batch counter
batch_num=1

# Loop over the files in batches of BATCH_SIZE
for ((i = 0; i < $(ls -1 $FILES | wc -l); i += BATCH_SIZE)); do
# Create a list of files for the current batch
file_list=$(ls -1 $FILES | sed -n "$((i + 1)),$((i + BATCH_SIZE))p")

# Create the output file name for the current batch
output_file="$OUTPUT_DIR/merged_${batch_num}.root"

# Run hadd to merge the current batch of files
shifter --image=rootproject/root:6.28.04-ubuntu22.04 hadd -f "$output_file" $file_list

# Increment the batch counter
batch_num=$((batch_num + 1))
done

rm $input_dir/$Process.root
shifter --image=rootproject/root:6.28.04-ubuntu22.04 hadd ${input_dir}/${Process}.root "$OUTPUT_DIR/merged_*.root"

# This script will merge the csv files generated by the FullDataGenerator into a single file.
shifter --image=nersc/fair_universe:1298f0a8 python3 $working_dir/genHEPdata/scripts/Data_merger.py --input $input_dir --output $merged_dir/$Process -p

shifter --image=rootproject/root:6.28.04-ubuntu22.04 python $working_dir/genHEPdata/scripts/process_counter.py $input_dir $merged_dir $Process

}
done
# Merge the merged files into a single file
echo
echo "Merging the merged files into a single file"
echo
echo "Final touches"

shifter --image=nersc/fair_universe:1298f0a8 python3 Final_touches.py \
--input $merged_dir \
--output $output_dir \
--input-format "parquet" \
--output-format "parquet" \
--derived-quantities \
--test-factor 10 \
--public-train-factor 34 \
--public-test-factor 5
# Run Final_touches.py as a single srun task inside the Shifter image.
# merged_dir is passed as the input location and output_dir as the output
# location, both declared as parquet; the split factors and the luminosity
# come from the variables set at the top of this script.
# Every expansion is quoted so that a path or value containing whitespace or
# glob characters reaches the Python script as exactly one argument.
srun -n 1 -c 256 shifter python3 "${WorkDir}/scripts/Final_touches.py" \
  --input "${merged_dir}" \
  --output "${output_dir}" \
  --input-format "parquet" \
  --output-format "parquet" \
  --test-factor "${test_factor}" \
  --public-train-factor "${public_train_factor}" \
  --public-test-factor "${public_test_factor}" \
  --luminocity "${luminocity}"
4 changes: 2 additions & 2 deletions scripts/run_batch.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
#SBATCH --image=rootproject/root:6.28.04-ubuntu22.04
#SBATCH --account=m4287
#SBATCH --account=dasrepo
#SBATCH --qos=regular
#SBATCH -N 1
#SBATCH --constraint=cpu
Expand All @@ -14,7 +14,7 @@ process=$2
label=$3
export WorkDir=/global/cfs/cdirs/m4287/hep/genHEPdata

srun -n 64 -c 4 shifter bash -c "
# Launch the generator tasks (2 CPUs each) inside the Shifter image. Each task
# derives its own seed from main_seed and its SLURM_PROCID.
# The seed stride must equal the number of tasks: with 128 tasks and the former
# stride of 64, main_seed=k and main_seed=k+1 produced overlapping seed ranges
# (k*64 .. k*64+127 vs. (k+1)*64 .. (k+1)*64+127), i.e. repeated seeds across
# submissions. One variable now drives both the task count and the stride.
# NOTE(review): seeds for main_seed >= 1 differ from those of earlier runs that
# used the stride of 64 - keep that in mind when combining with older samples.
n_tasks=128
srun -n "${n_tasks}" -c 2 shifter bash -c "
seed=\$((${main_seed}*${n_tasks} + \$SLURM_PROCID))
$WorkDir/scripts/run_generator.sh \${seed} ${process} ${label}
"
44 changes: 44 additions & 0 deletions scripts/run_merger.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@

# run_merger.sh - merge the per-job outputs of one physics process.
#
# Usage: run_merger.sh <Process> <input_dir> <merged_dir>
#   Process     process label; names the final <input_dir>/<Process>.root
#   input_dir   directory containing file_name_list.txt, the list of ROOT
#               files to merge
#   merged_dir  base directory; Data_merger.py is given <merged_dir>/<Process>
#
# Steps:
#   1. hadd the listed ROOT files in batches of BATCH_SIZE into
#      <input_dir>/merged_files/merged_<n>.root
#   2. hadd the batch files produced by this run into <input_dir>/<Process>.root
#   3. run Data_merger.py and process_counter.py
#
# Exits with status 1 when an argument is missing, the file list is unreadable
# or empty, or a hadd step fails.
#
# The script uses bash arrays and process substitution but carries no shebang
# line, so hand over to bash when started by a different shell.
[ -n "${BASH_VERSION:-}" ] || exec bash "$0" "$@"

# Print a message on stderr and abort.
die() {
  printf 'run_merger.sh: %s\n' "$*" >&2
  exit 1
}

Process=${1:?usage: run_merger.sh <Process> <input_dir> <merged_dir>}
input_dir=${2:?usage: run_merger.sh <Process> <input_dir> <merged_dir>}
merged_dir=${3:?usage: run_merger.sh <Process> <input_dir> <merged_dir>}
working_dir=/global/cfs/cdirs/m4287/hep

# Output directory for the per-batch merged files
OUTPUT_DIR="$input_dir/merged_files"
mkdir -p -- "$OUTPUT_DIR" || die "cannot create $OUTPUT_DIR"

# Number of files to merge per batch
BATCH_SIZE=1000

# Initialize the batch counter
batch_num=1

# List of ROOT files to merge (whitespace-separated tokens, as split by xargs)
list_file="$input_dir/file_name_list.txt"
[[ -r "$list_file" ]] || die "cannot read file list $list_file"

# Batch files written by THIS run. Only these go into the final merge, so
# leftovers from an earlier run with more batches are never counted twice.
batch_outputs=()

# xargs groups the list into lines of at most BATCH_SIZE names. The loop reads
# from a process substitution (not a pipe) so batch_outputs survives the loop,
# and each line is split into an array so names are not glob-expanded.
while read -r -a batch_files; do
  # An empty list makes xargs emit one empty line; nothing to merge then.
  (( ${#batch_files[@]} > 0 )) || continue

  # Create the output file name for the current batch
  output_file="$OUTPUT_DIR/merged_${batch_num}.root"

  # Run hadd to merge the current batch of files
  hadd -f "$output_file" "${batch_files[@]}" \
    || die "hadd failed for batch $batch_num ($output_file)"

  batch_outputs+=("$output_file")

  # Increment the batch counter
  batch_num=$((batch_num + 1))
done < <(xargs -n "$BATCH_SIZE" < "$list_file")

(( ${#batch_outputs[@]} > 0 )) || die "no input files listed in $list_file"

# Drop the result of a previous run (hadd without -f refuses to overwrite);
# -f keeps the first run, where the file does not exist yet, quiet.
rm -f -- "$input_dir/$Process.root"

hadd "$input_dir/$Process.root" "${batch_outputs[@]}" \
  || die "hadd failed for $input_dir/$Process.root"

# This script will merge the csv files generated by the FullDataGenerator into a single file.

python3 "$working_dir/genHEPdata/scripts/Data_merger.py" --input "$input_dir" --output "$merged_dir/$Process" -p

python3 "$working_dir/genHEPdata/scripts/process_counter.py" "$input_dir" "$merged_dir" "$Process"