From 37e7666046aa92ce51f3d568f425f37039e4735d Mon Sep 17 00:00:00 2001 From: Andy Smith Date: Thu, 19 Sep 2024 14:29:17 +0100 Subject: [PATCH 1/9] Update script to for testing --- .gitignore | 2 + bash/run-pipeline-iteratively.sh | 72 +++++++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 480baef6..f9e27c63 100644 --- a/.gitignore +++ b/.gitignore @@ -294,3 +294,5 @@ objects.json python/tests/test_auth.csv python/debiasing/python-cmethods python/clim_recal/debiasing/python-cmethods +/bash/example_files +/bash/working diff --git a/bash/run-pipeline-iteratively.sh b/bash/run-pipeline-iteratively.sh index 65999a1e..e2a66dd7 100755 --- a/bash/run-pipeline-iteratively.sh +++ b/bash/run-pipeline-iteratively.sh @@ -2,27 +2,77 @@ set -e set -x -# Start-index goes from 1 -max_index=500 +script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +echo "Script dir: $script_dir" + # Input and output paths hads_input_path="/datadrive/HadsUKgrid/" cpm_input_path="/datadrive/UKCP2.2/" output_path="/datadrive/clim-recal-results/group_run_`date +%F-%H-%M`" + +# Other values used in local development +# hads_input_path="/Volumes/vmfileshare/ClimateData/Raw/HadsUKgrid" +hads_input_path="$script_dir/example_files/HadsUKgrid" +# cpm_input_path="/Volumes/vmfileshare/ClimateData/Raw/UKCP2.2" +cpm_input_path="$script_dir/example_files/UKCP2.2" +output_path="$script_dir/clim-recal-results/group_run_`date +%F-%H-%M`" + log_path="$output_path/logs" +# Temporary directories which will hold one year of data at a time +hads_working_dir="$script_dir/working/HadsUKgrid" +cpm_working_dir="$script_dir/working/UKCP2.2" + + +mkdir -p $hads_working_dir +mkdir -p $cpm_working_dir mkdir -p $log_path -for i in $(seq 0 $max_index); do - echo "Running for index={$i}" + +cpm_start_year=1980 +# cpm_end_year=2079 +cpm_end_year=1982 + +# First and last year that we have CPM data for +for year in $(seq $cpm_start_year 
$cpm_end_year); do + echo "Running for year={$year}" + + # Including `1201` in the filter, guarantees that we only match on the + # start year for each file, not the end year. + cpm_filter="*_${year}1201-*.nc" + # cpm_filter="*_198?1201-*.nc" + + # Copy the relevant CPM files into the working directory + # These options: + # 1. Maintain the directory structure + # 2. Include only the files that match the current year filter + # 3. Exclude all other files + rsync \ + --include="$cpm_filter" \ + --filter='-! */' \ + --recursive \ + --delete-excluded \ + $cpm_input_path \ + $cpm_working_dir + + # Copy the HADS files into the working directory + hads_filter="*_${year}??01-*.nc" + + rsync \ + --include="$hads_filter" \ + --filter='-! */' \ + --recursive \ + --delete-excluded \ + $hads_input_path \ + $hads_working_dir + { - clim-recal \ - --start-index $i \ - --total-from-index 1 \ - --hads-input-path $hads_input_path \ - --cpm-input-path $cpm_input_path \ + echo "clim-recal \ + --hads-input-path $hads_working_dir \ + --cpm-input-path $cpm_working_dir \ --output-path $output_path \ - --execute - } 2>&1 | tee $log_path/log_$i.txt + --execute" + } 2>&1 | tee $log_path/log_$year.txt done From 16da6d35f0e631d4fc7a1adf166de5bdee86d3f1 Mon Sep 17 00:00:00 2001 From: Andy Smith <5346065+andrewphilipsmith@users.noreply.github.com> Date: Fri, 20 Sep 2024 15:56:15 +0000 Subject: [PATCH 2/9] temp fix to hardcoded crop outfile path --- bash/run-pipeline-iteratively.sh | 14 +++++++------- compose.yml | 1 + python/clim_recal/resample.py | 7 ++++--- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/bash/run-pipeline-iteratively.sh b/bash/run-pipeline-iteratively.sh index e2a66dd7..64bcaaa9 100755 --- a/bash/run-pipeline-iteratively.sh +++ b/bash/run-pipeline-iteratively.sh @@ -13,10 +13,10 @@ output_path="/datadrive/clim-recal-results/group_run_`date +%F-%H-%M`" # Other values used in local development # hads_input_path="/Volumes/vmfileshare/ClimateData/Raw/HadsUKgrid" 
-hads_input_path="$script_dir/example_files/HadsUKgrid" +# hads_input_path="$script_dir/example_files/HadsUKgrid" # cpm_input_path="/Volumes/vmfileshare/ClimateData/Raw/UKCP2.2" -cpm_input_path="$script_dir/example_files/UKCP2.2" -output_path="$script_dir/clim-recal-results/group_run_`date +%F-%H-%M`" +# cpm_input_path="$script_dir/example_files/UKCP2.2" +# output_path="$script_dir/clim-recal-results/group_run_`date +%F-%H-%M`" log_path="$output_path/logs" @@ -31,8 +31,8 @@ mkdir -p $log_path cpm_start_year=1980 -# cpm_end_year=2079 -cpm_end_year=1982 +cpm_end_year=2079 +# cpm_end_year=1982 # First and last year that we have CPM data for for year in $(seq $cpm_start_year $cpm_end_year); do @@ -68,11 +68,11 @@ for year in $(seq $cpm_start_year $cpm_end_year); do $hads_working_dir { - echo "clim-recal \ + clim-recal \ --hads-input-path $hads_working_dir \ --cpm-input-path $cpm_working_dir \ --output-path $output_path \ - --execute" + --execute } 2>&1 | tee $log_path/log_$year.txt done diff --git a/compose.yml b/compose.yml index 71c13e82..00b12b2b 100644 --- a/compose.yml +++ b/compose.yml @@ -15,6 +15,7 @@ services: - JUPYTER_ENABLE_LAB=yes volumes: - .:/home/jovyan:rw + - /datadrive:/datadrive docs: build: diff --git a/python/clim_recal/resample.py b/python/clim_recal/resample.py index 6dd15105..4618cb21 100644 --- a/python/clim_recal/resample.py +++ b/python/clim_recal/resample.py @@ -60,9 +60,10 @@ CFCalendarSTANDARD: Final[str] = "standard" -RESAMPLING_OUTPUT_PATH: Final[PathLike] = ( - CLIMATE_DATA_MOUNT_PATH / "CPM-365/andys-two-gdal-step-approach/resample" -) +# RESAMPLING_OUTPUT_PATH: Final[PathLike] = ( +# CLIMATE_DATA_MOUNT_PATH / "CPM-365/andys-two-gdal-step-approach/resample" +#) +RESAMPLING_OUTPUT_PATH: Final[PathLike] = "/datadrive/clim-recal-results/cropped" RAW_HADS_PATH: Final[PathLike] = CLIMATE_DATA_MOUNT_PATH / "Raw/HadsUKgrid" RAW_CPM_PATH: Final[PathLike] = CLIMATE_DATA_MOUNT_PATH / "Raw/UKCP2.2" RAW_HADS_TASMAX_PATH: Final[PathLike] = 
RAW_HADS_PATH / "tasmax/day" From 51e384bbf0b6bca23ab45b462f402f3b76fa67b4 Mon Sep 17 00:00:00 2001 From: Andy Smith <5346065+andrewphilipsmith@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:17:46 +0000 Subject: [PATCH 3/9] add CPM model and ensemble params to script --- bash/run-pipeline-iteratively.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bash/run-pipeline-iteratively.sh b/bash/run-pipeline-iteratively.sh index 64bcaaa9..4c7cd201 100755 --- a/bash/run-pipeline-iteratively.sh +++ b/bash/run-pipeline-iteratively.sh @@ -72,6 +72,13 @@ for year in $(seq $cpm_start_year $cpm_end_year); do --hads-input-path $hads_working_dir \ --cpm-input-path $cpm_working_dir \ --output-path $output_path \ + --all-variables \ + --all-regions \ + --run 01 \ + --run 05 \ + --run 06 \ + --run 07 \ + --run 08 \ --execute } 2>&1 | tee $log_path/log_$year.txt From 470ef93e1776083447d606d9937d4e31619a9e9e Mon Sep 17 00:00:00 2001 From: Andy Smith <5346065+andrewphilipsmith@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:18:15 +0000 Subject: [PATCH 4/9] add script to combine output dirs --- bash/combine-iterative-runs.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100755 bash/combine-iterative-runs.sh diff --git a/bash/combine-iterative-runs.sh b/bash/combine-iterative-runs.sh new file mode 100755 index 00000000..3711a71b --- /dev/null +++ b/bash/combine-iterative-runs.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Run this script from the root of the `group_run_*` directory + + +combined_dir=./combined_output +mkdir -p $combined_dir + +# Get all output directories +output_dirs=`find . -type d -name "run_*"` + +for output_dir in $output_dirs; do + + # The trailling slash on the `$output_dir` is required! 
+ rsync \ + --recursive \ + --verbose \ + $output_dir/ \ + $combined_dir +done From b9137e752ace2577e252adee6d785f060364ae5f Mon Sep 17 00:00:00 2001 From: Andy Smith <5346065+andrewphilipsmith@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:21:15 +0000 Subject: [PATCH 5/9] minor tweaks --- bash/combine-iterative-runs.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/bash/combine-iterative-runs.sh b/bash/combine-iterative-runs.sh index 3711a71b..64dbccb5 100755 --- a/bash/combine-iterative-runs.sh +++ b/bash/combine-iterative-runs.sh @@ -2,7 +2,6 @@ # Run this script from the root of the `group_run_*` directory - combined_dir=./combined_output mkdir -p $combined_dir From d32f72d0a720a68ee07f74bd0a65277e872b21d2 Mon Sep 17 00:00:00 2001 From: Andy Smith <5346065+andrewphilipsmith@users.noreply.github.com> Date: Fri, 4 Oct 2024 11:00:53 +0000 Subject: [PATCH 6/9] add cmd to remove extraneous crop files --- bash/combine-iterative-runs.sh | 8 ++++++ bash/remove-extra-cropfiles.py | 42 ++++++++++++++++++++++++++++++++ bash/run-pipeline-iteratively.sh | 9 ++++--- 3 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 bash/remove-extra-cropfiles.py diff --git a/bash/combine-iterative-runs.sh b/bash/combine-iterative-runs.sh index 64dbccb5..f20ceed8 100755 --- a/bash/combine-iterative-runs.sh +++ b/bash/combine-iterative-runs.sh @@ -2,7 +2,15 @@ # Run this script from the root of the `group_run_*` directory +# /datadrive/clim-recal-results/group_run_2024-09-26-15-11 +# Manaually add in 1981 data from +# /datadrive/clim-recal-results/group_run_2024-09-30-16-04/run_24-09-30_16-07 + + combined_dir=./combined_output + +# combined_dir=/mnt/vmfileshare/ClimateData/processed_2024_09_26/combined_output + mkdir -p $combined_dir # Get all output directories diff --git a/bash/remove-extra-cropfiles.py b/bash/remove-extra-cropfiles.py new file mode 100644 index 00000000..5cb17614 --- /dev/null +++ b/bash/remove-extra-cropfiles.py @@ -0,0 +1,42 @@ +import sys 
+import pathlib +import re + +# get input dir +root_dir = sys.argv[1] +root_dir = pathlib.Path(root_dir).resolve() +print(f"root_dir={root_dir}") + +do_delete = False + +try: + print(f"secound_arg={sys.argv[2]}") + if sys.argv[2] == "--I-am-really-sure-I-want-to-delete-lots-of-files": + do_delete = True +except IndexError: + pass + +# There is certaining a better, more pythonic way to do this +# But I had already created and tested this regex, to work with ripgrep, before +# finding that it was too hard to install repgrep within the docker image +re_str = r"(\/resample\/(cpm|hads)\/.+\.nc|\/crops\/hads\/(?P<h_region>(Scotland|Glasgow|Manchester|London))\/(?P<h_var>(tasmin|tasmax|rainfall))\/crop_(?P=h_region)_(?P=h_var)_hads_\d{8}-\d{8}\.nc|\/crops\/cpm\/(?P<c_region>(Scotland|Glasgow|Manchester|London))\/(?P<c_var>(tasmin|tasmax|pr))\/(?P<emsb>(01|05|06|07|08))\/crop_(?P=c_region)_(?P=c_var)_cpm_(?P=emsb)_\d{8}-\d{8}\.nc)" + +find_valid_files = re.compile(re_str) + +i_kept = 0 +i_deleted = 0 + +for root, dirs, files in root_dir.walk(top_down=True): + for name in files: + full_name = (root / name).resolve() + + if not find_valid_files.search(str(full_name)): + # print(f"delete={full_name}") + i_deleted += 1 + if do_delete: + full_name.unlink() + else: + # print(f"keep={full_name}") + i_kept += 1 + +print(f"found {i_deleted} files could be deleted and {i_kept} that should be kept") diff --git a/bash/run-pipeline-iteratively.sh b/bash/run-pipeline-iteratively.sh index 4c7cd201..22eb772a 100755 --- a/bash/run-pipeline-iteratively.sh +++ b/bash/run-pipeline-iteratively.sh @@ -21,8 +21,8 @@ output_path="/datadrive/clim-recal-results/group_run_`date +%F-%H-%M`" log_path="$output_path/logs" # Temporary directories which will hold one year of data at a time -hads_working_dir="$script_dir/working/HadsUKgrid" -cpm_working_dir="$script_dir/working/UKCP2.2" +hads_working_dir="$output_path/working/HadsUKgrid" +cpm_working_dir="$output_path/working/UKCP2.2" mkdir -p $hads_working_dir @@ -31,7 +31,7 @@ mkdir -p 
$log_path cpm_start_year=1980 -cpm_end_year=2079 +cpm_end_year=1980 # cpm_end_year=1982 # First and last year that we have CPM data for @@ -82,4 +82,7 @@ for year in $(seq $cpm_start_year $cpm_end_year); do --execute } 2>&1 | tee $log_path/log_$year.txt + # Delete extraneous crop files + find $output_path -type d -name 'run_*' | xargs -I {} python $script_dir/remove-extra-cropfiles.py {} --I-am-really-sure-I-want-to-delete-lots-of-files + done From d4a74380b65d497ce653d1af386fac26a97965ee Mon Sep 17 00:00:00 2001 From: Andy Smith <5346065+andrewphilipsmith@users.noreply.github.com> Date: Mon, 7 Oct 2024 13:25:59 +0000 Subject: [PATCH 7/9] minor tweaks to re-run 1982 data --- bash/combine-iterative-runs.sh | 4 ++++ bash/run-pipeline-iteratively.sh | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/bash/combine-iterative-runs.sh b/bash/combine-iterative-runs.sh index f20ceed8..c1af4a9a 100755 --- a/bash/combine-iterative-runs.sh +++ b/bash/combine-iterative-runs.sh @@ -6,6 +6,9 @@ # Manaually add in 1981 data from # /datadrive/clim-recal-results/group_run_2024-09-30-16-04/run_24-09-30_16-07 +# Manaually add in 1981 data from +# /datadrive/clim-recal-results/group_run_2024-09-30-16-04/run_24-09-30_16-07 + combined_dir=./combined_output @@ -22,6 +25,7 @@ for output_dir in $output_dirs; do rsync \ --recursive \ --verbose \ + --ignore-existing \ $output_dir/ \ $combined_dir done diff --git a/bash/run-pipeline-iteratively.sh b/bash/run-pipeline-iteratively.sh index 22eb772a..a18ae12e 100755 --- a/bash/run-pipeline-iteratively.sh +++ b/bash/run-pipeline-iteratively.sh @@ -30,8 +30,8 @@ mkdir -p $cpm_working_dir mkdir -p $log_path -cpm_start_year=1980 -cpm_end_year=1980 +cpm_start_year=1982 +cpm_end_year=1982 # cpm_end_year=1982 # First and last year that we have CPM data for From 5fdc2223671d211315a131307fcd9e3bd531364b Mon Sep 17 00:00:00 2001 From: Andy Smith <5346065+andrewphilipsmith@users.noreply.github.com> Date: Mon, 7 Oct 2024 13:26:58 +0000 
Subject: [PATCH 8/9] minor tweaks to re-run 1982 data --- bash/combine-iterative-runs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bash/combine-iterative-runs.sh b/bash/combine-iterative-runs.sh index c1af4a9a..4e859784 100755 --- a/bash/combine-iterative-runs.sh +++ b/bash/combine-iterative-runs.sh @@ -6,8 +6,8 @@ # Manaually add in 1981 data from # /datadrive/clim-recal-results/group_run_2024-09-30-16-04/run_24-09-30_16-07 -# Manaually add in 1981 data from -# /datadrive/clim-recal-results/group_run_2024-09-30-16-04/run_24-09-30_16-07 +# Manaually add in 1982 data from +# /datadrive/clim-recal-results/group_run_2024-10-07-12-31/run_24-10-07_12-37 combined_dir=./combined_output From 941c8a5cfa453619e4e5f79c2d28497165347f86 Mon Sep 17 00:00:00 2001 From: Andy Smith Date: Thu, 10 Oct 2024 18:22:17 +0100 Subject: [PATCH 9/9] minor linting fixes --- bash/combine-iterative-runs.sh | 4 ++-- bash/remove-extra-cropfiles.py | 2 +- bash/run-pipeline-iteratively.sh | 2 +- python/clim_recal/resample.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bash/combine-iterative-runs.sh b/bash/combine-iterative-runs.sh index 4e859784..fcd39748 100755 --- a/bash/combine-iterative-runs.sh +++ b/bash/combine-iterative-runs.sh @@ -3,10 +3,10 @@ # Run this script from the root of the `group_run_*` directory # /datadrive/clim-recal-results/group_run_2024-09-26-15-11 -# Manaually add in 1981 data from +# Manaually add in 1981 data from # /datadrive/clim-recal-results/group_run_2024-09-30-16-04/run_24-09-30_16-07 -# Manaually add in 1982 data from +# Manaually add in 1982 data from # /datadrive/clim-recal-results/group_run_2024-10-07-12-31/run_24-10-07_12-37 diff --git a/bash/remove-extra-cropfiles.py b/bash/remove-extra-cropfiles.py index 5cb17614..ec34b50f 100644 --- a/bash/remove-extra-cropfiles.py +++ b/bash/remove-extra-cropfiles.py @@ -1,6 +1,6 @@ -import sys import pathlib import re +import sys # get input dir root_dir = sys.argv[1] 
diff --git a/bash/run-pipeline-iteratively.sh b/bash/run-pipeline-iteratively.sh index a18ae12e..72523435 100755 --- a/bash/run-pipeline-iteratively.sh +++ b/bash/run-pipeline-iteratively.sh @@ -38,7 +38,7 @@ cpm_end_year=1982 for year in $(seq $cpm_start_year $cpm_end_year); do echo "Running for year={$year}" - # Including `1201` in the filter, guarantees that we only match on the + # Including `1201` in the filter, guarantees that we only match on the # start year for each file, not the end year. cpm_filter="*_${year}1201-*.nc" # cpm_filter="*_198?1201-*.nc" diff --git a/python/clim_recal/resample.py b/python/clim_recal/resample.py index 4618cb21..99878f0e 100644 --- a/python/clim_recal/resample.py +++ b/python/clim_recal/resample.py @@ -62,7 +62,7 @@ # RESAMPLING_OUTPUT_PATH: Final[PathLike] = ( # CLIMATE_DATA_MOUNT_PATH / "CPM-365/andys-two-gdal-step-approach/resample" -#) +# ) RESAMPLING_OUTPUT_PATH: Final[PathLike] = "/datadrive/clim-recal-results/cropped" RAW_HADS_PATH: Final[PathLike] = CLIMATE_DATA_MOUNT_PATH / "Raw/HadsUKgrid" RAW_CPM_PATH: Final[PathLike] = CLIMATE_DATA_MOUNT_PATH / "Raw/UKCP2.2"