Skip to content

Commit

Permalink
Merge pull request #170 from alan-turing-institute/use-rsync-to-filter-years
Browse files Browse the repository at this point in the history

Update `run-pipeline-iteratively.sh` to create local dir structure with subset of data
  • Loading branch information
andrewphilipsmith authored Oct 11, 2024
2 parents 9b21737 + 941c8a5 commit 49a2903
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 12 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -294,3 +294,5 @@ objects.json
python/tests/test_auth.csv
python/debiasing/python-cmethods
python/clim_recal/debiasing/python-cmethods
/bash/example_files
/bash/working
31 changes: 31 additions & 0 deletions bash/combine-iterative-runs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash

# Merge the per-run output directories produced by iterative pipeline runs
# into a single combined directory tree.
#
# Run this script from the root of the `group_run_*` directory, e.g.
# /datadrive/clim-recal-results/group_run_2024-09-26-15-11
#
# Manually add in 1981 data from
# /datadrive/clim-recal-results/group_run_2024-09-30-16-04/run_24-09-30_16-07
#
# Manually add in 1982 data from
# /datadrive/clim-recal-results/group_run_2024-10-07-12-31/run_24-10-07_12-37


combined_dir=./combined_output

# combined_dir=/mnt/vmfileshare/ClimateData/processed_2024_09_26/combined_output

mkdir -p "$combined_dir"

# Iterate over every per-run output directory. Using `find -print0` with a
# null-delimited `read` loop is robust to whitespace in directory names,
# unlike word-splitting the plain output of `find`.
find . -type d -name "run_*" -print0 | while IFS= read -r -d '' output_dir; do

    # The trailing slash on "$output_dir" is required! It makes rsync copy
    # the directory's *contents* into $combined_dir rather than nesting the
    # run directory itself. --ignore-existing keeps the first copy of any
    # file that appears in more than one run directory.
    rsync \
        --recursive \
        --verbose \
        --ignore-existing \
        "$output_dir/" \
        "$combined_dir"
done
42 changes: 42 additions & 0 deletions bash/remove-extra-cropfiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os
import pathlib
import re
import sys

# Delete (or, by default, just count) pipeline output files that do not match
# the expected resample/crop file naming scheme.
#
# Usage:
#   python remove-extra-cropfiles.py ROOT_DIR [--I-am-really-sure-I-want-to-delete-lots-of-files]
#
# Without the confirmation flag the script runs in dry-run mode and only
# reports how many files *would* be deleted.

# Root directory to scan, taken from the first command-line argument.
root_dir = sys.argv[1]
root_dir = pathlib.Path(root_dir).resolve()
print(f"root_dir={root_dir}")

# Only delete when the explicit (deliberately scary) flag is supplied.
do_delete = False

try:
    print(f"second_arg={sys.argv[2]}")
    if sys.argv[2] == "--I-am-really-sure-I-want-to-delete-lots-of-files":
        do_delete = True
except IndexError:
    pass

# There is certainly a better, more pythonic way to do this, but this regex
# was already created and tested to work with ripgrep before finding that
# ripgrep was too hard to install within the docker image.
# It matches the three families of files that should be KEPT:
#   1. resample outputs: .../resample/{cpm,hads}/*.nc
#   2. HADS crops:       .../crops/hads/<region>/<var>/crop_<region>_<var>_hads_<dates>.nc
#   3. CPM crops:        .../crops/cpm/<region>/<var>/<ensemble>/crop_<region>_<var>_cpm_<ensemble>_<dates>.nc
re_str = r"(\/resample\/(cpm|hads)\/.+\.nc|\/crops\/hads\/(?P<h_region>(Scotland|Glasgow|Manchester|London))\/(?P<h_var>(tasmin|tasmax|rainfall))\/crop_(?P=h_region)_(?P=h_var)_hads_\d{8}-\d{8}\.nc|\/crops\/cpm\/(?P<c_region>(Scotland|Glasgow|Manchester|London))\/(?P<c_var>(tasmin|tasmax|pr))\/(?P<emsb>(01|05|06|07|08))\/crop_(?P=c_region)_(?P=c_var)_cpm_(?P=emsb)_\d{8}-\d{8}\.nc)"

find_valid_files = re.compile(re_str)

i_kept = 0
i_deleted = 0

# os.walk (rather than pathlib.Path.walk, which requires Python >= 3.12)
# keeps the script runnable on older interpreters.
for dirpath, _dirnames, filenames in os.walk(root_dir):
    for name in filenames:
        full_name = (pathlib.Path(dirpath) / name).resolve()

        if not find_valid_files.search(str(full_name)):
            # Does not match any expected pattern: a deletion candidate.
            i_deleted += 1
            if do_delete:
                full_name.unlink()
        else:
            i_kept += 1

print(f"found {i_deleted} files could be deleted and {i_kept} that should be kept")
78 changes: 69 additions & 9 deletions bash/run-pipeline-iteratively.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,87 @@
set -e
set -x

# Start-index goes from 1
max_index=500
script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
echo "Script dir: $script_dir"


# Input and output paths
hads_input_path="/datadrive/HadsUKgrid/"
cpm_input_path="/datadrive/UKCP2.2/"
output_path="/datadrive/clim-recal-results/group_run_`date +%F-%H-%M`"

# Other values used in local development
# hads_input_path="/Volumes/vmfileshare/ClimateData/Raw/HadsUKgrid"
# hads_input_path="$script_dir/example_files/HadsUKgrid"
# cpm_input_path="/Volumes/vmfileshare/ClimateData/Raw/UKCP2.2"
# cpm_input_path="$script_dir/example_files/UKCP2.2"
# output_path="$script_dir/clim-recal-results/group_run_`date +%F-%H-%M`"

log_path="$output_path/logs"

# Temporary directories which will hold one year of data at a time
hads_working_dir="$output_path/working/HadsUKgrid"
cpm_working_dir="$output_path/working/UKCP2.2"


mkdir -p $hads_working_dir
mkdir -p $cpm_working_dir
mkdir -p $log_path

for i in $(seq 0 $max_index); do
echo "Running for index={$i}"

cpm_start_year=1982
cpm_end_year=1982
# cpm_end_year=1982

# First and last year that we have CPM data for
for year in $(seq $cpm_start_year $cpm_end_year); do
echo "Running for year={$year}"

# Including `1201` in the filter, guarantees that we only match on the
# start year for each file, not the end year.
cpm_filter="*_${year}1201-*.nc"
# cpm_filter="*_198?1201-*.nc"

# Copy the relevant CPM files into the working directory
# These options:
# 1. Maintain the directory structure
# 2. Include only the files that match the current year filter
# 3. Exclude all other files
rsync \
--include="$cpm_filter" \
--filter='-! */' \
--recursive \
--delete-excluded \
$cpm_input_path \
$cpm_working_dir

# Copy the HADS files into the working directory
hads_filter="*_${year}??01-*.nc"

rsync \
--include="$hads_filter" \
--filter='-! */' \
--recursive \
--delete-excluded \
$hads_input_path \
$hads_working_dir

{
clim-recal \
--start-index $i \
--total-from-index 1 \
--hads-input-path $hads_input_path \
--cpm-input-path $cpm_input_path \
--hads-input-path $hads_working_dir \
--cpm-input-path $cpm_working_dir \
--output-path $output_path \
--all-variables \
--all-regions \
--run 01 \
--run 05 \
--run 06 \
--run 07 \
--run 08 \
--execute
} 2>&1 | tee $log_path/log_$i.txt
} 2>&1 | tee $log_path/log_$year.txt

# Delete extraneous crop files
find $output_path -type d -name 'run_*' | xargs -I {} python $script_dir/remove-extra-cropfiles.py {} --I-am-really-sure-I-want-to-delete-lots-of-files

done
1 change: 1 addition & 0 deletions compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ services:
- JUPYTER_ENABLE_LAB=yes
volumes:
- .:/home/jovyan:rw
- /datadrive:/datadrive

docs:
build:
Expand Down
7 changes: 4 additions & 3 deletions python/clim_recal/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,10 @@

CFCalendarSTANDARD: Final[str] = "standard"

RESAMPLING_OUTPUT_PATH: Final[PathLike] = (
CLIMATE_DATA_MOUNT_PATH / "CPM-365/andys-two-gdal-step-approach/resample"
)
# RESAMPLING_OUTPUT_PATH: Final[PathLike] = (
# CLIMATE_DATA_MOUNT_PATH / "CPM-365/andys-two-gdal-step-approach/resample"
# )
RESAMPLING_OUTPUT_PATH: Final[PathLike] = "/datadrive/clim-recal-results/cropped"
RAW_HADS_PATH: Final[PathLike] = CLIMATE_DATA_MOUNT_PATH / "Raw/HadsUKgrid"
RAW_CPM_PATH: Final[PathLike] = CLIMATE_DATA_MOUNT_PATH / "Raw/UKCP2.2"
RAW_HADS_TASMAX_PATH: Final[PathLike] = RAW_HADS_PATH / "tasmax/day"
Expand Down

0 comments on commit 49a2903

Please sign in to comment.