Skip to content

Commit

Permalink
Merge pull request #170 from alan-turing-institute/use-rsync-to-filter-years
Browse files Browse the repository at this point in the history

Update `run-pipeline-iteratively.sh` to create local dir structure with subset of data
  • Loading branch information
andrewphilipsmith authored Oct 11, 2024
2 parents 9b21737 + 941c8a5 commit 49a2903
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 12 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -294,3 +294,5 @@ objects.json
python/tests/test_auth.csv
python/debiasing/python-cmethods
python/clim_recal/debiasing/python-cmethods
/bash/example_files
/bash/working
31 changes: 31 additions & 0 deletions bash/combine-iterative-runs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash

# Merge the per-run output directories produced by iterative pipeline runs
# into a single combined directory tree.
#
# Run this script from the root of the `group_run_*` directory, e.g.
# /datadrive/clim-recal-results/group_run_2024-09-26-15-11
#
# Manually add in 1981 data from
# /datadrive/clim-recal-results/group_run_2024-09-30-16-04/run_24-09-30_16-07
#
# Manually add in 1982 data from
# /datadrive/clim-recal-results/group_run_2024-10-07-12-31/run_24-10-07_12-37


combined_dir=./combined_output

# combined_dir=/mnt/vmfileshare/ClimateData/processed_2024_09_26/combined_output

mkdir -p "$combined_dir"

# Iterate over every per-run output directory. Using `find -print0` with a
# null-delimited `read` loop is robust to whitespace in directory names,
# unlike word-splitting the plain output of `find`.
find . -type d -name "run_*" -print0 | while IFS= read -r -d '' output_dir; do

    # The trailing slash on "$output_dir" is required! It makes rsync copy
    # the directory's *contents* into $combined_dir rather than nesting the
    # run directory itself. --ignore-existing keeps the first copy of any
    # file that appears in more than one run directory.
    rsync \
        --recursive \
        --verbose \
        --ignore-existing \
        "$output_dir/" \
        "$combined_dir"
done
42 changes: 42 additions & 0 deletions bash/remove-extra-cropfiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os
import pathlib
import re
import sys

# Delete (or, by default, just count) pipeline output files that do not match
# the expected resample/crop file naming scheme.
#
# Usage:
#   python remove-extra-cropfiles.py ROOT_DIR [--I-am-really-sure-I-want-to-delete-lots-of-files]
#
# Without the confirmation flag the script runs in dry-run mode and only
# reports how many files *would* be deleted.

# Root directory to scan, taken from the first command-line argument.
root_dir = sys.argv[1]
root_dir = pathlib.Path(root_dir).resolve()
print(f"root_dir={root_dir}")

# Only delete when the explicit (deliberately scary) flag is supplied.
do_delete = False

try:
    print(f"second_arg={sys.argv[2]}")
    if sys.argv[2] == "--I-am-really-sure-I-want-to-delete-lots-of-files":
        do_delete = True
except IndexError:
    pass

# There is certainly a better, more pythonic way to do this, but this regex
# was already created and tested to work with ripgrep before finding that
# ripgrep was too hard to install within the docker image.
# It matches the three families of files that should be KEPT:
#   1. resample outputs: .../resample/{cpm,hads}/*.nc
#   2. HADS crops:       .../crops/hads/<region>/<var>/crop_<region>_<var>_hads_<dates>.nc
#   3. CPM crops:        .../crops/cpm/<region>/<var>/<ensemble>/crop_<region>_<var>_cpm_<ensemble>_<dates>.nc
re_str = r"(\/resample\/(cpm|hads)\/.+\.nc|\/crops\/hads\/(?P<h_region>(Scotland|Glasgow|Manchester|London))\/(?P<h_var>(tasmin|tasmax|rainfall))\/crop_(?P=h_region)_(?P=h_var)_hads_\d{8}-\d{8}\.nc|\/crops\/cpm\/(?P<c_region>(Scotland|Glasgow|Manchester|London))\/(?P<c_var>(tasmin|tasmax|pr))\/(?P<emsb>(01|05|06|07|08))\/crop_(?P=c_region)_(?P=c_var)_cpm_(?P=emsb)_\d{8}-\d{8}\.nc)"

find_valid_files = re.compile(re_str)

i_kept = 0
i_deleted = 0

# os.walk (rather than pathlib.Path.walk, which requires Python >= 3.12)
# keeps the script runnable on older interpreters.
for dirpath, _dirnames, filenames in os.walk(root_dir):
    for name in filenames:
        full_name = (pathlib.Path(dirpath) / name).resolve()

        if not find_valid_files.search(str(full_name)):
            # Does not match any expected pattern: a deletion candidate.
            i_deleted += 1
            if do_delete:
                full_name.unlink()
        else:
            i_kept += 1

print(f"found {i_deleted} files could be deleted and {i_kept} that should be kept")
78 changes: 69 additions & 9 deletions bash/run-pipeline-iteratively.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,87 @@
set -e
set -x

# Start-index goes from 1
max_index=500
script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
echo "Script dir: $script_dir"


# Input and output paths
hads_input_path="/datadrive/HadsUKgrid/"
cpm_input_path="/datadrive/UKCP2.2/"
output_path="/datadrive/clim-recal-results/group_run_`date +%F-%H-%M`"

# Other values used in local development
# hads_input_path="/Volumes/vmfileshare/ClimateData/Raw/HadsUKgrid"
# hads_input_path="$script_dir/example_files/HadsUKgrid"
# cpm_input_path="/Volumes/vmfileshare/ClimateData/Raw/UKCP2.2"
# cpm_input_path="$script_dir/example_files/UKCP2.2"
# output_path="$script_dir/clim-recal-results/group_run_`date +%F-%H-%M`"

log_path="$output_path/logs"

# Temporary directories which will hold one year of data at a time
hads_working_dir="$output_path/working/HadsUKgrid"
cpm_working_dir="$output_path/working/UKCP2.2"


mkdir -p $hads_working_dir
mkdir -p $cpm_working_dir
mkdir -p $log_path

for i in $(seq 0 $max_index); do
echo "Running for index={$i}"

cpm_start_year=1982
cpm_end_year=1982
# cpm_end_year=1982

# First and last year that we have CPM data for
for year in $(seq $cpm_start_year $cpm_end_year); do
echo "Running for year={$year}"

# Including `1201` in the filter, guarantees that we only match on the
# start year for each file, not the end year.
cpm_filter="*_${year}1201-*.nc"
# cpm_filter="*_198?1201-*.nc"

# Copy the relevant CPM files into the working directory
# These options:
# 1. Maintain the directory structure
# 2. Include only the files that match the current year filter
# 3. Exclude all other files
rsync \
--include="$cpm_filter" \
--filter='-! */' \
--recursive \
--delete-excluded \
$cpm_input_path \
$cpm_working_dir

# Copy the HADS files into the working directory
hads_filter="*_${year}??01-*.nc"

rsync \
--include="$hads_filter" \
--filter='-! */' \
--recursive \
--delete-excluded \
$hads_input_path \
$hads_working_dir

{
clim-recal \
--start-index $i \
--total-from-index 1 \
--hads-input-path $hads_input_path \
--cpm-input-path $cpm_input_path \
--hads-input-path $hads_working_dir \
--cpm-input-path $cpm_working_dir \
--output-path $output_path \
--all-variables \
--all-regions \
--run 01 \
--run 05 \
--run 06 \
--run 07 \
--run 08 \
--execute
} 2>&1 | tee $log_path/log_$i.txt
} 2>&1 | tee $log_path/log_$year.txt

# Delete extraneous crop files
find $output_path -type d -name 'run_*' | xargs -I {} python $script_dir/remove-extra-cropfiles.py {} --I-am-really-sure-I-want-to-delete-lots-of-files

done
1 change: 1 addition & 0 deletions compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ services:
- JUPYTER_ENABLE_LAB=yes
volumes:
- .:/home/jovyan:rw
- /datadrive:/datadrive

docs:
build:
Expand Down
7 changes: 4 additions & 3 deletions python/clim_recal/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,10 @@

CFCalendarSTANDARD: Final[str] = "standard"

RESAMPLING_OUTPUT_PATH: Final[PathLike] = (
CLIMATE_DATA_MOUNT_PATH / "CPM-365/andys-two-gdal-step-approach/resample"
)
# RESAMPLING_OUTPUT_PATH: Final[PathLike] = (
# CLIMATE_DATA_MOUNT_PATH / "CPM-365/andys-two-gdal-step-approach/resample"
# )
RESAMPLING_OUTPUT_PATH: Final[PathLike] = "/datadrive/clim-recal-results/cropped"
RAW_HADS_PATH: Final[PathLike] = CLIMATE_DATA_MOUNT_PATH / "Raw/HadsUKgrid"
RAW_CPM_PATH: Final[PathLike] = CLIMATE_DATA_MOUNT_PATH / "Raw/UKCP2.2"
RAW_HADS_TASMAX_PATH: Final[PathLike] = RAW_HADS_PATH / "tasmax/day"
Expand Down

0 comments on commit 49a2903

Please sign in to comment.