diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 2c11020e..dce7d149 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -23,6 +23,11 @@ jobs:
       uses: actions/setup-python@v3
       with:
         python-version: ${{ matrix.python-version }}
+    - name: Install geos  # required for cartopy installation
+      uses: ConorMacBride/install-package@v1
+      with:
+        brew: geos
+        apt: libgeos-dev
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 34acdc24..71a29293 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -16,12 +16,20 @@ Please read [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md) before contributing.
 
 ### Set-up
 
-| Set-up expectations | |
+Set-up expectations:
+
+|  | |
 | --- | --- |
 | Env manager: | conda |
 | Os: | macOS |
 | Package manager: | pip |
 
+Install Dependencies:
+
+0. There are a few dependencies that need to be installed in advance. See the [dependencies section](#markdown-header-dependencies) for specific instructions and then return here once they have been installed.
+
+Package set-up and installation:
+
 1. Setup a new conda env: `conda create -n r5py python=3.9.13`
 2. Activate the environment: `conda activate r5py`
 3. Launch terminal and change directory to wherever you keep your GitHub repos: `cd ~/Documents`
@@ -31,25 +39,35 @@ Please read [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md) before contributing.
 7. Update pip: `pip install --upgrade pip`
 8. Install r5py & other reqs: `pip install -r requirements.txt`
 
-**Java Development Kit**
-Openjdk 11 is recommended by r5py docs. [sdkman](https://sdkman.io/) is used here.
+Set-up check:
 
-9. Get sdkman: `curl -s "https://get.sdkman.io" | bash`
-10. Follow instructions in terminal, you’ll be asked to: `source “<SOME_PATH>.sdkman/bin/sdkman-init.sh"`
-This will add sdkman env variables to: `~/.bash_profile` and possibly `~/.zshrc` too.
-11. To see versions available on your os: `sdk list java`
-12. Install the required version using the `identifier` column, eg: `sdk install java 11.0.19-amzn`
-13. Check this is the currenty used java version: `sdk current java`
+10. Run set-up pytests: `pytest --runsetup`.
+11. If everything is working as expected, you should see some Java flavoured warnings about `--illegal-access` that you can ignore. But importantly look out for the message: `r5py has created the expected database files.`
+12. If you've made it this far, you've earned yourself a coffee.
 
-> Note: for macOS on ARM architectures (M1/M2 machines), sdkman does not currently provide a suitable openjdk11 version. Instead you can follow this [blog post, which introduces Java version mangaement using `jEnv`](https://blog.bigoodyssey.com/how-to-manage-multiple-java-version-in-macos-e5421345f6d0), and install AdoptOpenJDK-11. You can then continue with the remainder of the steps below.
+#### Dependencies
 
-**Set-up check**
+##### Geos
 
-14. Now you should be able to run the check setup script, from terminal: `python3 src/utils/check-setup.py`
-15. If everything is working as expected, you should see some Java flavoured warnings about `--illegal-access` that you can ignore. But importantly look out for the message: `r5py has created the expected database files.`
-16. Run set-up pytests: `pytest --runsetup`.
-17. If you've made it this far, you've earned yourself a coffee.
+`geos` is a dependency of the python package `cartopy`, which is used here for building and visualising static maps. See [this cartopy installation guidance note](https://github.com/SciTools/cartopy/blob/main/INSTALL) for more details on the installation process.
 
+For macOS, this is straight forward using `brew`:
+
+```console
+brew install geos
+```
+
+##### Java Development Kit
+Java is required for handling the transport network routing. Openjdk 11 is recommended by r5py docs. [sdkman](https://sdkman.io/) is used here.
+
+> Note: for macOS on ARM architectures (M1/M2 machines), sdkman does not currently provide a suitable openjdk11 version. Instead you can follow this [blog post, which introduces Java version mangaement using `jEnv`](https://blog.bigoodyssey.com/how-to-manage-multiple-java-version-in-macos-e5421345f6d0), and install AdoptOpenJDK-11.
+
+1. Get sdkman: `curl -s "https://get.sdkman.io" | bash`
+2. Follow instructions in terminal, you’ll be asked to: `source “<SOME_PATH>.sdkman/bin/sdkman-init.sh"`
+This will add sdkman env variables to: `~/.bash_profile` and possibly `~/.zshrc` too.
+3. To see versions available on your os: `sdk list java`
+4. Install the required version using the `identifier` column, eg: `sdk install java 11.0.19-amzn`
+5. Check this is the currenty used java version: `sdk current java`.
 
 ### Pre-commit
 
diff --git a/notebooks/population/grid_pop_experiment.py b/notebooks/population/grid_pop_experiment.py
new file mode 100644
index 00000000..f680fa12
--- /dev/null
+++ b/notebooks/population/grid_pop_experiment.py
@@ -0,0 +1,1237 @@
+# %%
+"""[EXPERIMENTAL] This notebook aims to explore gridded population data.
+
+It uses GHSL - Global Human Settlement Layer - for the population input.
+Specifically, GHS-POP-R2023A for the year 2020 and 100x100m population grids
+[1]. Coverage, in this case, is most of the UK [2]. The data is cropped to a
+region of interest (Newport) and then resampled to 200x200m grids.
+
+It requires the dataset [2] to be unzipped, with the .tif file sorted within
+the data/external/population/ directory, named GHSL_2020_UK.tif.
+
+Additionally, an experiment is performed by merging together multiple GHSL-POP
+rasters to cover UK and France [2-5]. These should be stored in the same
+directory as above, but without any filename changes.
+
+A similar experiment is performed using 2015 GHSL-POP data [6-9].
+
+References
+----------
+- [1] Schiavina M., Freire S., Carioli A., MacManus K. (2023):
+GHS-POP R2023A - GHS population grid multitemporal (1975-2030).European Commiss
+ion, Joint Research Centre (JRC)
+PID: http://data.europa.eu/89h/2ff68a52-5b5b-4a22-8f40-c41da8332cfe, doi:10.290
+5/2FF68A52-5B5B-4A22-8F40-C41DA8332CFE
+- [2] https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_POP_GLOBE_R2023
+A/GHS_POP_E2020_GLOBE_R2023A_54009_100/V1-0/tiles/GHS_POP_E2020_GLOBE_R2023A_54
+009_100_V1_0_R3_C18.zip
+- [3] https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_POP_GLOBE_R2023
+A/GHS_POP_E2020_GLOBE_R2023A_54009_100/V1-0/tiles/GHS_POP_E2020_GLOBE_R2023A_54
+009_100_V1_0_R3_C19.zip
+- [4] https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_POP_GLOBE_R2023
+A/GHS_POP_E2020_GLOBE_R2023A_54009_100/V1-0/tiles/GHS_POP_E2020_GLOBE_R2023A_54
+009_100_V1_0_R4_C18.zip
+- [5] https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_POP_GLOBE_R2023
+A/GHS_POP_E2020_GLOBE_R2023A_54009_100/V1-0/tiles/GHS_POP_E2020_GLOBE_R2023A_54
+009_100_V1_0_R4_C19.zip
+- [6] https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_POP_GLOBE_R2023
+A/GHS_POP_E2015_GLOBE_R2023A_54009_100/V1-0/tiles/GHS_POP_E2015_GLOBE_R2023A_54
+009_100_V1_0_R3_C18.zip
+- [7] https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_POP_GLOBE_R2023
+A/GHS_POP_E2015_GLOBE_R2023A_54009_100/V1-0/tiles/GHS_POP_E2015_GLOBE_R2023A_54
+009_100_V1_0_R3_C19.zip
+- [8] https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_POP_GLOBE_R2023
+A/GHS_POP_E2015_GLOBE_R2023A_54009_100/V1-0/tiles/GHS_POP_E2015_GLOBE_R2023A_54
+009_100_V1_0_R4_C18.zip
+- [9] https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_POP_GLOBE_R2023
+A/GHS_POP_E2015_GLOBE_R2023A_54009_100/V1-0/tiles/GHS_POP_E2015_GLOBE_R2023A_54
+009_100_V1_0_R4_C19.zip
+
+"""
+
+# %%
+import rasterio as rio
+import os
+import sys
+import logging
+import numpy as np
+import pandas as pd
+import geopandas as gpd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import cartopy.crs as ccrs
+import cartopy.io.img_tiles as cimgt
+import rioxarray
+import xarray as xr
+import requests
+import textwrap
+
+from datetime import datetime
+from pyproj import Transformer
+from pyprojroot import here
+from matplotlib import colormaps
+from matplotlib.colors import LogNorm
+from matplotlib.ticker import AutoMinorLocator
+from rasterio.warp import Resampling
+from geocube.vector import vectorize
+from rioxarray.merge import merge_arrays
+from rioxarray.exceptions import NoDataInBounds
+from requests.exceptions import HTTPError
+from shapely.geometry.polygon import Polygon
+from xarray import DataArray
+
+
+# %%
+def setup_logger(
+    logger_name: str,
+    level: int = logging.INFO,
+    file_name: str = None,
+) -> logging.Logger:
+    """Build a logger instance.
+
+    Parameters
+    ----------
+    logger_name : str
+        name of logger.
+    level : int, optional
+        logger level (e.g., logging.DEBUG, logging.WARNING etc.), by default
+        logging.INFO.
+    file_name : str, optional
+        logger filename, if needed to write logs to file, by default None
+        meaning log messages will not be written to file.
+
+    Returns
+    -------
+    logging.Logger
+        a logger instance with the requested properties.
+
+    """
+    # set up the logger and logging level
+    logger = logging.getLogger(logger_name)
+    logger.setLevel(level)
+
+    # fix the logger format
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+
+    # set up a stream handler
+    sh = logging.StreamHandler(sys.stdout)
+    sh.setFormatter(formatter)
+    logger.handlers.clear()
+    logger.addHandler(sh)
+
+    # set up logger file handler
+    if file_name:
+        fh = logging.FileHandler(file_name)
+        fh.setFormatter(formatter)
+        logger.addHandler(fh)
+    return logger
+
+
+def reproj_bbox(bbox: tuple, crs_from: str, crs_to: str) -> tuple:
+    """Reproject a bounding box from one CRS to another.
+
+    Parameters
+    ----------
+    bbox : tuple
+        Bounding box co-ordinates, in the same CRS are `crs_from`. This is the
+        bounding box to reproject into `crs_to`. Bounding box coordinate order
+        must be (left, bottom, right, top).
+    crs_from : str
+        CRS to convert from.
+    crs_to : str
+        CRS to convert to.
+
+    Returns
+    -------
+    tuple
+        `bbox` returned in CRS set by `crs_to`.
+
+    """
+    # create transformer and reproject bbox of interest into a window
+    transformer = Transformer.from_crs(crs_from, crs_to)
+    window_bl = transformer.transform(bbox[1], bbox[0])
+    window_tr = transformer.transform(bbox[3], bbox[2])
+
+    # return a single tuple in left, bottom, right, top order
+    window = window_bl + window_tr
+
+    return window
+
+
+# %%
+# population source file
+SRC_FILENAME = "GHSL_2020_UK.tif"
+MERGED_SRC_FILENAME = "GHSL_2020_merged.tif"
+MERGED_SRC_FILENAME_2015 = "GHSL_2015_merged.tif"
+SRC_DIR = os.path.join(here(), "data", "external", "population", SRC_FILENAME)
+MERGED_SRC_DIR = os.path.join(
+    here(), "data", "interim", "population", MERGED_SRC_FILENAME
+)
+MERGED_SRC_DIR_2015 = os.path.join(
+    here(), "data", "interim", "population", MERGED_SRC_FILENAME_2015
+)
+
+# logger name, level and set-up
+LOGGER_NAME = "pop-exploration"
+LOGGER_LEVEL = logging.INFO
+logger = setup_logger(LOGGER_NAME, level=LOGGER_LEVEL)
+
+# bounding box around area of interest in EPGS:4326
+# TODO: tidy up this section
+# BBOX = (-3.205023, 51.503789, -2.725744, 51.680820)  # orig BBOX
+# BBOX = (-3.206023, 51.503789, -2.726744, 51.680820)  # orig shifted by 100m
+# BBOX = (-2.210408, 53.450903, -0.861837, 54.093681)  # leeds
+# BBOX = (-1.054688, 51.134555, 0.873413, 51.835778)  # london
+# BBOX = (3.319416, 42.673743, 7.873249, 44.671964)  # marseille
+
+# area of interest
+AREA_OF_INTEREST = "newport"
+
+# get bbox of interest
+BBOX_DICT = {
+    "newport": (-3.206023, 51.503789, -2.726744, 51.680820),
+    "leeds": (-2.212408, 53.450803, -0.862837, 54.095581),
+    "london": (-1.054688, 51.134555, 0.873413, 51.835778),
+    "marseille": (3.319416, 42.673743, 7.873249, 44.671964),
+}
+BBOX = BBOX_DICT[AREA_OF_INTEREST]
+
+# nomis urls for mye populations
+NOMIS_URLS = {
+    "newport": (
+        "https://www.nomisweb.co.uk/api/v01/dataset/NM_2010_1.data.json?geogra"
+        "phy=1254276446...1254276882,1254277923...1254277960&date=latest&gende"
+        "r=0&c_age=200&measures=20100"
+    ),
+    "leeds": (
+        "https://www.nomisweb.co.uk/api/v01/dataset/NM_2010_1.data.json?geogra"
+        "phy=1254151943...1254154269,1254258198...1254258221,1254261711...1254"
+        "261745,1254261853...1254261870,1254261894...1254261918,1254262125...1"
+        "254262142,1254262341...1254262353,1254262394...1254262398,1254262498."
+        "..1254262532,1254262620...1254262658,1254262922...1254262925&date=lat"
+        "est&gender=0&c_age=200&measures=20100"
+    ),
+    "london": (
+        "https://www.nomisweb.co.uk/api/v01/dataset/NM_2010_1.data.json?geogra"
+        "phy=1254096897...1254098287,1254098289...1254098739,1254098741...1254"
+        "099990,1254099992...1254100115,1254100117...1254100340,1254100342...1"
+        "254100465,1254100467,1254100476...1254100544,1254100546...1254100658,"
+        "1254100660...1254102034,1254102068...1254102153,1254102185...12541022"
+        "80,1254102282...1254103792,1254103795,1254103797...1254107351,1254107"
+        "354...1254107963,1254107965...1254108049,1254108051...1254108089,1254"
+        "108091...1254108146,1254108148...1254108290,1254108292...1254108501,1"
+        "254108503...1254108514,1254108516...1254108735,1254108761...125411136"
+        "2,1254111364...1254115627,1254115630...1254116999,1254117001...125412"
+        "0302,1254204377,1254258303...1254258311,1254258313...1254258324,12542"
+        "58365,1254258511...1254258526,1254258573...1254258581,1254259015...12"
+        "54259033,1254259139...1254259180,1254259193...1254259246,1254259398.."
+        ".1254259617,1254259767...1254259799,1254259957...1254259964,125426002"
+        "7...1254260037,1254260245...1254260280,1254260330...1254260332,125426"
+        "0420...1254260439,1254261069...1254261098,1254262366...1254262393,125"
+        "4262935...1254262988,1254263053...1254263088,1254263090...1254263100,"
+        "1254263102...1254263106,1254263114...1254263139,1254263141...12542631"
+        "51,1254263436...1254263494,1254263661...1254263669,1254265357...12542"
+        "65363,1254265589...1254265605,1254265661...1254265680,1254266095...12"
+        "54266099,1254266191...1254266347,1254266360...1254266773,1254267710.."
+        ".1254267747,1254267749...1254267811,1254267813...1254267830,125426794"
+        "9...1254268089&date=latest&gender=0&c_age=200&measures=20100"
+    ),
+    "marseille": None,
+}
+NOMIS_URL = NOMIS_URLS[AREA_OF_INTEREST]
+
+# resammpling scaling factor
+RESAMPLING_SCALE_FACTOR = 2
+
+# minimum population threshold for plotting
+MIN_PLOT_THRESH = 10
+
+# attributions - used during plotting
+POPULATION_ATTR = "GHS-POP 2020 (R2023) "
+BASE_MAP_ATTR = "(C) OpenSteetMap contributors"
+ONS_MYE_ATTR = "ONS, 2020 (via nomisweb)"
+
+# merge file list - in data/external/population/
+MERGE_FILE_LIST = [
+    "GHS_POP_E2020_GLOBE_R2023A_54009_100_V1_0_R3_C18.tif",
+    "GHS_POP_E2020_GLOBE_R2023A_54009_100_V1_0_R3_C19.tif",
+    "GHS_POP_E2020_GLOBE_R2023A_54009_100_V1_0_R4_C18.tif",
+    "GHS_POP_E2020_GLOBE_R2023A_54009_100_V1_0_R4_C19.tif",
+]
+
+# merge file list for 2015 epoch - used to merge 2015 data sources
+MERGE_FILE_LIST_2015 = [
+    "GHS_POP_E2015_GLOBE_R2023A_54009_100_V1_0_R3_C18.tif",
+    "GHS_POP_E2015_GLOBE_R2023A_54009_100_V1_0_R3_C19.tif",
+    "GHS_POP_E2015_GLOBE_R2023A_54009_100_V1_0_R4_C18.tif",
+    "GHS_POP_E2015_GLOBE_R2023A_54009_100_V1_0_R4_C19.tif",
+]
+
+# %%
+# read in source crs to convert bounds of window
+with rio.open(SRC_DIR) as src:
+    window = reproj_bbox(BBOX, "EPSG:4326", src.crs.to_string())
+
+# %%
+# build a geometry representing the area of interest
+geometries = [
+    {
+        "type": "Polygon",
+        "coordinates": [
+            [
+                [window[0], window[1]],
+                [window[2], window[1]],
+                [window[2], window[3]],
+                [window[0], window[3]],
+            ]
+        ],
+    }
+]
+
+# %%
+# open data and clip to the above geometry, using from disk (more performant)
+xds = rioxarray.open_rasterio(SRC_DIR, masked=True).rio.clip(
+    geometries, from_disk=True, all_touched=True
+)
+
+# set the variable name of the data to be population
+xds.name = "population"
+
+# plot data and show resolution
+xds.plot()
+print(xds.rio.resolution())
+
+# %%
+# show a histogram
+xds.plot.hist(bins=10)
+
+# %%
+# resample based on scaling factor and using sum resampling
+xds_resampled = xds.rio.reproject(
+    xds.rio.crs,
+    resolution=tuple(
+        res * RESAMPLING_SCALE_FACTOR for res in xds.rio.resolution()
+    ),
+    resampling=Resampling.sum,
+)
+
+print(xds_resampled.rio.nodata)
+xds_resampled.rio.transform()
+
+# %%
+# use geocube to conver raster to geopandas df
+gdf = vectorize(xds_resampled.squeeze().astype(np.float32))
+
+# %%
+# visualise the results
+gdf[gdf["population"] >= MIN_PLOT_THRESH].explore(
+    "population",
+    tiles="https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
+    attr=(
+        '&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMa'
+        'p</a> contributors &copy; <a href="https://carto.com/attributions">CA'
+        "RTO</a>"
+    ),
+    control_scale=True,
+    zoom_control=True,
+)
+
+# %%
+# get OpenStreetMap tile layer
+map_tile = cimgt.OSM(desired_tile_form="L")
+
+# build plot axis and add map tile
+ax = plt.axes(projection=map_tile.crs)
+ax.figure.set_size_inches(8, 10)
+data_crs = ccrs.Mollweide()
+ax.add_image(map_tile, 11, cmap="gray")
+
+# build a colormap and add pcolormesh plot
+cmap = colormaps.get_cmap("viridis")
+cmap.set_under(alpha=0)
+
+# build a colormap and add pcolormesh plot
+plt_resampled_rst = np.copy(xds_resampled.squeeze())
+plt_resampled_rst[plt_resampled_rst < MIN_PLOT_THRESH] = 0
+ctf = ax.pcolormesh(
+    xds_resampled.squeeze().x.to_numpy(),
+    xds_resampled.squeeze().y.to_numpy(),
+    plt_resampled_rst,
+    cmap=cmap,
+    vmin=MIN_PLOT_THRESH,
+    transform=data_crs,
+)
+
+# add a colorbar - set base to notify not displaying 0 population
+cbar = plt.colorbar(ctf, ax=ax, fraction=0.034, pad=0.04)
+cbar.ax.set_ylabel("Population count per cell", rotation=270, labelpad=20)
+cbar.set_ticks(
+    np.concatenate([np.array([MIN_PLOT_THRESH]), cbar.get_ticks()[1:-1]])
+)
+
+# create an attribution string and add it to the axis
+grid_res = xds_resampled.rio.resolution()
+attribution = f"""
+Generated on: {datetime.strftime(datetime.now(), "%Y-%m-%d")}
+Population data: {POPULATION_ATTR}
+Grid Size: {abs(grid_res[0])}m x {abs(grid_res[1])}m
+Base map: {BASE_MAP_ATTR}"""
+ax.text(
+    0.01,
+    0.01,
+    attribution,
+    transform=ax.transAxes,
+    size=8,
+    wrap=True,
+    fontdict={"name": "Arial", "color": "#000000"},
+    va="bottom",
+    ha="left",
+)
+
+# show plot
+plt.tight_layout()
+plt.show()
+
+# %%
+# get the interim population directory
+INTERIM_DIR = os.path.dirname(SRC_DIR).replace("external", "interim")
+
+# make one if it does not exist
+if not os.path.exists(INTERIM_DIR):
+    os.mkdir(INTERIM_DIR)
+
+# create full filepath for cropped and resampeld tif file
+RESAMPLED_DIR = os.path.join(
+    INTERIM_DIR,
+    os.path.basename(SRC_DIR).replace(".tif", "_xarray_cropped_resampled.tif"),
+)
+
+# write to GeoTIFF raster file
+# note: nodata arg will be reset to original value by rioxarry when writing
+xds_resampled.rio.to_raster(RESAMPLED_DIR)
+
+# %%
+# re-open save file (as a check)
+xds_res = rioxarray.open_rasterio(RESAMPLED_DIR, masked=True)
+
+# set the variable name of the data to be population
+xds_res.name = "population"
+
+# plot data and show resolution
+xds_res.plot()
+
+# %%
+# build merge file directories
+merge_dirs = [
+    os.path.join(here(), "data", "external", "population", name)
+    for name in MERGE_FILE_LIST
+]
+
+# %%
+# build mere dataset list
+arrays = []
+for merge_dir in merge_dirs:
+    arrays.append(rioxarray.open_rasterio(merge_dir, masked=True))
+
+# display bounds for checking
+for array in arrays:
+    logger.info(array.rio.bounds())
+
+# %%
+# merge the datasets together
+xds_merged = merge_arrays(arrays)
+logger.info(xds_merged.rio.bounds())
+
+# %%
+# plot merged data - note: this cell takes around 3.5mins to run
+xds_merged.plot(vmin=MIN_PLOT_THRESH, norm=LogNorm())
+
+# %%
+# get the interim population directory
+INTERIM_POP_DIR = os.path.join(here(), "data", "interim", "population")
+
+# make dir if it does not exist
+if not os.path.exists(INTERIM_POP_DIR):
+    os.mkdir(INTERIM_POP_DIR)
+
+# create full filepath for merged tif file
+MERGED_DIR = os.path.join(INTERIM_POP_DIR, "GHSL_2020_merged.tif")
+
+# write to GeoTIFF raster file
+xds_merged.rio.to_raster(MERGED_DIR)
+
+# %%
+# resample merged data based on scaling factor and using sum resampling
+# note: this cell takes around 1.75 minutes
+xds_merged_resampled = xds_merged.rio.reproject(
+    xds_merged.rio.crs,
+    resolution=tuple(
+        res * RESAMPLING_SCALE_FACTOR for res in xds_merged.rio.resolution()
+    ),
+    resampling=Resampling.sum,
+)
+
+# %%
+# create full filepath for merged and resampeld tif file
+MERGED_RESAMPLED_DIR = os.path.join(
+    INTERIM_POP_DIR, "GHSL_2020_merged_200.tif"
+)
+
+# write to GeoTIFF raster file
+xds_merged_resampled.rio.to_raster(MERGED_RESAMPLED_DIR)
+
+# %%
+# use merged dataset herein
+# open data and clip to the above geometry, using from disk (more performant)
+xds_clip = rioxarray.open_rasterio(MERGED_RESAMPLED_DIR, masked=True).rio.clip(
+    geometries, from_disk=True, all_touched=True
+)
+
+# set the variable name of the data to be population
+xds_clip.name = "population"
+
+# plot data and show resolution
+xds_clip.plot()
+
+# %%
+# use geocube to conver raster to geopandas df
+gdf = vectorize(xds_clip.squeeze().astype(np.float32))
+
+# %%
+# TODO: need to functionise
+# visualise the results
+m = gdf[gdf["population"] >= MIN_PLOT_THRESH].explore(
+    "population",
+    tiles="https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
+    attr=(
+        '&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMa'
+        'p</a> contributors &copy; <a href="https://carto.com/attributions">CA'
+        "RTO</a>"
+    ),
+    control_scale=True,
+    zoom_control=True,
+)
+
+# define and outputs directory
+POP_OUTPUTS_DIR = os.path.join(here(), "outputs", "population")
+
+# make one if it does not exist
+if not os.path.exists(POP_OUTPUTS_DIR):
+    os.mkdir(POP_OUTPUTS_DIR)
+
+# save to file
+m.save(os.path.join(POP_OUTPUTS_DIR, f"{AREA_OF_INTEREST}.html"))
+
+# %%
+# TODO: need to functionise
+# get OpenStreetMap tile layer
+map_tile = cimgt.OSM(desired_tile_form="L")
+
+# build plot axis and add map tile
+ax = plt.axes(projection=map_tile.crs)
+ax.figure.set_size_inches(8, 10)
+data_crs = ccrs.Mollweide()
+ax.add_image(map_tile, 12, cmap="gray")
+
+# build a colormap and add pcolormesh plot
+cmap = colormaps.get_cmap("viridis")
+cmap.set_under(alpha=0)
+
+# build a colormap and add pcolormesh plot
+plt_resampled_rst = np.copy(xds_clip.squeeze())
+plt_resampled_rst[plt_resampled_rst < MIN_PLOT_THRESH] = 0
+ctf = ax.pcolormesh(
+    xds_clip.squeeze().x.to_numpy(),
+    xds_clip.squeeze().y.to_numpy(),
+    plt_resampled_rst,
+    cmap=cmap,
+    vmin=MIN_PLOT_THRESH,
+    transform=data_crs,
+)
+
+# add a colorbar - set base to notify not displaying 0 population
+cbar = plt.colorbar(ctf, ax=ax, fraction=0.034, pad=0.04)
+cbar.ax.set_ylabel("Population count per cell", rotation=270, labelpad=20)
+cbar.set_ticks(
+    np.concatenate([np.array([MIN_PLOT_THRESH]), cbar.get_ticks()[1:-1]])
+)
+
+# create an attribution string and add it to the axis
+grid_res = xds_clip.rio.resolution()
+attribution = f"""
+Generated on: {datetime.strftime(datetime.now(), "%Y-%m-%d")}
+Population data: {POPULATION_ATTR}
+Grid Size: {abs(grid_res[0])}m x {abs(grid_res[1])}m
+Base map: {BASE_MAP_ATTR}"""
+ax.text(
+    0.01,
+    0.01,
+    attribution,
+    transform=ax.transAxes,
+    size=8,
+    wrap=True,
+    fontdict={"name": "Arial", "color": "#000000"},
+    va="bottom",
+    ha="left",
+)
+
+# show plot
+plt.tight_layout()
+plt.show()
+
+# %%
+# check equivalence of orignal method
+# note ignoring last row/col since BBOX isn't always divisible by 200m when
+# reading in 100x100m grid to resample it effectively (undersums that row/col)
+assert xds_clip.rio.transform() == xds_resampled.rio.transform()
+assert xds_clip.rio.bounds() == xds_resampled.rio.bounds()
+assert xds_clip.shape == xds_resampled.shape
+assert np.array_equal(
+    xds_resampled[:, :-1, :-1].to_numpy(),
+    xds_clip[:, :-1, :-1].to_numpy(),
+    equal_nan=True,
+)
+
+# %%
+# sense check values against mid-year estimates
+# following cells can be run independently after loading in the window for the
+# area of interest
+
+# %%
+# open data and clip to the above geometry, using from disk (more performant)
+xds_sc = rioxarray.open_rasterio(MERGED_SRC_DIR, masked=True).rio.clip(
+    geometries, from_disk=True, all_touched=True
+)
+
+# set the variable name of the data to be population
+xds_sc.name = "population"
+
+# plot data and show resolution
+xds_sc.plot()
+
+# %%
+
+
+# read in nomis population data for area of interest
+def get_nomis_myes(nomis_url: str, obs_col_name: str = "mye") -> pd.DataFrame:
+    """Get mid-year population estimates from nomis.
+
+    Parameters
+    ----------
+    nomis_url : str
+        url to nomis end point for the data
+    obs_col_name : str, optional
+        name of mye column, by default "mye"
+
+    Returns
+    -------
+    pd.DataFrame
+        nomis myes as a pandas dataframe
+
+    """
+    try:
+        response = requests.get(nomis_url)
+        response.raise_for_status()
+    except HTTPError as he:
+        logger.error(f"A HTTP error occured: {he}")
+    except Exception as e:
+        logger.error(f"An error occured when reading nomis data {e}")
+
+    pops = []
+    for obs in response.json()["obs"]:
+        pops.append([obs["geography"]["geogcode"], obs["obs_value"]["value"]])
+
+    pop_df = pd.DataFrame(pops, columns=["OA11CD", obs_col_name])
+
+    return pop_df
+
+
+# get 2020 mid-year estimates
+logger.info("Retrieving MYEs 2020...")
+pop_df = get_nomis_myes(NOMIS_URL, "mye")
+
+# get 2011 census estimates
+logger.info("Retrieving 2011 census estimates...")
+pop_df = pop_df.merge(
+    get_nomis_myes(NOMIS_URL.replace("date=latest", "date=2011"), "census"),
+    on="OA11CD",
+)
+
+# %%
+# read in ONS geoportal boundaries
+active_response = True
+result_offset = 0
+oa_bounds = []
+
+# repeat calls untill all the boundaries are retrieved - 2000 per batch
+while active_response:
+    try:
+        response = requests.get(
+            "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/service"
+            "s/Output_Areas_Dec_2011_Boundaries_EW_BFC_2022/FeatureServer/0/qu"
+            "ery?where=1%3D1&outFields=OA11CD&geometry="
+            f"{BBOX[0]}%2C{BBOX[1]}%2C{BBOX[2]}%2C{BBOX[3]}&geometryType=esriG"
+            "eometryEnvelope&inSR=4326&spatialRel=esriSpatialRelContains&outSR"
+            f"=4326&f=geojson&resultOffset={result_offset}"
+        )
+        response.raise_for_status()
+    except HTTPError as he:
+        logger.error(f"A HTTP error occured: {he}")
+    except Exception as e:
+        logger.error(f"An error occured when reading nomis data {e}")
+
+    if response.json()["features"] != []:
+        oa_bounds.append(
+            gpd.GeoDataFrame.from_features(response.json(), crs="EPSG:4326")
+        )
+        result_offset += 2000
+    else:
+        active_response = False
+
+# %%
+# connvert response to geodataframe
+oa_bounds_gdf = pd.concat(oa_bounds)
+
+# match the crs with the raster data
+oa_bounds_gdf = oa_bounds_gdf.to_crs(src.crs.to_string())
+oa_bounds_gdf.plot()
+
+# %%
+# merege on population data and filter to OAs only in pop data
+oa_gdf = oa_bounds_gdf.merge(pop_df, on="OA11CD", how="right")
+assert len(pop_df) == len(oa_gdf)
+
+oa_gdf.plot("mye", legend=True)
+
+# %%
+
+
+def clip_sum(geometry: Polygon, rst: DataArray) -> float:
+    """Clip rastered data to geometry and estimate population.
+
+    Parameters
+    ----------
+    geometry : Polygon
+        Polygon representing the bounds to clip to. Must be in the same CRS are
+        the raster data.
+    rst : DataArray
+        Raster data array
+
+    Returns
+    -------
+    float
+        Sum values within clip, excluding nan values.
+
+    """
+    try:
+        rst_clipped = rst.rio.clip([geometry]).to_numpy()
+    except NoDataInBounds:
+        return 0
+    return np.nansum(rst_clipped)
+
+
+def clip_sum_overlap(geometry: Polygon, rst: DataArray) -> float:
+    """Clip raster data to geometry and estimate population.
+
+    Method includes all cells that touch the provided geometry. Then scales
+    the cell value based on the fraction of overlap between the cell and the
+    geometry. The total is then then combined sum of these fractional cell
+    values.
+
+    Parameters
+    ----------
+    geometry : Polygon
+        Polygon representing the bounds to clip to. Must be in the same CRS are
+        the raster data.
+    rst : DataArray
+        Raster data array
+
+    Returns
+    -------
+    float
+        Sum values within clip, excluding nan values, and adjusting for
+        fractional overlap of cells/geometry.
+
+    """
+    # clip to geometry and include all cells that touch
+    try:
+        rst_clipped = rst.rio.clip([geometry], all_touched=True)
+    except NoDataInBounds:
+        return 0
+
+    # vectorise and drop masked/null cells (nodata regions)
+    try:
+        clipped_gdf = vectorize(rst_clipped.squeeze().astype(np.float32))
+    except ValueError:
+        # TODO: use this approach, should work for all conditions
+        clipped_gdf = vectorize(rst_clipped.squeeze(axis=0).astype(np.float32))
+    clipped_gdf.dropna(subset=["population"], inplace=True)
+
+    # calculate the fractional overlap within the OA
+    clipped_gdf["overlap"] = (
+        clipped_gdf.intersection(geometry).area / clipped_gdf.area
+    )
+
+    # calculate the OA estimate by scaling the cell population by the
+    # fractional overlap
+    oa_estimate = (clipped_gdf["overlap"] * clipped_gdf["population"]).sum()
+
+    return oa_estimate
+
+
+# %%
+# apply the clip_sum_overlap func to all OAs and calculate the difference
+oa_gdf["ghs"] = oa_gdf.geometry.apply(clip_sum_overlap, args=[xds_sc])
+oa_gdf["pop_diff"] = oa_gdf["mye"] - oa_gdf["ghs"]
+
+# %%
+# repeat above, but exclude cells below a threshold
+threshs = [1, 3, 5, 7, 10]
+for thresh in threshs:
+    logger.info(f"Calculating GHS-POP estimate using {thresh}...")
+    xds_thresh = xds_sc.copy()
+    xds_thresh = xds_thresh.where(xds_thresh > thresh)
+    oa_gdf[f"ghs_thresh_{thresh}"] = oa_gdf.geometry.apply(
+        clip_sum_overlap, args=[xds_thresh]
+    )
+    oa_gdf[f"pop_diff_thresh_{thresh}"] = (
+        oa_gdf["mye"] - oa_gdf[f"ghs_thresh_{thresh}"]
+    )
+
+# %%
+# build an interactive folium map displaying OA pop estimate difference
+m = oa_gdf.explore(
+    "pop_diff",
+    legend=True,
+    cmap="RdBu",
+    vmin=-600,
+    vmax=600,
+    tiles="https://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}{r}.png",
+    attr=(
+        '&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMa'
+        'p</a> contributors &copy; <a href="https://carto.com/attributions">CA'
+        "RTO</a>"
+    ),
+    legend_kwds={"backgroundcolor": "white"},
+)
+
+# create dir if it doesn't already exist
+plot_out_path = os.path.join(here(), "outputs", "mye")
+if not os.path.exists(plot_out_path):
+    os.mkdir(plot_out_path)
+
+m.save(os.path.join(plot_out_path, f"{AREA_OF_INTEREST}.html"))
+del m
+
+# %%
+# build a beeswarn to display distribution of OA differences
+fig, ax = plt.subplots(figsize=(16, 9))
+
+# for visualisation purposes, do not display extreme outlies
+plot_lim = 1000
+plot_col = "pop_diff"
+plot_df = oa_gdf[
+    (oa_gdf[plot_col] < plot_lim) & ((oa_gdf[plot_col] > -plot_lim))
+]
+
+# add a colored region to show 10-90 percentile range
+perc_low, perc_high = np.percentile(oa_gdf[plot_col], [5, 95])
+p_range = perc_high - perc_low
+ax.axvspan(
+    perc_low,
+    perc_high,
+    color="gold",
+    alpha=0.5,
+    label=(
+        f"Percentile Range (10-90):\n{p_range:.0f} [{perc_low:.0f} to "
+        f"{perc_high:.0f}]"
+    ),
+)
+
+# add a colored region to show IQR
+iqr_low, iqr_high = np.percentile(oa_gdf[plot_col], [25, 75])
+iqr = iqr_high - iqr_low
+ax.axvspan(
+    iqr_low,
+    iqr_high,
+    color="darkorange",
+    alpha=0.5,
+    label=f"IQR: {iqr:.0f} [{iqr_low:.0f} to {iqr_high:.0f}]",
+)
+
+# add a line to show the median difference
+median = oa_gdf[plot_col].median()
+ax.axvline(median, color="red", linestyle="--", label=f"Median: {median:.0f}")
+
+# add beswarm plot - note it can take 10/20s to plot
+# TODO: automatically asign size
+sns.swarmplot(plot_df, x=plot_col, size=3, ax=ax, label="OA Pop. Difference")
+
+# fix the scales to the nearest 1000
+limit = np.abs(ax.get_xlim()).max()
+limit = round(limit / 1e3) * 1e3
+ax.set_xticks(np.arange(-limit, limit + 200, 200))
+
+# add minor grids and adjust formatting of y axis
+ax.xaxis.grid()
+minor_locator = AutoMinorLocator(2)
+ax.xaxis.set_minor_locator(minor_locator)
+ax.xaxis.grid(which="minor", linestyle="--", alpha=0.5)
+ax.set_yticks([])
+
+# add an indicator to show the 'side' of the greater estimate
+indicator = ax.text(
+    0,
+    -0.45,
+    "Greater  Estimate\nGHS-POP ◀---       ---▶ ONS-MYE",
+    va="center",
+    ha="center",
+)
+indicator.set_bbox(dict(facecolor="white", edgecolor="black", alpha=0.9))
+
+# create an attribution string and add it to the axis
+grid_res = xds_sc.rio.resolution()
+attribution = f"""
+Generated on: {datetime.strftime(datetime.now(), "%Y-%m-%d")}
+Gridded pop. estimates: {POPULATION_ATTR}
+Grid Size: {abs(grid_res[0])}m x {abs(grid_res[1])}m
+Mid year pop. estimates: {ONS_MYE_ATTR}"""
+ax.text(
+    0.01,
+    0.01,
+    attribution,
+    transform=ax.transAxes,
+    size=8,
+    wrap=True,
+    fontdict={"name": "Arial", "color": "#000000"},
+    va="bottom",
+    ha="left",
+)
+
+# rename x axis
+ax.set_xlabel(
+    "Difference between ONS MYE and GHS-POP Output Area Population Estimate"
+)
+
+# reorder legend
+handles, labels = ax.get_legend_handles_labels()
+ax.legend(handles[::-1], labels[::-1])
+
+# if extereme outlier exist (positive) add text to show values
+if oa_gdf[oa_gdf[plot_col] >= plot_lim][plot_col].any():
+    pos_fliers = oa_gdf[oa_gdf[plot_col] >= plot_lim][plot_col].to_list()
+    pos_fliers = [f"{x:.0f}" for x in sorted(pos_fliers)]
+    pos_fliers_text = ", ".join(pos_fliers)
+    pos_indicator = ax.text(
+        0.98,
+        0.65,
+        f"Not displayed ---▶\n{textwrap.fill(pos_fliers_text, 24)}",
+        va="center",
+        ha="right",
+        transform=ax.transAxes,
+    )
+    pos_indicator.set_bbox(
+        dict(facecolor="white", edgecolor="white", alpha=0.5)
+    )
+
+# if extereme outlier exist (negative) add text to show values
+if oa_gdf[oa_gdf[plot_col] <= -plot_lim][plot_col].any():
+    neg_fliers = oa_gdf[oa_gdf[plot_col] <= -plot_lim][plot_col].to_list()
+    neg_fliers = [f"{x:.0f}" for x in sorted(neg_fliers)]
+    neg_fliers_text = ", ".join(neg_fliers)
+    neg_indicator = ax.text(
+        0.02,
+        0.65,
+        f"◀--- Not displayed\n{neg_fliers_text}",
+        va="center",
+        ha="left",
+        transform=ax.transAxes,
+    )
+    neg_indicator.set_bbox(
+        dict(facecolor="white", edgecolor="white", alpha=0.5)
+    )
+
+plt.show()
+
+# %%
+# build boxplots for displaying effect of pop threshold - melt data first
+pop_diff_cols = [col for col in oa_gdf.columns if col.startswith("pop_diff")]
+plot_df = oa_gdf[["OA11CD"] + pop_diff_cols].melt(
+    id_vars="OA11CD",
+    value_vars=pop_diff_cols,
+    var_name="Method",
+    value_name="Difference",
+)
+
+fig, ax = plt.subplots(figsize=(16, 9))
+sns.boxplot(plot_df, x="Difference", y="Method", ax=ax, orient="h")
+
+ax.set_xlabel(
+    "Difference between ONS MYE and GHS-POP Output Area Population Estimate"
+)
+plt.show()
+
+# %%
+# build choropleth maps for MYE and GHS estimates
+fig, axes = plt.subplots(1, 2, figsize=(16, 9))
+
+# get consitent min and max to ensure consistent colorscales
+vmin = min(oa_gdf["mye"].min(), oa_gdf["ghs"].min())
+vmax = max(oa_gdf["mye"].max(), oa_gdf["ghs"].max())
+
+# plot both estimates
+for ax, col in zip(axes.flat, ["mye", "ghs"]):
+    oa_gdf.to_crs("EPSG:4326").plot(
+        col,
+        ax=ax,
+        vmin=vmin,
+        vmax=vmax,
+    )
+    ax.axis("off")
+    ax.set_title(col.upper())
+
+# make a common colorbar at the bottom of the figure
+fig = ax.get_figure()
+cax = fig.add_axes([0.2, 0.2, 0.6, 0.03])
+sm = plt.cm.ScalarMappable(
+    cmap="viridis", norm=plt.Normalize(vmin=vmin, vmax=vmax)
+)
+cbar = fig.colorbar(sm, cax=cax, orientation="horizontal")
+cbar.ax.set_xlabel("OA Population")
+
+plt.show()
+
+# %%
+# build a choropleth map to display the difference
+fig, ax = plt.subplots(figsize=(16, 9))
+
+cbar = oa_gdf.to_crs("EPSG:4326").plot(
+    "pop_diff",
+    ax=ax,
+    cmap="RdBu",
+    vmin=-600,
+    vmax=600,
+    legend=True,
+    legend_kwds={
+        "label": "Difference (MYE-GHS)",
+    },
+)
+
+# set the background to grey since the mid point of the colorscale is white
+ax.set_facecolor("grey")
+
+# hide all the tick marks and labels
+ax.xaxis.set_tick_params(labelbottom=False)
+ax.yaxis.set_tick_params(labelleft=False)
+ax.set_xticks([])
+ax.set_yticks([])
+
+# add a note to show the colorscale is saturated
+note = (
+    "Note: Color scale is saturated to +/- 600 to improve interpretability"
+    "in OAs where the difference is small. Differences may lie outside these"
+    "limits"
+)
+ax.text(
+    0.01,
+    0.01,
+    note,
+    transform=ax.transAxes,
+    size=8,
+    wrap=True,
+    fontdict={"name": "Arial", "color": "#000000"},
+    va="bottom",
+    ha="left",
+)
+
+plt.show()
+# %%
+# open data and clip to the above geometry (2015 data)
+xds_sc_2015 = rioxarray.open_rasterio(
+    MERGED_SRC_DIR_2015, masked=True
+).rio.clip(geometries, from_disk=True, all_touched=True)
+
+# set the variable name of the data to be population
+xds_sc_2015.name = "population"
+
+# plot data and show resolution
+xds_sc_2015.plot()
+
+# %%
+# apply the clip_sum_overlap func to all OAs and calculate the difference
+oa_gdf["ghs_2015"] = oa_gdf.geometry.apply(
+    clip_sum_overlap, args=[xds_sc_2015]
+)
+
+# calulcate the diff with census
+oa_gdf["census_diff_2020"] = oa_gdf["census"] - oa_gdf["ghs"]
+oa_gdf["census_diff_2015"] = oa_gdf["census"] - oa_gdf["ghs_2015"]
+
+# %%
+# melt the df
+melt_cols = ["OA11CD", "pop_diff", "census_diff_2020", "census_diff_2015"]
+plot_df = oa_gdf[melt_cols].melt(
+    id_vars="OA11CD",
+    value_vars=["pop_diff", "census_diff_2020", "census_diff_2015"],
+    var_name="Estimate",
+    value_name="Difference",
+)
+
+fig, ax = plt.subplots(figsize=(16, 9))
+sns.boxplot(plot_df, x="Difference", y="Estimate", ax=ax, orient="h")
+
+ax.set_xlabel(
+    "Difference between ONS and GHS-POP Output Area Population Estimates"
+)
+
+ax.set_yticklabels(
+    [
+        "MYE 2020 vs GHS-POP 2020",
+        "Census 2011 vs GHS-POP 2020",
+        "Census 2011 vs GHS-POP 2015",
+    ]
+)
+ax.grid(axis="x")
+
+plt.show()
+
+# %%
+# build choropleth maps for MYE and census differences estimates
+fig, axes = plt.subplots(1, 3, figsize=(16, 9), facecolor=(0, 0, 0))
+
+# get consitent min and max to ensure consistent colorscales
+vmin = -600
+vmax = 600
+
+# build a title map
+title_map = {
+    "pop_diff": "MYE 2020 vs GHS-POP 2020",
+    "census_diff_2020": "Census 2011 vs GHS-POP 2020",
+    "census_diff_2015": "Census 2011 vs GHS-POP 2015",
+}
+
+# plot both estimates
+for ax, col in zip(axes.flat, title_map.keys()):
+    oa_gdf.to_crs("EPSG:4326").plot(
+        col, ax=ax, vmin=vmin, vmax=vmax, cmap="RdBu"
+    )
+    ax.axis("off")
+    ax.set_title(title_map[col], color="white")
+
+# make a common colorbar at the bottom of the figure
+fig = ax.get_figure()
+cax = fig.add_axes([0.2, 0.2, 0.6, 0.03])
+sm = plt.cm.ScalarMappable(
+    cmap="RdBu", norm=plt.Normalize(vmin=vmin, vmax=vmax)
+)
+cbar = fig.colorbar(sm, cax=cax, orientation="horizontal")
+cbar.ax.set_xlabel("OA Population Difference", color="White")
+cbar.ax.xaxis.set_tick_params(color="white")
+plt.setp(plt.getp(cbar.ax.axes, "xticklabels"), color="white")
+
+plt.tight_layout()
+plt.show()
+
+# %%
+# small snippet of code for developing unit tests
+
+
+def np_to_rioxarry(
+    arr: np.ndarray,
+    aff: rio.Affine,
+    as_type: str = "float32",
+    no_data: int = -200,
+    crs: str = "ESRI:54009",
+) -> xr.DataArray:
+    """Convert numpy array to rioxarry.
+
+    This function is only used within pytest, as a convinent way to build
+    fixtures without duplicating code.
+
+    Parameters
+    ----------
+    arr : np.ndarray
+        Input numpy array
+    aff : rio.Affine
+        Affine transform, for input data
+    as_type : str, optional
+        Data type, by default "int16"
+    no_data : int, optional
+        Value to use for no data, by default -200
+    crs : _type_, optional
+        Coordinate Reference system for input data, by default "ESRI:54009"
+
+    Returns
+    -------
+    xr.DataArray
+        Input numpy array as an xarray.DataArray with the correct
+
+    """
+    height = arr.shape[0]
+    width = arr.shape[1]
+    cols = np.arange(width)
+    rows = np.arange(height)
+
+    xs, ys = rio.transform.xy(aff, rows, cols)
+
+    x_array = (
+        xr.DataArray(arr.T, dims=["x", "y"], coords=dict(x=xs, y=ys))
+        .transpose("y", "x")
+        .astype(as_type)
+        .rio.write_nodata(no_data)
+        .rio.write_transform(aff)
+        .rio.set_crs(crs)
+    )
+
+    x_array.name = "var"
+
+    return x_array
+
+
+# %%
+# build a static, top left df
+array_1 = np.array(
+    [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+)
+transform_1 = rio.Affine(100, 0, -225800, 0, -100, 6036800)
+xarray_1 = np_to_rioxarry(array_1, transform_1)
+gdf1 = vectorize(xarray_1)
+
+# %%
+# build a random, top right df
+array_2 = np.random.randn(4, 4)
+transform_2 = rio.Affine(100, 0, -225400, 0, -100, 6036800)
+xarray_2 = np_to_rioxarry(array_2, transform_2, as_type="float32")
+gdf2 = vectorize(xarray_2)
+
+# %%
+# build a random, bottom left df
+array_3 = np.random.randn(4, 4)
+transform_3 = rio.Affine(100, 0, -225800, 0, -100, 6036400)
+xarray_3 = np_to_rioxarry(array_3, transform_3, as_type="float32")
+gdf3 = vectorize(xarray_3)
+
+# %%
+# build a random, bottom right df
+array_4 = np.random.randn(4, 4)
+transform_4 = rio.Affine(100, 0, -225400, 0, -100, 6036400)
+xarray_4 = np_to_rioxarry(array_4, transform_4, as_type="float32")
+gdf4 = vectorize(xarray_4)
+
+# %%
+# plot all dummy gdfs
+m = gdf1.explore("var", legend=False)
+m = gdf2.explore("var", m=m, legend=False)
+m = gdf3.explore("var", m=m, legend=False)
+m = gdf4.explore("var", m=m, legend=False)
+m.fit_bounds(m.get_bounds())
+m
+
+# %%
+# display bounds
+for xarr in [xarray_1, xarray_2, xarray_3, xarray_4]:
+    print(xarr.rio.bounds())
+
+# %%
diff --git a/requirements.txt b/requirements.txt
index 26bdeffa..f0df1e24 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,15 @@ r5py>=0.0.4
 gtfs_kit==5.2.7
 pytest
 coverage
+rasterio
+pyprojroot
+matplotlib
+plotly
+nbformat>=4.2.0
+scikit-image
+cartopy
+folium
+geocube
+mapclassify
+seaborn
 -e .
diff --git a/src/transport_performance/utils/placeholder.py b/src/transport_performance/utils/placeholder.py
deleted file mode 100644
index eff9f4cd..00000000
--- a/src/transport_performance/utils/placeholder.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Placeholder file avoids codecov fail."""
diff --git a/src/transport_performance/utils/raster.py b/src/transport_performance/utils/raster.py
new file mode 100644
index 00000000..05bf3fc2
--- /dev/null
+++ b/src/transport_performance/utils/raster.py
@@ -0,0 +1,167 @@
+"""Utility functions to handle merging and resampling of raster data.
+
+These were developed to support merging raster files together (e.g., to cover
+a larger area), and resampling to a different grid size (e.g., 100x100m grids
+to 200x200m grids). The original design intention is for these to form part of
+gridded population data pre-processing.
+"""
+
+import os
+import glob
+import re
+import rioxarray
+
+from rioxarray.merge import merge_arrays
+from rasterio.warp import Resampling
+
+
+def merge_raster_files(
+    input_dir: str,
+    output_dir: str,
+    output_filename: str,
+    subset_regex: str = None,
+) -> dict:
+    """Merge raster files together.
+
+    Takes an input directory and merges all `.tif` files within it. Writes
+    merged raster to file.
+
+    Parameters
+    ----------
+    input_dir : str
+        Directory containing input raster files.
+    output_dir : str
+        Directory to write output, merged raster file.
+    output_filename : str
+        Filename of merged raster file (.tif extension required).
+    subset_regex : str, optional
+        Subset any raster files in the input directory using a regex, by
+        default None which means no subsetting will occur.
+
+    Returns
+    -------
+    bounds : dict
+        A dictionary summarising the boundaries of all input rasters and the
+        merged output. The "inputs" key is a list of the respective input
+        boundaries. The "output" key is a list containing the bounds of the
+        merged result. Useful to checking consistency of merged output.
+
+    Raises
+    ------
+    FileNotFoundError
+        In scenarios where `input_dir` does not exist and if there are no .tif
+        detectable within `input_dir`.
+
+    Notes
+    -----
+    1. This function does not provide any consistency checking of inputs and
+    merged outputs (e.g., checking for overlapping inputs and 'gaps' in the
+    merged outputs). This is primarily because merging is nuanced and numerous
+    in the potential ways inputs can be merged. For this reason, it is down to
+    the function's user to ensure the merged output is consistent with the
+    respective inputs. To this end, the bounds dict is returned to allow user
+    consistency testing.
+
+    2. The default rioxarry behaviours are assumed when merging inputs, i.e.,
+    the `CRS`, resolution and `nodata` values will be taken from the first
+    input DataArray. See [1]_ for more details.
+
+    References
+    ----------
+    .. [1] https://corteva.github.io/rioxarray/html/rioxarray.html#rioxarray.m
+    erge.merge_arrays
+
+    """
+    # defend against case where the provided input dir does not exist
+    if not os.path.exists(input_dir):
+        raise FileNotFoundError(f"{input_dir} can not be found")
+
+    # get tif files in directory, ensure some exist and select subset via regex
+    tif_filepaths = glob.glob(f"{input_dir}/*.tif")
+    if len(tif_filepaths) == 0:
+        raise FileNotFoundError(f"No `*.tif` files found in {input_dir}")
+
+    # apply regex and ensure tif files exist after applying it
+    # raise a unique FileNotFoundError to aid debugging
+    if subset_regex is not None:
+        tif_filepaths = [
+            fpath for fpath in tif_filepaths if re.search(subset_regex, fpath)
+        ]
+    if len(tif_filepaths) == 0:
+        raise FileNotFoundError(
+            f"No `*.tif` files found in {input_dir} after applying regex "
+            f"'{subset_regex}'."
+        )
+
+    # build a list of input rioxarrays to be merged
+    arrays = []
+    for tif_filepath in sorted(tif_filepaths):
+        arrays.append(rioxarray.open_rasterio(tif_filepath, masked=True))
+
+    # merge the datasets together
+    xds_merged = merge_arrays(arrays)
+
+    # make output_dir if it does not exist
+    if not os.path.exists(output_dir):
+        os.mkdir(output_dir)
+
+    # create full filepath for merged tif file and write to disk
+    MERGED_DIR = os.path.join(output_dir, output_filename)
+    xds_merged.rio.to_raster(MERGED_DIR)
+
+    # get boundaries of inputs and output raster
+    bounds = {
+        "inputs": [array.rio.bounds() for array in arrays],
+        "output": [xds_merged.rio.bounds()],
+    }
+
+    return bounds
+
+
+def sum_resample_file(
+    input_filepath: str,
+    output_filepath: str,
+    resample_factor: int = 2,
+) -> None:
+    """Resample raster file (change grid resolution) by summing.
+
+    Takes an input raster file, resamples to a different grid resolution by
+    summing constituent cells, and writes the resampled raster to file.
+
+    Parameters
+    ----------
+    input_filepath : str
+        Input filpath of GeoTIFF file
+    output_filepath : str
+        Output filepath for resampled GeoTIFF file
+    resample_factor : int, optional
+        Factor to resample input raster by, by default 2 which means the
+        resolution will be decreased by 2 (e.g., input=100x100m then output=200
+        x200m)
+
+    Raises
+    ------
+    FileNotFoundError
+        Unable to find `input_filepath`
+
+    """
+    # defend against case where the provided input dir does not exist
+    if not os.path.exists(input_filepath):
+        raise FileNotFoundError(f"{input_filepath} can not be found")
+
+    xds = rioxarray.open_rasterio(input_filepath, masked=True)
+
+    # resample based on scaling factor and using sum resampling
+    xds_resampled = xds.rio.reproject(
+        xds.rio.crs,
+        resolution=tuple(
+            res * resample_factor for res in xds.rio.resolution()
+        ),
+        resampling=Resampling.sum,
+    )
+
+    # make output_filepath's directory if it does not exist
+    if not os.path.exists(os.path.dirname(output_filepath)):
+        os.mkdir(output_filepath)
+
+    xds_resampled.rio.to_raster(output_filepath)
diff --git a/tests/utils/test_raster.py b/tests/utils/test_raster.py
new file mode 100644
index 00000000..fdd950db
--- /dev/null
+++ b/tests/utils/test_raster.py
@@ -0,0 +1,295 @@
+"""Unit tests for transport_performance/utils/raster.py.
+
+Fixtures used in this file geographically represent a small area over ONS site
+in Newport. The data vales do not represent any real or meaningfull measurments
+, they are purely for use as a unit test and mimic input data used in this repo
+. By design these have been constructed such that the 4 grids touch without
+overlap. This allows effective unit testing of the `merge_raster_files`
+function by making assertions on the bounds of the inputs and merged output.
+"""
+
+import os
+import pytest
+import numpy as np
+import rasterio as rio
+import xarray as xr
+import rioxarray  # noqa: F401 - import required for xarray but not needed here
+
+from transport_performance.utils.raster import (
+    merge_raster_files,
+    sum_resample_file,
+)
+
+
+def np_to_rioxarry(
+    arr: np.ndarray,
+    aff: rio.Affine,
+    as_type: str = "float32",
+    no_data: int = -200,
+    crs: str = "ESRI:54009",
+) -> xr.DataArray:
+    """Convert numpy array to rioxarry.
+
+    This function is only used within pytest, as a convinent way to build
+    fixtures without duplicating code.
+
+    Parameters
+    ----------
+    arr : np.ndarray
+        Input numpy array
+    aff : rio.Affine
+        Affine transform, for input data
+    as_type : str, optional
+        Data type, by default "int16"
+    no_data : int, optional
+        Value to use for no data, by default -200
+    crs : _type_, optional
+        Coordinate Reference system for input data, by default "ESRI:54009"
+
+    Returns
+    -------
+    xr.DataArray
+        Input numpy array as an xarray.DataArray with the correct
+
+    """
+    # get geometry of input arr and generate col, row indcies
+    height = arr.shape[0]
+    width = arr.shape[1]
+    cols = np.arange(width)
+    rows = np.arange(height)
+
+    # transform indcies to x, y coordinates
+    xs, ys = rio.transform.xy(aff, rows, cols)
+
+    # build x_array object
+    x_array = (
+        xr.DataArray(arr.T, dims=["x", "y"], coords=dict(x=xs, y=ys))
+        .transpose("y", "x")
+        .astype(as_type)
+        .rio.write_nodata(no_data)
+        .rio.write_transform(aff)
+        .rio.set_crs(crs)
+    )
+
+    return x_array
+
+
+@pytest.fixture
+def merge_xarr_1():
+    """Create a dummay xarray.DataFrame for merge function testing.
+
+    Creates a xarray.DataFrame that forms the top left 4x4 raster.
+    """
+    array_1 = np.array(
+        [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
+    )
+    transform_1 = rio.Affine(100, 0, -225800, 0, -100, 6036800)
+    xarray_1 = np_to_rioxarry(array_1, transform_1)
+
+    return xarray_1
+
+
+@pytest.fixture
+def merge_xarr_2():
+    """Create a dummay xarray.DataFrame for merge function testing.
+
+    Creates a xarray.DataFrame that forms the top right 4x4 raster.
+    """
+    array_2 = np.random.randn(4, 4)
+    transform_2 = rio.Affine(100, 0, -225400, 0, -100, 6036800)
+    xarray_2 = np_to_rioxarry(array_2, transform_2, as_type="float32")
+
+    return xarray_2
+
+
+@pytest.fixture
+def merge_xarr_3():
+    """Create a dummay xarray.DataFrame for merge function testing.
+
+    Creates a xarray.DataFrame that forms the bottom left 4x4 raster.
+    """
+    array_3 = np.random.randn(4, 4)
+    transform_3 = rio.Affine(100, 0, -225800, 0, -100, 6036400)
+    xarray_3 = np_to_rioxarry(array_3, transform_3, as_type="float32")
+
+    return xarray_3
+
+
+@pytest.fixture
+def merge_xarr_4():
+    """Create a dummay xarray.DataFrame for merge function testing.
+
+    Creates a xarray.DataFrame that forms the bottom right 4x4 raster.
+    """
+    array_4 = np.random.randn(4, 4)
+    transform_4 = rio.Affine(100, 0, -225400, 0, -100, 6036400)
+    xarray_4 = np_to_rioxarry(array_4, transform_4, as_type="float32")
+
+    return xarray_4
+
+
+@pytest.fixture
+def merge_xarrs_fpath(
+    merge_xarr_1: xr.DataArray,
+    merge_xarr_2: xr.DataArray,
+    merge_xarr_3: xr.DataArray,
+    merge_xarr_4: xr.DataArray,
+    tmp_path: str,
+) -> str:
+    """Build temp directory containing dummy GeoTIFF files.
+
+    Used to replicate the behaviour of the `merge_raster_files` raster utils
+    function. Writes all the dummy GeoTIFFs to a folder within a temp
+    directory.
+
+    Parameters
+    ----------
+    merge_xarr_1 : xr.DataArray
+        Dummy input data from `merge_xarr_1` pytest fixture
+    merge_xarr_2 : xr.DataArray
+        Dummy input data from `merge_xarr_2` pytest fixture
+    merge_xarr_3 : xr.DataArray
+        Dummy input data from `merge_xarr_3` pytest fixture
+    merge_xarr_4 : xr.DataArray
+        Dummy input data from `merge_xarr_4` pytest fixture
+    tmp_path : str
+        Temporary directory to use for pytest run (a pytest object)
+
+    Returns
+    -------
+    str
+        Directory within the pytest temp dir in which the dummy GeoTIFF data
+        have been written
+
+    """
+    # create dir to write dummy data within the temp dir
+    write_dir = os.path.join(tmp_path, "merge_test_files")
+    os.mkdir(write_dir)
+
+    # write all dummy data to the dir within the temp dir
+    tiff_files = [merge_xarr_1, merge_xarr_2, merge_xarr_3, merge_xarr_4]
+    file_prefix = "test"
+    for i, xarr in enumerate(tiff_files):
+        out_filepath = os.path.join(write_dir, f"{file_prefix}_{i}.tif")
+        xarr.rio.to_raster(out_filepath)
+
+    return write_dir
+
+
+@pytest.fixture
+def resample_xarr_fpath(
+    merge_xarr_1: xr.DataArray,
+    tmp_path: str,
+) -> str:
+    """Build temp directory containing dummy GeoTIFF files to test resampling.
+
+    Used to replicate the behaviour of the `merge_raster_files` raster utils
+    function. Writes all the dummy GeoTIFFs to a folder within a temp
+    directory.
+
+    Parameters
+    ----------
+    merge_xarr_1 : xr.DataArray
+        Dummy input data from `merge_xarr_1` pytest fixture
+    tmp_path : str
+        Temporary directory to use for pytest run (a pytest object)
+
+    Returns
+    -------
+    str
+        Filepath to dummy GeoTIFF data
+
+    """
+    # create dir to write dummy data within the temp dir
+    write_dir = os.path.join(tmp_path, "resample_test_file")
+    os.mkdir(write_dir)
+
+    # write dummy data to the dir within the temp dir
+    out_filepath = os.path.join(write_dir, "input.tif")
+    merge_xarr_1.rio.to_raster(out_filepath)
+
+    return out_filepath
+
+
+class TestUtilsRaster:
+    """A class to test utils/raster functions."""
+
+    def test_merge_raster_files(self, merge_xarrs_fpath: str) -> None:
+        """Test `merge_raster_files`.
+
+        Parameters
+        ----------
+        merge_xarrs_fpath : str
+            Temporary dir containing the dummy GeoTIFF data. Output from the
+            merge_xarrs_fpath pytest fixture
+
+        """
+        # print fpath where input data resides in tmp folder
+        # useful when using -rP flag in pytest to see directory
+        print(f"Temp file path for merge tif inputs: {merge_xarrs_fpath}")
+
+        # create a regex to select tif - not needed, but tests the func arg
+        subset_regex = r".{0,}test_[0-9].{0,9}"
+
+        # build the output directory and file name
+        merge_out_dir = os.path.join(
+            os.path.dirname(merge_xarrs_fpath), "merge_test_output"
+        )
+        merge_out_filename = "merged.tif"
+
+        # run function to merge dummy inputs and get bounds
+        bounds = merge_raster_files(
+            merge_xarrs_fpath,
+            merge_out_dir,
+            merge_out_filename,
+            subset_regex=subset_regex,
+        )
+
+        # ensure no gaps between merged inputs
+        assert bounds["inputs"][0][2] == bounds["inputs"][1][0]
+        assert bounds["inputs"][0][1] == bounds["inputs"][2][3]
+        assert bounds["inputs"][2][2] == bounds["inputs"][3][0]
+        assert bounds["inputs"][3][3] == bounds["inputs"][1][1]
+
+        # ensure the bounds of the output is the extrema of all the inputs
+        minx = min([coords[0] for coords in bounds["inputs"]])
+        miny = min([coords[1] for coords in bounds["inputs"]])
+        maxx = max([coords[2] for coords in bounds["inputs"]])
+        maxy = max([coords[3] for coords in bounds["inputs"]])
+        assert bounds["output"][0] == (minx, miny, maxx, maxy)
+
+    def test_sum_resample_file(self, resample_xarr_fpath: str) -> None:
+        """Test `sum_resample_file`.
+
+        Use `merge_xarr_1` to test resampling, which will have a known and
+        expected result.
+
+        Parameters
+        ----------
+        resample_xarr_fpath : str
+            Filepath to dummy data.
+
+        """
+        # print fpath where input data resides in tmp folder
+        # useful when using -rP flag in pytest to see directory
+        print(f"Temp filepath for resampling test: {resample_xarr_fpath}")
+
+        # resample to input and set the output location
+        output_fpath = os.path.join(
+            os.path.dirname(resample_xarr_fpath), "output.tif"
+        )
+        sum_resample_file(resample_xarr_fpath, output_fpath)
+
+        # re-read in resampled output to test
+        xds_out = rioxarray.open_rasterio(output_fpath)
+
+        # assert expected resolution, shape, no data value and crs
+        assert xds_out.rio.transform().a == 200
+        assert xds_out.rio.transform().e == -200
+        assert xds_out.to_numpy().shape == (1, 2, 2)
+        assert xds_out.rio.nodata == -200.0
+        assert xds_out.rio.crs.to_string() == "ESRI:54009"
+
+        # assert correct resampling values (summing consitiuent grids)
+        expected_result = np.array([[[14, 22], [46, 54]]])
+        assert np.array_equal(expected_result, xds_out.to_numpy())