diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 32eaeecf..538a009b 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -9,11 +9,11 @@ jobs: max-parallel: 5 steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.6 - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - name: Set up Python 3.10.6 + uses: actions/setup-python@v4 with: - python-version: 3.6 + python-version: 3.10.6 - name: Add conda to system path run: | # $CONDA is an environment variable pointing to the root of the miniconda directory diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 00000000..e280fe5d --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,36 @@ + +name: Publish deafrica-waterbodies to PyPI +on: + push: + branches: + - main + paths: + - 'deafrica_waterbodies/**' + + workflow_dispatch: + +jobs: + build-n-publish: + name: Build and publish deafrica-waterbodies to PyPI + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./deafrica_waterbodies + + steps: + - name: Checkout the digitalearthafrica/deafrica-waterbodiesrepository + uses: actions/checkout@v3 + - name: Set up Python 3.10.6 + uses: actions/setup-python@v4 + with: + python-version: "3.10.6" # Version range or exact version of a Python version to use, using SemVer's version range syntax + architecture: 'x64' # optional x64 or x86. Defaults to x64 if not specified. + - name: Install pypa/build + run: python3 -m pip install build --user + - name: Build a binary wheel and a source tarball + run: python3 -m build --sdist --wheel --outdir dist/ . + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + packages-dir: deafrica_waterbodies/dist/ \ No newline at end of file diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 8d5e263f..8d76a8c9 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -1,11 +1,12 @@ name: Push on: + workflow_dispatch: push: branches: - stable paths: - - 'dea_waterbodies/**' + - 'deafrica_waterbodies/**' - '.github/workflows/push.yml' - 'Dockerfile' @@ -13,14 +14,14 @@ on: types: [created, edited] env: - IMAGE_NAME: geoscienceaustralia/dea-waterbodies + IMAGE_NAME: digitalearthafrica/deafrica-waterbodies jobs: push: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -29,18 +30,18 @@ jobs: run: | echo "RELEASE=${GITHUB_REF/refs\/tags\/}" >> $GITHUB_ENV - name: Build and Push semver tagged Docker Image for Release - uses: whoan/docker-build-with-cache-action@v4 + uses: whoan/docker-build-with-cache-action@v6 if: github.event_name == 'release' with: image_name: ${{ env.IMAGE_NAME }} - username: ${{ secrets.DOCKER_USER }} - password: ${{ secrets.DOCKER_PASS }} + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DEAFRICA_DOCKER_PASSWORD}} image_tag: ${{ env.RELEASE }} - name: Run Trivy vulnerability scanner for Release uses: aquasecurity/trivy-action@master if: github.event_name == 'release' with: - image-ref: 'docker.io/geoscienceaustralia/dea-waterbodies:${{ env.RELEASE }}' + image-ref: 'docker.io/digitalearthafrica/deafrica-waterbodies:${{ env.RELEASE }}' format: 'table' exit-code: '1' ignore-unfixed: true @@ -53,18 +54,18 @@ jobs: git fetch --all --tags echo "RELEASE=$(git describe --tags)" >> $GITHUB_ENV - name: Build and Push unstable Docker Image for push to main - uses: 
whoan/docker-build-with-cache-action@v4 + uses: whoan/docker-build-with-cache-action@v6 if: github.event_name != 'release' with: image_name: ${{ env.IMAGE_NAME }} - username: ${{ secrets.DOCKER_USER }} - password: ${{ secrets.DOCKER_PASS }} + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DEAFRICA_DOCKER_PASSWORD}} image_tag: latest,${{ env.RELEASE }} - name: Run Trivy vulnerability scanner for push to main uses: aquasecurity/trivy-action@master if: github.event_name != 'release' with: - image-ref: 'docker.io/geoscienceaustralia/dea-waterbodies:${{ env.RELEASE }}' + image-ref: 'docker.io/digitalearthafrica/deafrica-waterbodies:${{ env.RELEASE }}' format: 'table' exit-code: '1' ignore-unfixed: true diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ecd199f7..f47c7e8b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,10 +3,9 @@ name: Test on: [push] env: - ORG: geoscienceaustralia - IMAGE: dea-waterbodies - METADATA_CATALOG: https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/a4f39b485b33608a016032d9987251881fec4b6f/workspaces/sandbox-metadata.yaml - PRODUCT_CATALOG: https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/a4f39b485b33608a016032d9987251881fec4b6f/workspaces/sandbox-products.csv + ORG: digitalearthafrica + IMAGE: deafrica-waterbodies + PRODUCT_CATALOG: https://raw.githubusercontent.com/digitalearthafrica/config/master/prod/products_prod.csv jobs: test: @@ -14,26 +13,23 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v2 - - - name: Pre-pull layers - run: docker-compose pull - + uses: actions/checkout@v4.0.0 - name: Activate Docker cache - uses: satackey/action-docker-layer-caching@v0.0.8 + uses: satackey/action-docker-layer-caching@v0.0.11 # Ignore the failure of a step and avoid terminating the job. continue-on-error: true - - name: Build dea-waterbodies image + - name: Setup deafrica-waterbodies test environment run: | - docker-compose build + make test-env - - name: Test dea-waterbodies image + - name: Run deafrica-waterbodies test run: | - docker-compose up -d - ./setup_test_datacube.sh - docker-compose exec -T waterbodies bash -c "pytest ." 
- docker-compose down - + make run-tests + make clean + - name: Clean deafrica-waterbodies image + run: | + (echo y) | docker container prune + (echo y) | docker image prune \ No newline at end of file diff --git a/.gitignore b/.gitignore index 25816d14..0d1adcb1 100644 --- a/.gitignore +++ b/.gitignore @@ -11,13 +11,6 @@ watercourses *.shp *.shx -# Explicitly allow test data -!tests/data/*.cpg -!tests/data/*.dbf -!tests/data/*.prj -!tests/data/*.shp -!tests/data/*.shx - # graphs *.graphml *.gml @@ -26,3 +19,6 @@ watercourses *.csv *.geojson *.json + +# Explicitly allow test data +!tests/data/*.geojson \ No newline at end of file diff --git a/Makefile b/Makefile index 2e118654..f83b47e0 100644 --- a/Makefile +++ b/Makefile @@ -49,9 +49,6 @@ run-tests: docker compose exec -T waterbodies bash -c "coverage xml" docker compose exec -T waterbodies bash -c "coverage html" -test: - docker-compose exec waterbodies pytest tests - down: ## Bring down the system docker compose down @@ -71,4 +68,5 @@ pip_compile: requirements.in lint: - docker-compose exec waterbodies black --check dea_waterbodies + docker-compose exec waterbodies black --check deafrica_waterbodies + docker-compose exec waterbodies isort --check deafrica_waterbodies \ No newline at end of file diff --git a/README.rst b/README.rst index 2a01d9f7..db1367a1 100644 --- a/README.rst +++ b/README.rst @@ -1,64 +1,63 @@ -.. image:: figures/dea_logo_wide.jpg +.. image:: figures/deafrica_logo_wide.jpg :width: 900 - :alt: Digital Earth Australia logo + :alt: Digital Earth Africa logo -Digital Earth Australia Waterbodies +Digital Earth Africa Waterbodies ################################### .. image:: https://img.shields.io/badge/License-Apache%202.0-blue.svg :target: https://opensource.org/licenses/Apache-2.0 - :alt: Digital Earth Australia logo + :alt: Digital Earth Africa logo -.. image:: https://github.com/GeoscienceAustralia/dea-waterbodies/actions/workflows/lint.yml/badge.svg - :target: https://github.com/GeoscienceAustralia/dea-waterbodies/actions/workflows/lint.yml +.. image:: https://github.com/digitalearthafrica/deafrica-waterbodies/actions/workflows/lint.yml/badge.svg + :target: https://github.com/digitalearthafrica/deafrica-waterbodies/actions/workflows/lint.yml :alt: Linting status -.. image:: https://github.com/GeoscienceAustralia/dea-waterbodies/actions/workflows/test.yml/badge.svg - :target: https://github.com/GeoscienceAustralia/dea-waterbodies/actions/workflows/test.yml +.. image:: https://github.com/digitalearthafrica/deafrica-waterbodies/actions/workflows/test.yml/badge.svg + :target: https://github.com/digitalearthafrica/deafrica-waterbodies/actions/workflows/test.yml :alt: Testing status -**License:** The code in this repository is licensed under the `Apache License, Version 2.0 `_. Digital Earth Australia data is licensed under the `Creative Commons by Attribution 4.0 license `_. +**License:** The code in this repository is licensed under the `Apache License, Version 2.0 `_. Digital Earth Africa data is licensed under the `Creative Commons by Attribution 4.0 license `_. -**Contact:** If you need assistance with any of the Jupyter Notebooks or Python code in this repository, please post a question on the `Open Data Cube Slack channel `_. If you would like to report an issue with this repo, or suggest feature requests, you can `open an issue on this repository `_. Non-technical questions about Digital Earth Australia Waterbodies can be sent to dea@ga.gov.au. 
+**Contact:** If you need assistance with any of the Jupyter Notebooks or Python code in this repository, please post a question on the `Open Data Cube Slack channel `_. If you would like to report an issue with this repo, or suggest feature requests, you can `open an issue on this repository `_. Non-technical questions about Digital Earth Africa Waterbodies can be sent to info@digitalearthafrica.org. -**Citing Digital Earth Australia Waterbodies:** +**Citing Digital Earth Africa Waterbodies:** Krause, Claire E.; Newey, Vanessa; Alger, Matthew J.; Lymburner, Leo. 2021. "Mapping and Monitoring the Multi-Decadal Dynamics of Australia’s Open Waterbodies Using Landsat" Remote Sens. 13, no. 8: 1437. https://doi.org/10.3390/rs13081437 ---------- -Up to date information about the extent and location of surface water provides all Australians with a common understanding of this valuable and increasingly scarce resource. Water detection algorithms are now being routinely applied to continental and global archives of satellite imagery. However, water resource management decisions typically take place at the waterbody rather than pixel scale. +Up to date information about the extent and location of surface water across Africa provides stakeholders with a common understanding of this valuable and increasingly scarce resource. Water detection algorithms are now being routinely applied to continental and global archives of satellite imagery. However, water resource management decisions typically take place at the waterbody rather than pixel scale. -This repository presents a workflow for generating polygons of persistent waterbodies from Landsat observations, enabling improved monitoring and management of water assets across Australia. We use `Digital Earth Australia’s (DEA) Water Observations from Space (WOfS) water classifier `_, which provides a water classified output for every available Landsat scene, to determine the spatial locations and extents of waterbodies across Australia. DEA Waterbodies uses Geoscience Australia’s archive of over 30 years of Landsat satellite imagery to identify where almost 300,000 waterbodies are in the Australian landscape. +This repository presents a workflow for generating polygons of persistent waterbodies from Landsat observations, enabling improved monitoring and management of water assets across Africa. We use `Digital Earth Africa’s (DE Africa) Water Observations from Space (WOfS) water classifier `_, which provides a water classified output for every available Landsat scene, to determine the spatial locations and extents of waterbodies across Africa. DE Africa Waterbodies uses Digital Earth Africa’s archive of over 30 years of Landsat satellite imagery to identify where over 700,000 waterbodies are in the African landscape. .. image:: figures/WorkflowDiagram.JPG :width: 900 - :alt: Digital Earth Australia Waterbodies workflow diagram + :alt: Digital Earth Africa Waterbodies workflow diagram -*Digital Earth Australia Waterbodies workflow* +*Digital Earth Africa Waterbodies workflow* -Each polygon was then used to generate a time series of WOfS, providing a history of the change in the wet surface area of each waterbody every ~16 days since 1987. +Each polygon was then used to generate a time series of WOfS, providing a history of the change in the wet surface area of each waterbody every ~16 days since 1984. -.. image:: figures/DEAWaterbodiesESRIBasemap.jpeg +.. 
image:: figures/DEAfricaWaterbodiesESRIBasemap.png :width: 900 :alt: Digital Earth Australia Waterbodies -*Digital Earth Australia Waterbodies. Waterbody polygons mapped by this product are shown in blue. There are almost 300,000 across Australia.* +*Digital Earth Africa Waterbodies. Waterbody polygons mapped by this product are shown in blue. There are over 700,000 across Africa.* -DEA Waterbodies supports users to understand and manage water across Australia. DEA Waterbodies provides new insights into local through to national-scale surface water spatio-temporal dynamics by enabling the monitoring of important landscape features such as lakes and dams, improving our ability to use earth observation data to make meaningful decisions. It can be used to gain insights into the severity and spatial distribution of drought, or identify potential water sources for aerial firefighting during bushfires. - -For more information about the DEA Waterbodies product, including instructions for accessing the product, frequently asked questions and data download links, see the `Digital Earth Australia website `_. +DE Africa Waterbodies supports users to understand and manage water across Africa. DE Africa Waterbodies provides new insights into local through to continental-scale surface water spatio-temporal dynamics by enabling the monitoring of important landscape features such as lakes and dams, improving our ability to use earth observation data to make meaningful decisions. It can be used to gain insights into the severity and spatial distribution of drought, or identify potential water sources. +For more information about the DE Africa Waterbodies product, including instructions for accessing the product, frequently asked questions and data download links, see the `Digital Earth Africa Data Catalogue `_. Installation ------------ -DEA Waterbodies has some requirements which can be installed with pip: +DE Africa Waterbodies has some requirements which can be installed with pip: .. code-block:: bash pip install --extra-index-url="https://packages.dea.ga.gov.au" -r requirements.txt -Once you have installed the requirements for DEA Waterbodies, install the module locally: +Once you have installed the requirements for DE Africa Waterbodies, install the module locally: .. code-block:: bash @@ -70,4 +69,4 @@ A command line interface is available for generating wet area time series for a .. code-block:: bash - waterbodies-ts --help + deafrica-waterbodies --help diff --git a/deafrica_waterbodies/attributes.py b/deafrica_waterbodies/attributes.py index 6393aa0f..d882fb96 100644 --- a/deafrica_waterbodies/attributes.py +++ b/deafrica_waterbodies/attributes.py @@ -1,8 +1,17 @@ +import logging +import os +from urllib.parse import urlparse + import geohash as gh import geopandas as gpd +from shapely import Point, Polygon + +from deafrica_waterbodies.io import check_if_s3_uri + +_log = logging.getLogger(__name__) -def assign_unique_ids(polygons: gpd.GeoDataFrame) -> gpd.GeoDataFrame: +def assign_unique_ids(polygons: gpd.GeoDataFrame, precision: int = 10) -> gpd.GeoDataFrame: """ Function to assign a unique ID to each waterbody polygon. @@ -10,7 +19,9 @@ def assign_unique_ids(polygons: gpd.GeoDataFrame) -> gpd.GeoDataFrame: ---------- polygons : gpd.GeoDataFrame GeoDataFrame containing the waterbody polygons. - + precision : int + Precision to use when encoding a polygon's centroid using geohash to + generate the polygon's unique identifier. 
Returns ------- gpd.GeoDataFrame @@ -27,7 +38,8 @@ def assign_unique_ids(polygons: gpd.GeoDataFrame) -> gpd.GeoDataFrame: # Generate a unique id for each polygon. polygons_with_unique_ids = polygons.to_crs(epsg=4326) polygons_with_unique_ids["UID"] = polygons_with_unique_ids.apply( - lambda x: gh.encode(x.geometry.centroid.y, x.geometry.centroid.x, precision=9), axis=1 + lambda x: gh.encode(x.geometry.centroid.y, x.geometry.centroid.x, precision=precision), + axis=1, ) # Check that our unique ID is in fact unique @@ -47,82 +59,158 @@ def assign_unique_ids(polygons: gpd.GeoDataFrame) -> gpd.GeoDataFrame: return polygons_with_unique_ids_sorted -def get_timeseries_s3_object_url( +def get_timeseries_s3_url( uid: str, - product_version: str, - timeseries_bucket: str, + bucket_name: str, + region_code: str, + object_prefix: str, ) -> str: """ - Get the timeseries s3 object URL given a unique identifier for a polygon. + Get the timeseries S3 Object URL given the unique identifier for a polygon. Parameters ---------- uid : str Unique identifier - product_version : str - The product version for the DE Africa Waterbodies service. - timeseries_bucket : str - The s3 bucket for the DE Africa Waterbodies service timeseries. + bucket_name : str + The S3 bucket containing the timeseries csv files. + region_code: + The location of the S3 bucket specified by `bucket_name`. + object_prefix: + The folder on S3 containing the timeseries csv files. Returns ------- str - A s3 object URL for the timeseries for a waterbody polygon. + A S3 Object URL for the timeseries for a waterbody polygon. """ + subfolder = uid[:4] + csv_file = f"{uid}.csv" - # Incase storage location is local. - if timeseries_bucket is None: - timeseries_bucket == "deafrica-waterbodies-dev" + # Construct the S3 Object URL + timeseries_s3_object_url = f"https://{bucket_name}.s3.{region_code}.amazonaws.com/{object_prefix}/{subfolder}/{csv_file}" - version = product_version.replace(".", "-") + return timeseries_s3_object_url - subfolder = uid[:4] - csv_file = f"{uid}_v{version[0]}.csv" +def get_timeseries_fp( + uid: str, + timeseries_directory: str, +) -> str: + """ + Get the timeseries file path given the unique identifier for a polygon. - timeseries_s3_object_url = f"https://{timeseries_bucket}.s3.af-south-1.amazonaws.com/{version}/timeseries/{subfolder}/{csv_file}" + Parameters + ---------- + uid : str + Polygon unique identifier + timeseries_directory : str + The directory containing the DE Africa Waterbodies timeseries csv files. - return timeseries_s3_object_url + Returns + ------- + str + A file path for the timeseries for a waterbody polygon. + """ + subfolder = uid[:4] + csv_file = f"{uid}.csv" + + # Construct the file path + timeseries_fp = os.path.join(timeseries_directory, subfolder, csv_file) + + return timeseries_fp def add_timeseries_attribute( - polygons: gpd.GeoDataFrame, product_version: str, timeseries_bucket: str + polygons: gpd.GeoDataFrame, + timeseries_directory: str, + region_code: str = "af-south-1", ) -> gpd.GeoDataFrame: """ - Function to assign the s3 object URL for the timeseries for each waterbody polygon. + Function to assign a file path or S3 Object URL for the timeseries for each waterbody polygon. Parameters ---------- polygons : gpd.GeoDataFrame GeoDataFrame containing the waterbody polygons. - product_version : str - The product version for the DE Africa Waterbodies service. - timeseries_bucket : str - The s3 bucket for the DE Africa Waterbodies service timeseries. 
+ timeseries_directory : str + The directory containing the DE Africa Waterbodies timeseries csv files. + region_code: str + This is the location of the bucket if `timeseries_dir` is a S3 URI. Returns ------- gpd.GeoDataFrame GeoDataFrame containing the waterbody polygons with an additional column "timeseries". - The "timeseries" column contains the s3 object URL for the timeseries for each + The "timeseries" column contains the file path or S3 Object URL for the timeseries for each of the waterbody polygons. """ + if check_if_s3_uri(timeseries_directory): + # Parse the S3 URI. + parsed = urlparse(timeseries_directory, allow_fragments=False) + bucket_name = parsed.netloc + object_prefix = parsed.path.strip("/") + + polygons["timeseries"] = polygons.apply( + lambda row: get_timeseries_s3_url( + uid=row["UID"], + bucket_name=bucket_name, + region_code=region_code, + object_prefix=object_prefix, + ), + axis=1, + ) + else: + polygons["timeseries"] = polygons.apply( + lambda row: get_timeseries_fp( + uid=row["UID"], + timeseries_directory=timeseries_directory, + ), + axis=1, + ) - polygons["timeseries"] = polygons.apply( - lambda row: get_timeseries_s3_object_url( - uid=row["UID"], - product_version=product_version, - timeseries_bucket=timeseries_bucket, - ), - axis=1, - ) return polygons -def add_area_and_perimeter_attributes(polygons: gpd.GeoDataFrame) -> gpd.GeoDataFrame: +def get_polygon_length(poly: Polygon) -> float: """ - Function to add the area and perimeter for each waterbody polygon. + Calculate the length of a polygon. + + Parameters + ---------- + poly : Polygon + Polygon to get length for. + + Returns + ------- + float + Length of polygon i.e. longest edge of the mminimum bounding of the polygon. + """ + # Calculate the minimum bounding box (oriented rectangle) of the polygon + min_bbox = poly.minimum_rotated_rectangle + + # Get the coordinates of polygon vertices. + x, y = min_bbox.exterior.coords.xy + + # Get the length of bounding box edges + edge_length = ( + Point(x[0], y[0]).distance(Point(x[1], y[1])), + Point(x[1], y[1]).distance(Point(x[2], y[2])), + ) + + # Get the length of polygon as the longest edge of the bounding box. + length = max(edge_length) + + # Get width of the polygon as the shortest edge of the bounding box. + # width = min(edge_length) + + return length + + +def add_polygon_properties(polygons: gpd.GeoDataFrame) -> gpd.GeoDataFrame: + """ + Function to add the area, perimeter and length for each waterbody polygon. Parameters ---------- @@ -133,16 +221,21 @@ def add_area_and_perimeter_attributes(polygons: gpd.GeoDataFrame) -> gpd.GeoData ------- gpd.GeoDataFrame GeoDataFrame with the crs "EPSG:6933" containing the waterbody polygons - with additional columns "area_m2" and "perim_m". + with additional columns "area_m2", "perim_m" and "length_m". The "area_m2" column contains the area in meters squared of each waterbody polygon calculated in the crs "EPS:6933". The "perim_m" column contains the perimeter in meters of each waterbody polygon calculated in the crs "EPS:6933". + The "length_m" column contains the major axis length in meters of each + waterbody polygon calculated in the crs "EPS:6933". """ # Reproject into a projected crs polygons_6933 = polygons.to_crs("EPSG:6933") + # Get the major axis length of each polygon. 
+ polygons_6933["length_m"] = polygons_6933["geometry"].apply(get_polygon_length) + # Perimeter polygons_6933["perim_m"] = polygons_6933.geometry.length polygons_6933["perim_m"] = polygons_6933["perim_m"].round(decimals=4) diff --git a/deafrica_waterbodies/cli/filter_waterbody_polygons.py b/deafrica_waterbodies/cli/filter_waterbody_polygons.py deleted file mode 100644 index 8b218587..00000000 --- a/deafrica_waterbodies/cli/filter_waterbody_polygons.py +++ /dev/null @@ -1,110 +0,0 @@ -import logging -import math -import os - -import click -import geopandas as gpd - -from deafrica_waterbodies.cli.logs import logging_setup -from deafrica_waterbodies.filters import filter_waterbodies - - -@click.command("filter-waterbody-polygons", no_args_is_help=True) -@click.option("-v", "--verbose", count=True) -@click.option( - "--output-directory", - type=click.Path(), - help="Directory containing the waterbody polygons.", -) -@click.option( - "--min-polygon-size", - default=4500, - show_default=True, - help="Minimum area in m2 of the waterbody polygons to be included.", -) -@click.option( - "--max-polygon-size", - default=math.inf, - show_default=True, - help="Maximum area in m2 of the waterbody polygons to be included.", -) -@click.option( - "--land-sea-mask-fp", - default="", - help="File path to vector dataset to use to filter out ocean polygons.", -) -@click.option( - "--urban-mask-fp", - type=click.Path(), - default="", - help="File path to vector dataset to use to filter out urban/CBD areas.", -) -@click.option( - "--major-rivers-mask-fp", - type=click.Path(), - default="", - help="File path to vector dataset to use to filter out major rivers.", -) -@click.option( - "--handle-large-polygons", - default="nothing", - type=click.Choice(["erode-dilate-v1", "erode-dilate-v2", "nothing"]), - show_default=True, - help="Method to use to split large polygons above the Polsby–Popper test value.", -) -@click.option( - "--pp-test-threshold", - default=0.005, - show_default=True, - help="Polsby–Popper test value to use to split large polygons.", -) -def filter_waterbody_polygons( - verbose, - output_directory, - min_polygon_size, - max_polygon_size, - land_sea_mask_fp, - urban_mask_fp, - major_rivers_mask_fp, - handle_large_polygons, - pp_test_threshold, -): - """ - Filter the primary and secondary threshold waterbody polygons. - """ - # Set up logger. - logging_setup(verbose=verbose) - _log = logging.getLogger(__name__) - - # Support pathlib paths. 
- output_directory = str(output_directory) - - _log.info("Loading primary and secondary threshold polygons...") - - primary_threshold_polygons_fp = os.path.join( - output_directory, "primary_threshold_polygons_merged_at_ds_boundaries.parquet" - ) - secondary_threshold_polygons_fp = os.path.join( - output_directory, "secondary_threshold_polygons_merged_at_ds_boundaries.parquet" - ) - primary_threshold_polygons = gpd.read_parquet(primary_threshold_polygons_fp) - secondary_threshold_polygons = gpd.read_parquet(secondary_threshold_polygons_fp) - - _log.info(f"Primary threshold polygons count {len(primary_threshold_polygons)}.") - _log.info(f"Secondary threshold polygons count {len(secondary_threshold_polygons)}.") - - filtered_polygons = filter_waterbodies( - primary_threshold_polygons=primary_threshold_polygons, - secondary_threshold_polygons=secondary_threshold_polygons, - min_polygon_size=min_polygon_size, - max_polygon_size=max_polygon_size, - land_sea_mask_fp=land_sea_mask_fp, - urban_mask_fp=urban_mask_fp, - major_rivers_mask_fp=major_rivers_mask_fp, - handle_large_polygons=handle_large_polygons, - pp_test_threshold=pp_test_threshold, - ) - - filtered_polygons_fp = os.path.join(output_directory, "filtered_polygons.parquet") - filtered_polygons.to_parquet(filtered_polygons_fp) - _log.info(f"Filtered waterbody polygons written to {filtered_polygons_fp}") diff --git a/deafrica_waterbodies/cli/generate_polygons.py b/deafrica_waterbodies/cli/generate_polygons.py new file mode 100644 index 00000000..975c5d64 --- /dev/null +++ b/deafrica_waterbodies/cli/generate_polygons.py @@ -0,0 +1,363 @@ +import logging +import math +import os +from importlib import import_module + +import click +import fsspec +import geopandas as gpd +import pandas as pd + +from deafrica_waterbodies.attributes import ( + add_polygon_properties, + add_timeseries_attribute, + assign_unique_ids, +) +from deafrica_waterbodies.cli.logs import logging_setup +from deafrica_waterbodies.filters import filter_by_area, filter_by_length +from deafrica_waterbodies.group_polygons import split_polygons_by_region +from deafrica_waterbodies.io import ( + check_dir_exists, + check_file_exists, + check_if_s3_uri, + find_parquet_files, + write_waterbodies_to_file, +) +from deafrica_waterbodies.make_polygons import ( + merge_polygons_at_tile_boundaries, + process_raster_polygons, + set_wetness_thresholds, +) +from deafrica_waterbodies.plugins.utils import run_plugin, validate_plugin +from deafrica_waterbodies.tiling import get_wofs_ls_summary_alltime_tiles + + +@click.command( + "generate-polygons", + no_args_is_help=True, +) +@click.option("-v", "--verbose", count=True) +@click.option( + "--aoi-vector-file", default=None, type=str, help="Vector file defining the area interest." 
+) +@click.option( + "--tile-size-factor", + default=4, + type=float, + help="Factor by which to increase/decrease the WOfS All Time Summary product tiles/regions.", +) +@click.option( + "--num-workers", + default=8, + type=int, + help="Number of worker processes to use when filtering WOfS All Time Summary product tiles", +) +@click.option( + "--min-valid-observations", + default=60, + type=int, + help="Minimum number of observations for a pixel to be considered valid.", + show_default=True, +) +@click.option( + "--detection-threshold", + default=0.1, + type=float, + help="Threshold to define the location of the water body polygons.", + show_default=True, +) +@click.option( + "--extent-threshold", + default=0.05, + type=float, + help="Threshold to define the extent/shape of the water body polygons.", + show_default=True, +) +@click.option( + "--land-sea-mask-fp", + default="", + help="File path to vector/raster dataset to use to filter out ocean polygons.", +) +@click.option( + "--raster-processing-plugin-name", + default=None, + type=str, + help="Name of the plugin containing the filtering functions to use in the raster processing space." + "Plugin file must be in the deafrica_waterbodies/plugins/ directory.", +) +@click.option( + "--overwrite/--no-overwrite", + default=False, + help="Rerun tiles that have already been processed.", +) +@click.option( + "--min-polygon-size", + default=4500, + show_default=True, + help="Minimum area in m2 of the waterbody polygons to be included.", +) +@click.option( + "--max-polygon-size", + default=math.inf, + show_default=True, + help="Maximum area in m2 of the waterbody polygons to be included.", +) +@click.option( + "--output-directory", + type=str, + help="Directory to write the water body polygons to.", +) +@click.option( + "--timeseries-directory", + type=str, + help="The path to the directory containing the timeseries for the polygons.", +) +@click.option( + "--file-name-prefix", + default="waterbodies", + type=str, + help="File name for the final output", +) +@click.option( + "--group-by-wofs-ls-regions/--not-group-by-wofs-ls-regions", + default=True, + help="Group waterbody polygons by wofs_ls regions.", +) +@click.option( + "--length-threshold-km", + default=150, + show_default=True, + help="Length threshold in kilometers by which to filter out large polygons before grouping polygons by wofs_ls region.", +) +def generate_polygons( + verbose, + aoi_vector_file, + tile_size_factor, + num_workers, + min_valid_observations, + detection_threshold, + extent_threshold, + land_sea_mask_fp, + raster_processing_plugin_name, + overwrite, + min_polygon_size, + max_polygon_size, + output_directory, + timeseries_directory, + file_name_prefix, + group_by_wofs_ls_regions, + length_threshold_km, +): + """ + Generate water body polygons from WOfS All Time Summary data + """ + # Set up logger. + logging_setup(verbose=verbose) + _log = logging.getLogger(__name__) + + # Parameters to use when loading datasetspolygons_split_by_region_dir. + # Chunks selected based on size of WOfs scene. + dask_chunks = {"x": 3200, "y": 3200, "time": 1} + + # Support pathlib Paths. + if aoi_vector_file is not None: + aoi_vector_file = str(aoi_vector_file) + + output_directory = str(output_directory) + + # Directory to write outputs from intermediate steps + intermediate_outputs_dir = os.path.join(output_directory, "intermediate_outputs") + # Directory to write generated first set of waterbody polygons to. 
+ polygons_from_thresholds_dir = os.path.join( + intermediate_outputs_dir, "polygons_from_thresholds" + ) + # Directory to write final output. + final_outputs_dir = os.path.join(output_directory, "historical_extent") + # Directory to store polygons split by region. + polygons_split_by_region_dir = os.path.join( + output_directory, "historical_extent_split_by_wofs_region" + ) + + # Set the filesystem to use. + if check_if_s3_uri(output_directory): + fs = fsspec.filesystem("s3") + else: + fs = fsspec.filesystem("file") + + if not check_dir_exists(intermediate_outputs_dir): + fs.mkdirs(intermediate_outputs_dir, exist_ok=True) + _log.info(f"Created directory {intermediate_outputs_dir}") + + if not check_dir_exists(polygons_from_thresholds_dir): + fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True) + _log.info(f"Created directory {polygons_from_thresholds_dir}") + + if not check_dir_exists(final_outputs_dir): + fs.mkdirs(final_outputs_dir, exist_ok=True) + _log.info(f"Created directory {final_outputs_dir}") + + if group_by_wofs_ls_regions: + if not check_dir_exists(polygons_split_by_region_dir): + fs.mkdirs(polygons_split_by_region_dir, exist_ok=True) + _log.info(f"Created directory {polygons_split_by_region_dir}") + + # Load the area of interest as a GeoDataFrame. + if aoi_vector_file is not None: + try: + aoi_gdf = gpd.read_file(aoi_vector_file) + except Exception as error: + _log.exception(f"Could not read the file {aoi_vector_file}") + raise error + else: + aoi_gdf = None + + # Get the tiles fo the wofs_ls_summary_alltime product. + tiles, grid_workflow = get_wofs_ls_summary_alltime_tiles( + aoi_gdf=aoi_gdf, tile_size_factor=tile_size_factor, num_workers=num_workers + ) + + # Set the wetness thresholds. + min_wet_thresholds = set_wetness_thresholds( + detection_threshold=detection_threshold, extent_threshold=extent_threshold + ) + + # Set filters to apply during raster processing. + if raster_processing_plugin_name is not None: + # Read the plugin as a Python module. + module = import_module(f"deafrica_waterbodies.plugins.{raster_processing_plugin_name}") + plugin_file = module.__file__ + plugin = run_plugin(plugin_file) + _log.info(f"Using plugin {plugin_file}") + validate_plugin(plugin) + else: + plugin = None + + # Generate the first set of polygons for each of the tiles. + for tile in tiles.items(): + tile_id = tile[0] + raster_polygons_fp = os.path.join( + polygons_from_thresholds_dir, f"{tile_id[0]}_{tile_id[1]}_raster_polygons.parquet" + ) + + if not overwrite: + _log.info(f"Checking existence of {raster_polygons_fp}") + exists = check_file_exists(raster_polygons_fp) + if exists: + _log.info( + f"{raster_polygons_fp} exists! \n Skipping generating water body polygons for {tile_id}." + ) + + if overwrite or not exists: + try: + _log.info(f"Generating water body polygons for tile {tile_id}.") + raster_polygons = process_raster_polygons( + tile=tile, + grid_workflow=grid_workflow, + plugin=plugin, + dask_chunks=dask_chunks, + min_valid_observations=min_valid_observations, + min_wet_thresholds=min_wet_thresholds, + land_sea_mask_fp=land_sea_mask_fp, + ) + if raster_polygons.empty: + _log.info(f"Tile {str(tile_id)} contains no water body polygons.") + else: + # Drop the attributes column if it exists. + raster_polygons.drop(columns=["attribute"], errors="ignore", inplace=True) + # Write the polygons to parquet files. 
+ raster_polygons.to_parquet(raster_polygons_fp) + _log.info( + f"Tile {str(tile_id)} water body polygons written to {raster_polygons_fp}" + ) + except Exception as error: + _log.exception(f"\nTile {str(tile_id)} did not run. \n") + _log.exception(error) + + # Get the extent for each tile. + crs = grid_workflow.grid_spec.crs + tile_ids = [tile[0] for tile in tiles.items()] + tile_extents_geoms = [tile[1].geobox.extent.geom for tile in tiles.items()] + tile_extents_gdf = gpd.GeoDataFrame( + {"tile_id": tile_ids, "geometry": tile_extents_geoms}, crs=crs + ) + + tile_extents_fp = os.path.join(intermediate_outputs_dir, "tile_boundaries.parquet") + + tile_extents_gdf.to_parquet(tile_extents_fp) + _log.info(f"Tile boundaries written to {tile_extents_fp}") + + # Find all parquet files for the first set of polygons. + raster_polygon_paths = find_parquet_files( + path=polygons_from_thresholds_dir, pattern=".*raster_polygons.*" + ) + _log.info(f"Found {len(raster_polygon_paths)} parquet files for the raster polygons.") + + # Load all polygons into a single GeoDataFrame. + _log.info("Loading the raster polygons parquet files..") + raster_polygon_polygons_list = [] + for path in raster_polygon_paths: + gdf = gpd.read_parquet(path) + raster_polygon_polygons_list.append(gdf) + + raster_polygons = pd.concat(raster_polygon_polygons_list, ignore_index=True) + _log.info(f"Found {len(raster_polygons)} raster polygons.") + + _log.info("Merging raster waterbody polygons located at tile boundaries...") + raster_polygons_merged = merge_polygons_at_tile_boundaries(raster_polygons, tile_extents_gdf) + # Drop the attributes column if it exists. + raster_polygons_merged.drop(columns=["attribute"], errors="ignore", inplace=True) + _log.info( + f"Raster polygons count after merging polygons at tile boundaries {len(raster_polygons_merged)}." + ) + + _log.info("Writing raster polygons merged at tile boundaries to disk..") + raster_polygons_merged_fp = os.path.join( + intermediate_outputs_dir, "raster_polygons_merged_at_tile_boundaries.parquet" + ) + raster_polygons_merged.to_parquet(raster_polygons_merged_fp) + _log.info(f"Polygons written to {raster_polygons_merged_fp}") + + # Delete to conserve memeory + del raster_polygons + del tile_extents_gdf + + # Filter the polygons by area. + area_filtered_raster_polygons = filter_by_area( + raster_polygons_merged, min_polygon_size=min_polygon_size, max_polygon_size=max_polygon_size + ) + area_filtered_raster_polygons.to_parquet( + os.path.join(intermediate_outputs_dir, "area_filtered_raster_polygons.parquet") + ) + + waterbodies_gdf = assign_unique_ids(polygons=area_filtered_raster_polygons, precision=10) + + waterbodies_gdf = add_polygon_properties(polygons=waterbodies_gdf) + + waterbodies_gdf = add_timeseries_attribute( + polygons=waterbodies_gdf, + timeseries_directory=timeseries_directory, + region_code="af-south-1", + ) + + # Reproject to EPSG:4326 + waterbodies_gdf_4326 = waterbodies_gdf.to_crs("EPSG:4326") + + # Write to disk. 
+ write_waterbodies_to_file( + waterbodies_gdf=waterbodies_gdf_4326, + output_directory=final_outputs_dir, + file_name_prefix=file_name_prefix, + ) + + waterbodies_gdf_4326.to_parquet(os.path.join(final_outputs_dir, f"{file_name_prefix}.parquet")) + + if group_by_wofs_ls_regions: + waterbodies_gdf_4326 = filter_by_length( + polygons_gdf=waterbodies_gdf_4326, length_threshold_km=length_threshold_km + ) + + split_by_region_fps = split_polygons_by_region( # noqa F841 + polygons_gdf=waterbodies_gdf_4326, + output_directory=polygons_split_by_region_dir, + product="wofs_ls", + ) diff --git a/deafrica_waterbodies/cli/generate_timeseries.py b/deafrica_waterbodies/cli/generate_timeseries.py index c08a9816..ee7e77a0 100644 --- a/deafrica_waterbodies/cli/generate_timeseries.py +++ b/deafrica_waterbodies/cli/generate_timeseries.py @@ -1,10 +1,7 @@ import click -from datacube.ui.click import parse_expressions from deafrica_waterbodies.cli.logs import logging_setup -from deafrica_waterbodies.waterbodies.timeseries.make_timeseries import ( - generate_timeseries_from_wofs_ls, -) +from deafrica_waterbodies.make_timeseries import generate_timeseries_from_wofs_ls @click.command( @@ -14,7 +11,6 @@ @click.option( "--waterbodies-vector-file", type=click.Path(), - default=None, help="REQUIRED. Path to the waterbody polygons vector file you " "want to run the time series generation for.", ) @@ -40,20 +36,11 @@ "then --start-date and --end-date must also be specified.", ) @click.option( - "--start-date", - type=str, - default=None, - help="Date string. E.g. 2019-01-01. " - "The start date for the waterbody timeseries query. If --start-date " - "is provided --end-date must also be provided.", -) -@click.option( - "--end-date", + "--temporal-range", type=str, default=None, - help="Date string. E.g. 2019-12-01. " - "The end date for the waterbody timeseries query. If --end-date is " - "provided --start-date must also be provided.", + help="Time range to generate the timeseries for, if `time_span` is set to" + "`custom`. Example '2020-05--P1M' for the month of May 2020, by default", ) @click.option( "--missing-only/--not-missing-only", @@ -64,14 +51,6 @@ "every waterbody polygon in the --waterbodies-vector-file file, and overwrite " "any existing csv files.", ) -@click.option( - "--include-uncertainity/--do-not-include-uncertainity", - default=True, - help="Option to include uncertainities in the output timeseries." - "If you specify --include-uncertainity then you will only " - "filter out timesteps with 100% invalid pixels. Else you will " - "filter out timesteps with more than 10% invalid pixels", -) @click.option( "--subset-polygon-ids", default=None, @@ -83,10 +62,8 @@ def generate_timeseries( use_id, output_directory, time_span, - start_date, - end_date, + temporal_range, missing_only, - include_uncertainity, subset_polygon_ids, verbose, ): @@ -95,14 +72,11 @@ def generate_timeseries( """ logging_setup(verbose=verbose) - # Convert strings to datetime. - if time_span == "custom": - time_expression = parse_expressions(f"time in [{start_date}, {end_date}]") - start_date_dt = time_expression["time"].begin - end_date_dt = time_expression["time"].end + # Parse string to list. 
+ if subset_polygon_ids is not None: + subset_polygon_ids = subset_polygon_ids.split(",") else: - start_date_dt = None - end_date_dt = None + subset_polygon_ids = [] generate_timeseries_from_wofs_ls( waterbodies_vector_file=waterbodies_vector_file, @@ -110,8 +84,6 @@ def generate_timeseries( use_id=use_id, missing_only=missing_only, time_span=time_span, - start_date=start_date_dt, - end_date=end_date_dt, + temporal_range=temporal_range, subset_polygons_ids=subset_polygon_ids, - include_uncertainity=include_uncertainity, ) diff --git a/deafrica_waterbodies/cli/get_dataset_ids.py b/deafrica_waterbodies/cli/get_dataset_ids.py deleted file mode 100644 index e63d8f7c..00000000 --- a/deafrica_waterbodies/cli/get_dataset_ids.py +++ /dev/null @@ -1,71 +0,0 @@ -import logging - -import click -import fsspec -import geopandas as gpd - -from deafrica_waterbodies.cli.logs import logging_setup -from deafrica_waterbodies.datasets import get_datasets_ids -from deafrica_waterbodies.io import check_if_s3_uri - - -@click.command("get-dataset-ids", no_args_is_help=True) -@click.option("-v", "--verbose", count=True) -@click.option( - "--aoi-vector-file", - type=str, - default=None, - help="Path to the vector file defining the area of interest.", -) -@click.option( - "--num-workers", type=int, help="Number of worker processes to use when filtering datasets." -) -@click.option( - "--dataset-ids-text-file", - type=click.Path(), - help="File URI or S3 URI of the text file to write the dataset ids to.", -) -def get_dataset_ids( - verbose, - aoi_vector_file, - num_workers, - dataset_ids_text_file, -): - """ - Get the dataset ids of the WOfS All Time summary datasets/scenes to generate - the waterbody polygons for. - """ - # Set up logger. - logging_setup(verbose=verbose) - _log = logging.getLogger(__name__) - - # Support pathlib Paths. - aoi_vector_file = str(aoi_vector_file) - dataset_ids_text_file = str(dataset_ids_text_file) - - # Load the area of interest as a GeoDataFrame. - if aoi_vector_file is not None: - try: - aoi_gdf = gpd.read_file(aoi_vector_file) - except Exception as error: - _log.exception(f"Could not read the file {aoi_vector_file}") - raise error - else: - aoi_gdf = None - - # Get the WOfS All Time Summary scene ids for the scenes whose extent - # intersects with the area of interest. - dataset_ids = get_datasets_ids(aoi_gdf=aoi_gdf, num_workers=num_workers) - - # Set the filesystem to use. - if check_if_s3_uri(dataset_ids_text_file): - fs = fsspec.filesystem("s3") - else: - fs = fsspec.filesystem("file") - - # Write the dataset ids to the text file. - with fs.open(dataset_ids_text_file, "w") as file: - for dataset_id in dataset_ids: - file.write(f"{dataset_id}\n") - - _log.info(f"Dataset IDs written to: {dataset_ids_text_file}.") diff --git a/deafrica_waterbodies/cli/group_options.py b/deafrica_waterbodies/cli/group_options.py deleted file mode 100644 index ffd9d2cc..00000000 --- a/deafrica_waterbodies/cli/group_options.py +++ /dev/null @@ -1,43 +0,0 @@ -import click -from click import Option, UsageError - - -# From https://gist.github.com/jacobtolar/fb80d5552a9a9dfc32b12a829fa21c0c -class MutuallyExclusiveOption(Option): - def __init__(self, *args, **kwargs): - self.mutually_exclusive = set(kwargs.pop("mutually_exclusive", [])) - help = kwargs.get("help", "") - if self.mutually_exclusive: - ex_str = ", ".join(self.mutually_exclusive) - kwargs["help"] = help + ( - " NOTE: This argument is mutually exclusive with " " arguments: [" + ex_str + "]." 
- ) - super(MutuallyExclusiveOption, self).__init__(*args, **kwargs) - - def handle_parse_result(self, ctx, opts, args): - if self.mutually_exclusive.intersection(opts) and self.name in opts: - raise UsageError( - "Illegal usage: `{}` is mutually exclusive with " - "arguments `{}`.".format(self.name, ", ".join(self.mutually_exclusive)) - ) - - return super(MutuallyExclusiveOption, self).handle_parse_result(ctx, opts, args) - - -def command_required_option_from_option(require_name, require_map): - class CommandOptionRequiredClass(click.Command): - def invoke(self, ctx): - require = ctx.params[require_name] - if require not in require_map: - raise click.ClickException( - "Unexpected value for --'{}': {}".format(require_name, require) - ) - if ctx.params[require_map[require].lower()] is None: - raise click.ClickException( - "With {}={} must specify option --{}".format( - require_name, require, require_map[require] - ) - ) - super(CommandOptionRequiredClass, self).invoke(ctx) - - return CommandOptionRequiredClass diff --git a/deafrica_waterbodies/cli/main.py b/deafrica_waterbodies/cli/main.py index e498f9cd..b1f56e88 100644 --- a/deafrica_waterbodies/cli/main.py +++ b/deafrica_waterbodies/cli/main.py @@ -1,13 +1,8 @@ import click import deafrica_waterbodies.__version__ -from deafrica_waterbodies.cli.filter_waterbody_polygons import filter_waterbody_polygons -from deafrica_waterbodies.cli.get_dataset_ids import get_dataset_ids -from deafrica_waterbodies.cli.merge_polygons_at_ds_boundaries import merge_polygons_at_ds_boundaries -from deafrica_waterbodies.cli.push_to_sqs_queue import push_to_sqs_queue -from deafrica_waterbodies.cli.run_from_sqs_queue import run_from_sqs_queue -from deafrica_waterbodies.cli.run_from_txt import run_from_txt -from deafrica_waterbodies.cli.write_final_output import write_final_output +from deafrica_waterbodies.cli.generate_polygons import generate_polygons +from deafrica_waterbodies.cli.generate_timeseries import generate_timeseries @click.version_option(package_name="deafrica_waterbodies", version=deafrica_waterbodies.__version__) @@ -16,10 +11,5 @@ def main(): pass -main.add_command(get_dataset_ids) -main.add_command(push_to_sqs_queue) -main.add_command(run_from_sqs_queue) -main.add_command(run_from_txt) -main.add_command(merge_polygons_at_ds_boundaries) -main.add_command(filter_waterbody_polygons) -main.add_command(write_final_output) +main.add_command(generate_polygons) +main.add_command(generate_timeseries) diff --git a/deafrica_waterbodies/cli/merge_polygons_at_ds_boundaries.py b/deafrica_waterbodies/cli/merge_polygons_at_ds_boundaries.py deleted file mode 100644 index ed021594..00000000 --- a/deafrica_waterbodies/cli/merge_polygons_at_ds_boundaries.py +++ /dev/null @@ -1,98 +0,0 @@ -import logging -import os - -import click -import geopandas as gpd -import pandas as pd - -from deafrica_waterbodies.cli.logs import logging_setup -from deafrica_waterbodies.io import find_parquet_files -from deafrica_waterbodies.make_polygons import merge_polygons_at_dataset_boundaries - - -@click.command("merge-polygons-at-ds-boundaries", no_args_is_help=True) -@click.option("-v", "--verbose", count=True) -@click.option( - "--output-directory", - type=click.Path(), - help="Directory containing the waterbody polygons.", -) -def merge_polygons_at_ds_boundaries(verbose, output_directory): - """ - Merge polygons at dataset boundaries. - """ - # Set up logger. - logging_setup(verbose=verbose) - _log = logging.getLogger(__name__) - - # Support pathlib paths. 
- output_directory = str(output_directory) - - # Directory containing the water body polygons generated from - # thresholding WOfS All time summary datasets. - polygons_from_thresholds_dir = os.path.join(output_directory, "polygons_from_thresholds") - - # Find all parquet files for the primary threshold. - primary_threshold_polygons_paths = find_parquet_files( - path=polygons_from_thresholds_dir, pattern=".*primary.*" - ) - _log.info(f"Found {len(primary_threshold_polygons_paths)} primary threshold polygons.") - - # Load all the primary threshold polygons into a single GeoDataFrame. - _log.info("Loading the primary threshold polygons parquet files..") - primary_threshold_polygons_list = [] - for path in primary_threshold_polygons_paths: - gdf = gpd.read_parquet(path) - primary_threshold_polygons_list.append(gdf) - - primary_threshold_polygons = pd.concat(primary_threshold_polygons_list, ignore_index=True) - _log.info(f"Found {len(primary_threshold_polygons)} primary threshold polygons.") - - _log.info("Merging primary threshold waterbody polygons located at dataset/scene boundaries...") - primary_threshold_polygons_merged = merge_polygons_at_dataset_boundaries( - primary_threshold_polygons - ) - _log.info(f"Primary threshold polygons count {len(primary_threshold_polygons_merged)}.") - - _log.info("Writing primary threshold polygons merged at dataset boundaries to disk..") - primary_threshold_polygons_output_fp = os.path.join( - output_directory, "primary_threshold_polygons_merged_at_ds_boundaries.parquet" - ) - - primary_threshold_polygons_merged.to_parquet(primary_threshold_polygons_output_fp) - _log.info(f"Polygons written to {primary_threshold_polygons_output_fp}") - - # Find all parquet files for the secondary threshold. - secondary_threshold_polygons_paths = find_parquet_files( - path=polygons_from_thresholds_dir, pattern=".*secondary.*" - ) - _log.info( - f"Found {len(secondary_threshold_polygons_paths)} parquet files for the secondary threshold polygons." - ) - - # Load all the secondary threshold polygons into a single GeoDataFrame. - _log.info("Loading the secondary threshold polygons parquet files...") - secondary_threshold_polygons_list = [] - for path in secondary_threshold_polygons_paths: - gdf = gpd.read_parquet(path) - secondary_threshold_polygons_list.append(gdf) - - secondary_threshold_polygons = pd.concat(secondary_threshold_polygons_list, ignore_index=True) - _log.info(f"Found {len(secondary_threshold_polygons)} secondary threshold polygons.") - - _log.info( - "Merging secondary threshold waterbody polygons located at dataset/scene boundaries..." 
- ) - secondary_threshold_polygons_merged = merge_polygons_at_dataset_boundaries( - secondary_threshold_polygons - ) - _log.info(f"Secondary threshold polygons count {len(secondary_threshold_polygons_merged)}.") - - _log.info("Writing secondary threshold polygons merged at dataset boundaries to disk..") - secondary_threshold_polygons_output_fp = os.path.join( - output_directory, "secondary_threshold_polygons_merged_at_ds_boundaries.parquet" - ) - - secondary_threshold_polygons_merged.to_parquet(secondary_threshold_polygons_output_fp) - - _log.info(f"Polygons written to {secondary_threshold_polygons_output_fp}") diff --git a/deafrica_waterbodies/cli/push_to_sqs_queue.py b/deafrica_waterbodies/cli/push_to_sqs_queue.py deleted file mode 100644 index 62f4f72b..00000000 --- a/deafrica_waterbodies/cli/push_to_sqs_queue.py +++ /dev/null @@ -1,63 +0,0 @@ -import logging - -import boto3 -import click - -from deafrica_waterbodies.cli.logs import logging_setup -from deafrica_waterbodies.queues import ( - get_queue_url, - move_to_dead_letter_queue, - push_dataset_ids_to_queue_from_txt, -) - - -@click.command("push-to-sqs-queue", no_args_is_help=True) -@click.option("-v", "--verbose", count=True) -@click.option( - "--dataset-ids-text-file", - type=click.Path(), - required=True, - help="Path to dataset ids text file.", -) -@click.option( - "--dataset-ids-queue", required=True, help="Name of the queue to push the dataset ids to." -) -@click.option( - "--max-retries", - default=10, - help="Maximum number of times to retry sending/receiving messages to/from a SQS queue.", -) -def push_to_sqs_queue(verbose, dataset_ids_text_file, dataset_ids_queue, max_retries): - """ - Push dataset ids from the lines of a text file to a SQS queue. - """ - logging_setup(verbose=verbose) - _log = logging.getLogger(__name__) # noqa F841 - - # Create an sqs client. - sqs_client = boto3.client("sqs") - - # Support pathlib paths. - dataset_ids_text_file = str(dataset_ids_text_file) - - failed_to_push = push_dataset_ids_to_queue_from_txt( - text_file_path=dataset_ids_text_file, - queue_name=dataset_ids_queue, - max_retries=max_retries, - sqs_client=sqs_client, - ) - - if failed_to_push: - # Push the failed dataset ids to the deadletter queue. 
- dead_letter_queue_name = f"{dataset_ids_queue}-deadletter" - dead_letter_queue_url = get_queue_url( - queue_name=dead_letter_queue_name, sqs_client=sqs_client - ) - - for idx in failed_to_push: - move_to_dead_letter_queue( - dead_letter_queue_url=dead_letter_queue_url, - message_body=idx, - max_retries=max_retries, - sqs_client=sqs_client, - ) diff --git a/deafrica_waterbodies/cli/run_from_sqs_queue.py b/deafrica_waterbodies/cli/run_from_sqs_queue.py deleted file mode 100644 index 34e24c09..00000000 --- a/deafrica_waterbodies/cli/run_from_sqs_queue.py +++ /dev/null @@ -1,239 +0,0 @@ -import logging -import os - -import boto3 -import click -import datacube -import fsspec -from rasterio.errors import RasterioIOError - -from deafrica_waterbodies.cli.logs import logging_setup -from deafrica_waterbodies.io import check_dir_exists, check_file_exists, check_if_s3_uri -from deafrica_waterbodies.make_polygons import check_wetness_thresholds, get_polygons_from_dataset -from deafrica_waterbodies.queues import ( - delete_batch_with_retry, - get_queue_url, - move_to_dead_letter_queue, - receive_a_message, -) - - -@click.command("run-from-sqs-queue", no_args_is_help=True) -@click.option("-v", "--verbose", count=True) -@click.option( - "--primary-threshold", - default=0.1, - type=float, - help="Threshold to define the location of the waterbody polygons.", - show_default=True, -) -@click.option( - "--secondary-threshold", - default=0.05, - type=float, - help="Threshold to define the extent/shape of the waterbody polygons.", - show_default=True, -) -@click.option( - "--minimum-valid-observations", - default=128, - type=int, - help="Minimum number of observations for a pixel to be considered valid.", - show_default=True, -) -@click.option( - "--output-directory", - type=click.Path(), - help="Directory to write the waterbody polygons to.", -) -@click.option( - "--dataset-ids-queue", - type=str, - help="Name of the SQS queue to read dataset IDs from.", -) -@click.option( - "--visibility-timeout", - default=18 * 60, - help="The duration in seconds that a received SQS msg is invisible.", -) -@click.option( - "--max-retries", - default=10, - help="Maximum number of times to retry sending/receiving messages to/from a SQS queue.", -) -@click.option( - "--overwrite/--no-overwrite", - default=False, - help="Rerun scenes that have already been processed.", -) -def run_from_sqs_queue( - verbose, - primary_threshold, - secondary_threshold, - minimum_valid_observations, - output_directory, - dataset_ids_queue, - visibility_timeout, - max_retries, - overwrite, -): - """ - Generate waterbody polygons from WOfS All Time Summary scenes whose ids are - in a SQS queue. - """ - # Set up logger. - logging_setup(verbose=verbose) - _log = logging.getLogger(__name__) - - # Support pathlib paths. - output_directory = str(output_directory) - - dask_chunks = {"x": 3200, "y": 3200, "time": 1} - resolution = (-30, 30) - output_crs = "EPSG:6933" - - # Set the filesystem to use. - if check_if_s3_uri(output_directory): - fs = fsspec.filesystem("s3") - else: - fs = fsspec.filesystem("file") - - # Directory to write generated waterbody polygons to. - polygons_from_thresholds_dir = os.path.join(output_directory, "polygons_from_thresholds") - - # Check if the directory exists. If it does not, create it. - if not check_dir_exists(polygons_from_thresholds_dir): - fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True) - _log.info(f"Created directory {polygons_from_thresholds_dir}") - - # Check if the wetness thresholds have been set correctly. 
- minimum_wet_thresholds = [secondary_threshold, primary_threshold] - _log.info(check_wetness_thresholds(minimum_wet_thresholds)) - - # Connect to the datacube. - dc = datacube.Datacube(app="GenerateWaterbodyPolygons") - - # Create the service client. - sqs_client = boto3.client("sqs") - - dataset_ids_queue_url = get_queue_url(queue_name=dataset_ids_queue, sqs_client=sqs_client) - # Get the dead-letter queue. - dead_letter_queue_name = f"{dataset_ids_queue}-deadletter" - dead_letter_queue_url = get_queue_url(queue_name=dead_letter_queue_name, sqs_client=sqs_client) - - retries = 0 - while retries <= max_retries: - # Retrieve a single message from the dataset_ids_queue. - message = receive_a_message( - queue_url=dataset_ids_queue_url, - max_retries=max_retries, - visibility_timeout=visibility_timeout, - sqs_client=sqs_client, - ) - if message is None: - retries += 1 - else: - retries = 0 # reset the count - - # Process the ID. - dataset_id = message["Body"] - _log.info(f"Read dataset id {dataset_id} from queue {dataset_ids_queue_url}") - - entry_to_delete = [ - {"Id": message["MessageId"], "ReceiptHandle": message["ReceiptHandle"]} - ] - - # Produce the primary and secondary threshold polygons. - success_flag = True - - primary_threshold_polygons_fp = os.path.join( - polygons_from_thresholds_dir, f"{dataset_id}_primary_threshold_polygons.parquet" - ) - secondary_threshold_polygons_fp = os.path.join( - polygons_from_thresholds_dir, f"{dataset_id}_secondary_threshold_polygons.parquet" - ) - - if not overwrite: - _log.info( - f"Checking existence of {primary_threshold_polygons_fp} and {secondary_threshold_polygons_fp}" - ) - exists = check_file_exists(primary_threshold_polygons_fp) and check_file_exists( - secondary_threshold_polygons_fp - ) - - if overwrite or not exists: - try: - ( - primary_threshold_polygons, - secondary_threshold_polygons, - ) = get_polygons_from_dataset( - dataset_id=dataset_id, - dask_chunks=dask_chunks, - resolution=resolution, - output_crs=output_crs, - min_valid_observations=minimum_valid_observations, - primary_threshold=primary_threshold, - secondary_threshold=secondary_threshold, - dc=dc, - ) - - # Write the polygons to parquet files. - primary_threshold_polygons.to_parquet(primary_threshold_polygons_fp) - secondary_threshold_polygons.to_parquet(secondary_threshold_polygons_fp) - - except KeyError as keyerr: - _log.exception(f"Found {dataset_id} has KeyError: {str(keyerr)}") - _log.error(f"Moving {dataset_id} to deadletter queue {dead_letter_queue_url}") - move_to_dead_letter_queue( - dead_letter_queue_url=dead_letter_queue_url, - message_body=dataset_id, - sqs_client=sqs_client, - ) - success_flag = False - except TypeError as typeerr: - _log.exception(f"Found {dataset_id} has TypeError: {str(typeerr)}") - _log.error(f"Moving {dataset_id} to deadletter queue {dead_letter_queue_url}") - move_to_dead_letter_queue( - dead_letter_queue_url=dead_letter_queue_url, - message_body=dataset_id, - sqs_client=sqs_client, - ) - success_flag = False - except RasterioIOError as ioerror: - _log.exception(f"Found {dataset_id} has RasterioIOError: {str(ioerror)}") - _log.error(f"Moving {dataset_id} to deadletter queue {dead_letter_queue_url}") - move_to_dead_letter_queue( - dead_letter_queue_url=dead_letter_queue_url, - message_body=dataset_id, - sqs_client=sqs_client, - ) - success_flag = False - else: - _log.info( - f"{primary_threshold_polygons_fp} and {secondary_threshold_polygons_fp} already exist, skipping" - ) - - # Delete datased id from queue. 
- if success_flag: - _log.info(f"Successful, deleting {dataset_id}") - ( - successfully_deleted, - failed_to_delete, - ) = delete_batch_with_retry( - queue_url=dataset_ids_queue_url, - entries=entry_to_delete, - max_retries=max_retries, - sqs_client=sqs_client, - ) - if failed_to_delete: - _log.error( - f"Failed to delete dataset id {dataset_id} from queue {dataset_ids_queue_url}" - ) - raise RuntimeError(f"Failed to delete dataset id: {dataset_id}") - else: - _log.info(f"Deleted dataset id {dataset_id} from queue") - - else: - _log.info( - f"Not successful, moved {dataset_id} to dead letter queue {dead_letter_queue_url}" - ) diff --git a/deafrica_waterbodies/cli/run_from_txt.py b/deafrica_waterbodies/cli/run_from_txt.py deleted file mode 100644 index 7f8778d3..00000000 --- a/deafrica_waterbodies/cli/run_from_txt.py +++ /dev/null @@ -1,145 +0,0 @@ -import logging -import os - -import click -import datacube -import fsspec - -from deafrica_waterbodies.cli.logs import logging_setup -from deafrica_waterbodies.io import check_dir_exists, check_file_exists, check_if_s3_uri -from deafrica_waterbodies.make_polygons import check_wetness_thresholds, get_polygons_from_dataset - - -@click.command("run-from-txt", no_args_is_help=True) -@click.option("-v", "--verbose", count=True) -@click.option( - "--primary-threshold", - default=0.1, - type=float, - help="Threshold to define the location of the waterbody polygons.", - show_default=True, -) -@click.option( - "--secondary-threshold", - default=0.05, - type=float, - help="Threshold to define the extent/shape of the waterbody polygons.", - show_default=True, -) -@click.option( - "--minimum-valid-observations", - default=128, - type=int, - help="Minimum number of observations for a pixel to be considered valid.", - show_default=True, -) -@click.option( - "--output-directory", - type=click.Path(), - help="Directory to write the waterbody polygons to.", -) -@click.option( - "--dataset-ids-text-file", - type=click.Path(), - required=True, - help="Path to dataset ids text file.", -) -@click.option( - "--overwrite/--no-overwrite", - default=False, - help="Rerun scenes that have already been processed.", -) -def run_from_txt( - verbose, - primary_threshold, - secondary_threshold, - minimum_valid_observations, - output_directory, - dataset_ids_text_file, - overwrite, -): - """ - Generate waterbody polygons from WOfS All Time Summary scenes whose ids are - in a text file. - """ - # Set up logger. - logging_setup(verbose=verbose) - _log = logging.getLogger(__name__) - - # Support pathlib paths. - output_directory = str(output_directory) - dataset_ids_text_file = str(dataset_ids_text_file) - - dask_chunks = {"x": 3200, "y": 3200, "time": 1} - resolution = (-30, 30) - output_crs = "EPSG:6933" - - # Read the dataset ids from the text file. - if not check_file_exists(dataset_ids_text_file): - _log.error(f"Could not find text file {dataset_ids_text_file}!") - raise FileNotFoundError(f"Could not find text file {dataset_ids_text_file}!") - else: - if check_if_s3_uri(dataset_ids_text_file): - fs = fsspec.filesystem("s3") - else: - fs = fsspec.filesystem("file") - with fs.open(dataset_ids_text_file, "r") as file: - lines = file.readlines() - dataset_ids = [line.strip() for line in lines] - - # Directory to write generated waterbody polygons to. - polygons_from_thresholds_dir = os.path.join(output_directory, "polygons_from_thresholds") - - # Set the filesystem to use. 
- if check_if_s3_uri(polygons_from_thresholds_dir): - fs = fsspec.filesystem("s3") - else: - fs = fsspec.filesystem("file") - - # Check if the directory exists. If it does not, create it. - if not check_dir_exists(polygons_from_thresholds_dir): - fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True) - _log.info(f"Created directory {polygons_from_thresholds_dir}") - - # Check if the wetness thresholds have been set correctly. - minimum_wet_thresholds = [secondary_threshold, primary_threshold] - _log.info(check_wetness_thresholds(minimum_wet_thresholds)) - - # Connect to the datacube. - dc = datacube.Datacube(app="GenerateWaterbodyPolygons") - - # For each dataset id, threshold the scene to generate the primary and secondary threshold - # waterbody polygons. - for dataset_id in dataset_ids: - primary_threshold_polygons_fp = os.path.join( - polygons_from_thresholds_dir, f"{dataset_id}_primary_threshold_polygons.parquet" - ) - secondary_threshold_polygons_fp = os.path.join( - polygons_from_thresholds_dir, f"{dataset_id}_secondary_threshold_polygons.parquet" - ) - - if not overwrite: - _log.info( - f"Checking existence of {primary_threshold_polygons_fp} and {secondary_threshold_polygons_fp}" - ) - exists = check_file_exists(primary_threshold_polygons_fp) and check_file_exists( - secondary_threshold_polygons_fp - ) - - if overwrite or not exists: - ( - primary_threshold_polygons, - secondary_threshold_polygons, - ) = get_polygons_from_dataset( - dataset_id=dataset_id, - dask_chunks=dask_chunks, - resolution=resolution, - output_crs=output_crs, - min_valid_observations=minimum_valid_observations, - primary_threshold=primary_threshold, - secondary_threshold=secondary_threshold, - dc=dc, - ) - # Write the polygons to parquet files. - primary_threshold_polygons.to_parquet(primary_threshold_polygons_fp) - secondary_threshold_polygons.to_parquet(secondary_threshold_polygons_fp) diff --git a/deafrica_waterbodies/cli/write_final_output.py b/deafrica_waterbodies/cli/write_final_output.py deleted file mode 100644 index af796b66..00000000 --- a/deafrica_waterbodies/cli/write_final_output.py +++ /dev/null @@ -1,70 +0,0 @@ -import logging -import os - -import click -import geopandas as gpd - -from deafrica_waterbodies.attributes import ( - add_area_and_perimeter_attributes, - add_timeseries_attribute, - assign_unique_ids, -) -from deafrica_waterbodies.cli.logs import logging_setup -from deafrica_waterbodies.io import write_waterbodies_to_file - - -@click.command("write-final-output", no_args_is_help=True) -@click.option("-v", "--verbose", count=True) -@click.option( - "--output-directory", - type=click.Path(), - help="Directory containing the waterbody polygons.", -) -@click.option( - "--product-version", - type=str, - default="0.0.1", - show_default=True, - help="Product version for the DE Africa Waterbodies product.", -) -@click.option( - "--timeseries-bucket", - type=str, - show_default=True, - help="The s3 bucket to containing the timeseries for the polygons.", -) -def write_final_output( - verbose, - output_directory, - product_version, - timeseries_bucket, -): - # Set up logger. - logging_setup(verbose=verbose) - _log = logging.getLogger(__name__) - - # Support pathlib paths. 
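
The removed `run-from-txt` command reads its dataset ids through fsspec so the same code path works for local files and S3 URIs. A minimal sketch of that pattern (the file path below is a placeholder):

import fsspec

dataset_ids_text_file = "s3://example-bucket/waterbodies/dataset_ids.txt"  # placeholder path

fs = fsspec.filesystem("s3" if dataset_ids_text_file.startswith("s3://") else "file")
with fs.open(dataset_ids_text_file, "r") as file:
    dataset_ids = [line.strip() for line in file.readlines()]
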
- output_directory = str(output_directory) - - _log.info("Loading filtered waterbody polygons...") - filtered_polygons_fp = os.path.join(output_directory, "filtered_polygons.parquet") - filtered_polygons = gpd.read_parquet(filtered_polygons_fp) - _log.info(f"Waterbody polygons count {len(filtered_polygons)}.") - - waterbodies_gdf = assign_unique_ids(polygons=filtered_polygons) - waterbodies_gdf = add_area_and_perimeter_attributes(polygons=waterbodies_gdf) - waterbodies_gdf = add_timeseries_attribute( - polygons=waterbodies_gdf, - product_version=product_version, - timeseries_bucket=timeseries_bucket, - ) - - # Reproject to EPSG:4326 - waterbodies_gdf_4326 = waterbodies_gdf.to_crs("EPSG:4326") - - # Write to disk. - write_waterbodies_to_file( - waterbodies_gdf=waterbodies_gdf_4326, - product_version=product_version, - output_directory=output_directory, - ) diff --git a/deafrica_waterbodies/datasets.py b/deafrica_waterbodies/datasets.py deleted file mode 100644 index 954443e2..00000000 --- a/deafrica_waterbodies/datasets.py +++ /dev/null @@ -1,106 +0,0 @@ -import multiprocessing -from functools import partial - -import datacube -import geopandas as gpd -import tqdm - - -def check_ds_intersects_polygons( - polygons_gdf: gpd.GeoDataFrame | None, ds: datacube.model.Dataset -) -> str: - """ - Check if the extent of a dataset intersects with a set of polygons. - - Parameters - ---------- - polygons_gdf : gpd.GeoDataFrame | None - ds : datacube.model.Dataset - - Returns - ------- - str | None - Dataset id if the dataset's extent intersects with the polygons. - """ - if polygons_gdf is not None: - # Get the extent of the dataset. - ds_extent = ds.extent - # Reproject the extent of the dataset to match the polygons. - ds_extent = ds_extent.to_crs(polygons_gdf.crs) - # Get the shapely geometry of the reprojected extent of the dataset. - ds_extent_geom = ds_extent.geom - # Check if the dataset's extent intersects with any of the polygons. - check_intersection = polygons_gdf.geometry.intersects(ds_extent_geom).any() - if check_intersection: - return str(ds.id) - else: - return "" - else: - return str(ds.id) - - -def filter_datasets( - dss: list[datacube.model.Dataset], polygons_gdf: gpd.GeoDataFrame | None, num_workers: int = 8 -) -> list[str]: - """ - Filter out datasets that do not intersect with a set of polygons, using a - multi-process approach to run the `check_ds_intersects_polygons` function. - - Parameters - ---------- - dss : list[datacube.model.Dataset] - A list of Datasets to filter. - polygons_gdf : gpd.GeoDataFrame | None - A set of polygons in a GeoDataFrame - num_workers : int, optional - Number of worker processes to use during multi-processing, by default 8 - - Returns - ------- - list[str] - A list of the filtered datasets ids. - """ - with multiprocessing.Pool(processes=num_workers) as pool: - filtered_datasets_ids_ = list( - tqdm.tqdm(pool.imap(partial(check_ds_intersects_polygons, polygons_gdf), dss)) - ) - - # Remove empty strings. - filtered_datasets_ids = [item for item in filtered_datasets_ids_ if item] - - return filtered_datasets_ids - - -def get_datasets_ids( - aoi_gdf: gpd.GeoDataFrame | None, dc: datacube.Datacube | None = None, num_workers: int = 8 -) -> list[str]: - """ - Get the dataset ids of the WOfS All Time Summary datasets whose extents intersect - with any of the area of interest polygons. 
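
`filter_datasets` in the deleted `datasets.py` fans the per-dataset intersection check out over a `multiprocessing.Pool`, fixing the shared `polygons_gdf` argument with `functools.partial` and wrapping the iterator in `tqdm` for progress reporting. The toy sketch below shows the same pattern with a trivial predicate standing in for `check_ds_intersects_polygons`:

import multiprocessing
from functools import partial

import tqdm


def keep_if_above(threshold: int, value: int) -> int | None:
    # Stand-in for check_ds_intersects_polygons(polygons_gdf, ds): the first
    # argument is fixed with functools.partial, the second is mapped over.
    return value if value > threshold else None


if __name__ == "__main__":
    values = list(range(20))
    with multiprocessing.Pool(processes=4) as pool:
        results = list(
            tqdm.tqdm(pool.imap(partial(keep_if_above, 10), values), total=len(values))
        )
    # Drop the "empty" results, mirroring how empty dataset id strings are dropped.
    kept = [item for item in results if item is not None]
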
- - Parameters - ---------- - aoi_gdf : gpd.GeoDataFrame | None - Area of interest - dc : datacube.Datacube | None, optional - Datacube connection, by default None - num_workers : int, optional - Number of worker processes to use when filtering datasets, by default 8 - Returns - ------- - list[str] - Dataset ids of the WOfS All Time Summary datasets whose extents intersect - with any of the area of interest polygons. - """ - # Connect to the datacube. - if dc is None: - dc = datacube.Datacube(app="WaterbodiesPolygons") - - # Find all datasets available for the WOfS All Time summary product. - dss = dc.index.datasets.search(product=["wofs_ls_summary_alltime"]) - dss = list(dss) - - # Filter the datasets to the area of interest. - filtered_datasets_ids = filter_datasets(dss, aoi_gdf, num_workers) - - return filtered_datasets_ids diff --git a/deafrica_waterbodies/filters.py b/deafrica_waterbodies/filters.py index 580efbd7..4ddc3dcd 100644 --- a/deafrica_waterbodies/filters.py +++ b/deafrica_waterbodies/filters.py @@ -3,13 +3,11 @@ """ import logging import math -import warnings -from pathlib import Path import geopandas as gpd import numpy as np import pandas as pd -import xarray as xr +from shapely.geometry import MultiPolygon, Polygon _log = logging.getLogger(__name__) @@ -89,19 +87,17 @@ def filter_by_intersection( def filter_by_area( - primary_threshold_polygons: gpd.GeoDataFrame | None, - secondary_threshold_polygons: gpd.GeoDataFrame | None, + polygons_gdf: gpd.GeoDataFrame, min_polygon_size: float = 4500, max_polygon_size: float = math.inf, -) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: +) -> gpd.GeoDataFrame: """ - Filter the primary and secondary threshold polygons using the minimum and + Filter a set of water body polygons using the minimum and maximum area. Parameters ---------- - primary_threshold_polygons : gpd.GeoDataFrame - secondary_threshold_polygons : gpd.GeoDataFrame + polygons_gdf : gpd.GeoDataFrame min_polygon_size : float, optional Minimum area of a waterbody polygon to be included in the output polygons, by default 4500 max_polygon_size : float, optional @@ -109,544 +105,206 @@ def filter_by_area( Returns ------- - tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: - The area filtered primary threshold polygons and the area filtered - secondary threshold polygons. + gpd.GeoDataFrame: + The area filtered water body polygons. """ - if primary_threshold_polygons is not None and secondary_threshold_polygons is not None: - assert primary_threshold_polygons.crs == secondary_threshold_polygons.crs - - try: - crs = primary_threshold_polygons.crs - except Exception: - crs = secondary_threshold_polygons.crs - + crs = polygons_gdf.crs assert crs.is_projected - if primary_threshold_polygons is not None: - _log.info( - f"Filtering primary threshold polygons by minimum area {min_polygon_size} and max area {max_polygon_size}..." - ) + _log.info( + f"Filtering {len(polygons_gdf)} polygons by minimum area {min_polygon_size} and max area {max_polygon_size}..." + ) - primary_threshold_polygons["area"] = pd.to_numeric(primary_threshold_polygons.area) - area_filtered_primary_threshold_polygons = primary_threshold_polygons.loc[ - ( - (primary_threshold_polygons["area"] > min_polygon_size) - & (primary_threshold_polygons["area"] <= max_polygon_size) - ) - ] - area_filtered_primary_threshold_polygons.reset_index(drop=True, inplace=True) - _log.info( - f"Filtered out {len(primary_threshold_polygons) - len(area_filtered_primary_threshold_polygons)} primary threshold polygons." 
+ polygons_gdf["area_m2"] = pd.to_numeric(polygons_gdf.area) + area_filtered_polygons_gdf = polygons_gdf.loc[ + ( + (polygons_gdf["area_m2"] > min_polygon_size) + & (polygons_gdf["area_m2"] <= max_polygon_size) ) - else: - area_filtered_primary_threshold_polygons = None - - if secondary_threshold_polygons is not None: - _log.info(f"Filtering secondary threshold polygons by max area {max_polygon_size}...") + ] + area_filtered_polygons_gdf = gpd.GeoDataFrame(data=area_filtered_polygons_gdf) - secondary_threshold_polygons["area"] = pd.to_numeric(secondary_threshold_polygons.area) - area_filtered_secondary_threshold_polygons = secondary_threshold_polygons.loc[ - secondary_threshold_polygons["area"] <= max_polygon_size - ] - area_filtered_secondary_threshold_polygons.reset_index(drop=True, inplace=True) - _log.info( - f"Filtered out {len(secondary_threshold_polygons) - len(area_filtered_secondary_threshold_polygons)} secondary threshold polygons." - ) - else: - area_filtered_secondary_threshold_polygons = None + _log.info(f"Filtered out {len(polygons_gdf) - len(area_filtered_polygons_gdf)} polygons.") - return area_filtered_primary_threshold_polygons, area_filtered_secondary_threshold_polygons + return area_filtered_polygons_gdf -def filter_using_land_sea_mask( - primary_threshold_polygons: gpd.GeoDataFrame, - secondary_threshold_polygons: gpd.GeoDataFrame, - land_sea_mask_fp: str | Path = "", -) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: +def pp_test_gdf(input_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: """ - Filter the primary and secondary threshold waterbody polygons using a land/sea - mask to filter out ocean polygons. + Function to calculate the Polsby–Popper test values on a + geopandas GeoDataFrame. Parameters ---------- - primary_threshold_polygons : gpd.GeoDataFrame - secondary_threshold_polygons : gpd.GeoDataFrame - land_sea_mask_fp : str | Path, optional - Vector file path to the polygons to use to filter out ocean waterbody polygons, by default "" + input_gdf : gpd.GeoDataFrame + Polygons to calculate the Polsby–Popper test values for. Returns ------- - tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: - The filtered primary threshold polygons and the filtered - secondary threshold polygons with ocean polygons removed. + gpd.GeoDataFrame + Polygons GeoDataFrame with a column `pp_test` containing the Polsby–Popper test values + for each polygon. """ - assert primary_threshold_polygons.crs == secondary_threshold_polygons.crs - crs = primary_threshold_polygons.crs - - # Support pathlib Paths - land_sea_mask_fp = str(land_sea_mask_fp) + crs = input_gdf.crs + assert crs.is_projected - if land_sea_mask_fp: - _log.info( - "Filtering out ocean polygons from the primary and secondary threshold waterbody polygons..." - ) - try: - land_sea_mask = gpd.read_file(land_sea_mask_fp).to_crs(crs) - except Exception as error: - _log.exception(f"Could not read file {land_sea_mask_fp}") - raise error - else: - inland_primary_threshold_polygons = filter_by_intersection( - gpd_data=primary_threshold_polygons, - gpd_filter=land_sea_mask, - filtertype="intersects", - invert_mask=True, - return_inverse=False, - ) - _log.info( - f"Filtered out {len(primary_threshold_polygons) - len(inland_primary_threshold_polygons)} primary threshold polygons." 
- ) - - inland_secondary_threshold_polygons = filter_by_intersection( - gpd_data=secondary_threshold_polygons, - gpd_filter=land_sea_mask, - filtertype="intersects", - invert_mask=True, - return_inverse=False, - ) - _log.info( - f"Filtered out {len(secondary_threshold_polygons) - len(inland_secondary_threshold_polygons)} secondary threshold polygons." - ) - - return inland_primary_threshold_polygons, inland_secondary_threshold_polygons + input_gdf["area"] = input_gdf["geometry"].area + input_gdf["perimeter"] = input_gdf["geometry"].length + input_gdf["pp_test"] = (4 * math.pi * input_gdf["area"]) / (input_gdf["perimeter"] ** 2) - else: - _log.info("Skipping filtering out ocean polygons step.") - return primary_threshold_polygons, secondary_threshold_polygons + return input_gdf -def filter_using_urban_mask( - primary_threshold_polygons: gpd.GeoDataFrame, - secondary_threshold_polygons: gpd.GeoDataFrame, - urban_mask_fp: str | Path = "", -) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: +# From https://stackoverflow.com/a/70387141 +def remove_polygon_interiors(poly: Polygon) -> Polygon: """ - Filter out the missclassified waterbodies from the primary and secondary - threshold polygons using an urban/CBDs mask. - WOfS has a known limitation, where deep shadows thrown by tall CBD buildings - are misclassified as water. This results in 'waterbodies' around these - misclassified shadows in capital cities. + Close polygon holes by limitation to the exterior ring. Parameters ---------- - primary_threshold_polygons : gpd.GeoDataFrame - secondary_threshold_polygons : gpd.GeoDataFrame - urban_mask_fp : str | Path, optional - Vector file path to the polygons to use to filter out CBDs, by default "" + poly : Polygon + Input Polygon. Returns ------- - tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: - Primary threshold polygons with missclassified waterbodies removed. + Polygon + Input Polygon without any interior holes. """ - crs = primary_threshold_polygons.crs - - if urban_mask_fp: - _log.info( - "Filtering out CBDs polygons from the primary and secondary threshold polygons..." - ) - try: - urban_mask = gpd.read_file(urban_mask_fp).to_crs(crs) - except Exception as error: - _log.exception(f"Could not read file {urban_mask_fp}") - raise error - else: - cbd_filtered_primary_threshold_polygons = filter_by_intersection( - gpd_data=primary_threshold_polygons, - gpd_filter=urban_mask, - filtertype="intersects", - invert_mask=True, - return_inverse=False, - ) - - _log.info( - f"Filtered out {len(primary_threshold_polygons) - len(cbd_filtered_primary_threshold_polygons)} primary threshold polygons." - ) - - cbd_filtered_secondary_threshold_polygons = filter_by_intersection( - gpd_data=secondary_threshold_polygons, - gpd_filter=urban_mask, - filtertype="intersects", - invert_mask=True, - return_inverse=False, - ) - - _log.info( - f"Filtered out {len(secondary_threshold_polygons) - len(cbd_filtered_secondary_threshold_polygons)} secondary threshold polygons." 
- ) - - return ( - cbd_filtered_primary_threshold_polygons, - cbd_filtered_secondary_threshold_polygons, - ) + if len(poly.interiors) > 0: + return Polygon(list(poly.exterior.coords)) else: - _log.info("Skipping filtering out CBDs step.") - return primary_threshold_polygons, secondary_threshold_polygons + return poly -def merge_primary_and_secondary_threshold_polygons( - primary_threshold_polygons: gpd.GeoDataFrame, - secondary_threshold_polygons: gpd.GeoDataFrame, -) -> gpd.GeoDataFrame: +def get_largest_polygon(poly_list: list) -> Polygon: """ - Identify secondary threshold polygons that intersect with the primary threshold - polygons and merge them with the primary threshold polygons. + Get the largest polygon from a list of polygons. Parameters ---------- - primary_threshold_polygons : gpd.GeoDataFrame - secondary_threshold_polygons : gpd.GeoDataFrame + poly_list : list + List of polygons to filter. Returns ------- - gpd.GeoDataFrame - Merged primary and secondary threshold polygons. + Polygon + The largest polygon by area from the list of polygons. """ - assert primary_threshold_polygons.crs == secondary_threshold_polygons.crs - crs = primary_threshold_polygons.crs - - _log.info("Merging the primary threshold and secondary threshold polygons...") - # Find the polygons identified using the secondary threshold that intersect with those identified - # using the primary threshold. - do_intersect_with_primary = filter_by_intersection( - gpd_data=secondary_threshold_polygons, - gpd_filter=primary_threshold_polygons, - filtertype="intersects", - invert_mask=False, - return_inverse=False, - ) - - # Combine the identified polygons with the primary threshold polygons. - combined_polygons = gpd.GeoDataFrame( - pd.concat([do_intersect_with_primary, primary_threshold_polygons], ignore_index=True) - ) - # Merge overlapping polygons. - merged_combined_polygons_geoms = combined_polygons.unary_union - # `Explode` the multipolygon back out into individual polygons. - merged_combined_polygons = gpd.GeoDataFrame(crs=crs, geometry=[merged_combined_polygons_geoms]) - merged_combined_polygons = merged_combined_polygons.explode(index_parts=True) - - _log.info(f"Waterbody polygons count after merge: {len(merged_combined_polygons)}.") - return merged_combined_polygons - - -def filter_using_major_rivers_mask( - waterbody_polygons: gpd.GeoDataFrame, major_rivers_mask_fp: str | Path = "" -) -> gpd.GeoDataFrame: + # Get the area of each polygon in the list. + poly_areas = [poly.area for poly in poly_list] + # Get the largest area. + max_area = max(poly_areas) + # Get the index for the largest area. + max_area_idx = poly_areas.index(max_area) + # Use the index to get the largest polygon. + largest_polygon = poly_list[max_area_idx] + return largest_polygon + + +def fill_holes(geom: Polygon | MultiPolygon) -> Polygon | MultiPolygon: """ - Filter out major rivers polygons from a set of waterbody polygons. + Fill holes in polygon. Parameters ---------- - waterbody_polygons : gpd.GeoDataFrame - major_rivers_mask_fp : str | Path, optional - Vector file path to the polygons to use to filter out major river waterbody polygons, by default "" + geom : Polygon | MultiPolygon + Polygon or MultiPolygon to fill holes in. Returns ------- - gpd.GeoDataFrame - Filtered set of waterbody polygons with major rivers polygons removed. - + Polygon | MultiPolygon + Polygon or MultiPolygon with no holes. 
""" - crs = waterbody_polygons.crs - - if major_rivers_mask_fp: - _log.info("Filtering out major rivers polygons from the waterbody polygons...") - try: - major_rivers = gpd.read_file(major_rivers_mask_fp).to_crs(crs) - except Exception as error: - _log.exception(f"Could not read file {major_rivers_mask_fp}") - raise error + if isinstance(geom, MultiPolygon): + # For each polygon in the MultiPolygon, + # close the polygon holes. + closed_polygons = [remove_polygon_interiors(g) for g in geom.geoms] + # Get the largest polygon. + largest_polygon = get_largest_polygon(closed_polygons) + # Get the polygons not within the largest polygon. + outside_largest_polygon = [ + poly for poly in closed_polygons if not poly.within(largest_polygon) + ] + + if outside_largest_polygon: + return MultiPolygon([largest_polygon, *outside_largest_polygon]) else: - major_rivers_filtered_polygons = filter_by_intersection( - gpd_data=waterbody_polygons, - gpd_filter=major_rivers, - filtertype="intersects", - invert_mask=True, - return_inverse=False, - ) - _log.info( - f"Filtered out {len(waterbody_polygons) - len(major_rivers_filtered_polygons)} waterbody polygons." - ) - return major_rivers_filtered_polygons - else: - _log.info("Skipping filtering out major rivers polygons step.") - return waterbody_polygons + return largest_polygon + elif isinstance(geom, Polygon): + return remove_polygon_interiors(geom) -def pp_test_gdf(input_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: +def remove_polygons_within_polygons(polygons_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: """ - Function to calculate the Polsby–Popper test values on a - geopandas GeoDataFrame. + Remove polygons within other polygons. Parameters ---------- - input_gdf : gpd.GeoDataFrame - Polygons to calculate the Polsby–Popper test values for. + polygons_gdf : gpd.GeoDataFrame + Set of polygons to filter. Returns ------- gpd.GeoDataFrame - Polygons GeoDataFrame with a column `pp_test` containing the Polsby–Popper test values - for each polygon. + Input polygons with polygons contained in other polygons removed. """ - crs = input_gdf.crs - assert crs.is_projected - - input_gdf["area"] = input_gdf["geometry"].area - input_gdf["perimeter"] = input_gdf["geometry"].length - input_gdf["pp_test"] = (4 * math.pi * input_gdf["area"]) / (input_gdf["perimeter"] ** 2) - - return input_gdf + _log.info(f"Initial polygon count {len(polygons_gdf)}") + polygons_to_delete = [] + for row in polygons_gdf.itertuples(): + row_id = row.Index + row_geom = row.geometry -def split_large_polygons( - waterbody_polygons: gpd.GeoDataFrame, pp_thresh: float = 0.005, method: str = "nothing" -) -> gpd.GeoDataFrame: - """ - Function to split large polygons. + polygons_to_check_against = polygons_gdf.loc[polygons_gdf.index != row_id] - Parameters - ---------- - waterbody_polygons : gpd.GeoDataFrame - Set of polygons for which to split the large polygons. - pp_thresh : float, optional - Threshold for the Polsby–Popper test values of the polygons by which to - classify is a polygon is large or not, by default 0.005 - method : str, optional - Method to use to split large polygons., by default "nothing" + # Check if the row geometry is within any of the other polygons. + if polygons_to_check_against.geometry.contains(row_geom).any(): + polygons_to_delete.append(row_id) - Returns - ------- - gpd.GeoDataFrame - Set of polygons with large polygons split. 
- """ - - warnings.simplefilter(action="ignore", category=FutureWarning) + if polygons_to_delete: + polygons_to_delete_gdf = polygons_gdf.loc[polygons_gdf.index.isin(polygons_to_delete)] + _log.info(f"Found {len(polygons_to_delete_gdf)} polygons within polygons.") - # Confirm the option to use. - valid_options = ["erode-dilate-v1", "erode-dilate-v2", "nothing"] - if method not in valid_options: + polygons_within_polygons_removed = polygons_gdf.loc[ + ~polygons_gdf.index.isin(polygons_to_delete) + ] _log.info( - f"{method} method not implemented to handle large polygons. Defaulting to not splitting large polygons." + f"Polygon count after removing polygons within polygons {len(polygons_within_polygons_removed)}." ) - method = "nothing" - - crs = waterbody_polygons.crs - assert crs.is_projected - # Calculate the Polsby–Popper values. - waterbody_polygons_ = pp_test_gdf(input_gdf=waterbody_polygons) + return polygons_within_polygons_removed - # Split large polygons. - if method == "nothing": - info_msg = ( - "You have chosen not to split large polygons. If you meant to use this option, please " - f"select one of the following methods: {valid_options[:2]}." - ) - _log.info(info_msg) - return waterbody_polygons_ else: - _log.info( - f"Splitting large polygons using the `{method}` method, using the threshold {pp_thresh}." - ) - if method == "erode-dilate-v1": - splittable_polygons = waterbody_polygons_[waterbody_polygons_.pp_test <= pp_thresh] - not_splittable_polygons = waterbody_polygons_[waterbody_polygons_.pp_test > pp_thresh] - - splittable_polygons_buffered = splittable_polygons.buffer(-50) - split_polygons = ( - splittable_polygons_buffered.explode(index_parts=True) - .reset_index(drop=True) - .buffer(50) - ) - split_polygons_gdf = gpd.GeoDataFrame(geometry=split_polygons, crs=crs) - split_polygons_gdf = pp_test_gdf(input_gdf=split_polygons_gdf) - - large_polygons_handled = pd.concat( - [not_splittable_polygons, split_polygons_gdf], ignore_index=True - ) - return large_polygons_handled - elif method == "erode-dilate-v2": - splittable_polygons = waterbody_polygons_[waterbody_polygons_.pp_test <= pp_thresh] - not_splittable_polygons = waterbody_polygons_[waterbody_polygons_.pp_test > pp_thresh] - - if len(splittable_polygons) >= 1: - splittable_polygons_buffered = splittable_polygons.buffer(-100) - splittable_polygons_buffered = splittable_polygons_buffered.buffer(125) - - splittable_polygons_buffered_union = gpd.GeoDataFrame( - geometry=[splittable_polygons_buffered.unary_union], crs=crs - ) - subtracted = ( - gpd.overlay( - splittable_polygons, splittable_polygons_buffered_union, how="difference" - ) - .explode(index_parts=True) - .reset_index(drop=True) - ) - resubtracted = ( - gpd.overlay(splittable_polygons, subtracted, how="difference") - .explode(index_parts=True) - .reset_index(drop=True) - ) - - # Assign each chopped-off bit of the polygon to its nearest big - # neighbour. 
- unassigned = np.ones(len(subtracted), dtype=bool) - recombined = [] - - for row in resubtracted.itertuples(): - mask = subtracted.exterior.intersects(row.geometry.exterior) & unassigned - neighbours = subtracted[mask] - unassigned[mask] = False - poly = row.geometry.union(neighbours.unary_union) - recombined.append(poly) - - recombined_gdf = gpd.GeoDataFrame(geometry=recombined, crs=crs) - # Get only the actual geometry objects that are neither missing nor empty - recombined_gdf_masked = recombined_gdf[ - ~(recombined_gdf.geometry.is_empty | recombined_gdf.geometry.isna()) - ] - # All remaining polygons are not part of a big polygon. - results = pd.concat( - [recombined_gdf_masked, subtracted[unassigned], not_splittable_polygons], - ignore_index=True, - ) - - results = pp_test_gdf(input_gdf=results) - - large_polygons_handled = results.explode(index_parts=True).reset_index(drop=True) - return large_polygons_handled - else: - info_msg = ( - f"There are no polygons with a Polsby–Popper score above the {pp_thresh}. " - "No polygons were split." - ) - _log.info(info_msg) - return waterbody_polygons_ - - -def filter_waterbodies( - primary_threshold_polygons: gpd.GeoDataFrame, - secondary_threshold_polygons: gpd.GeoDataFrame, - min_polygon_size: float = 4500, - max_polygon_size: float = math.inf, - land_sea_mask_fp: str | Path = "", - urban_mask_fp: str | Path = "", - major_rivers_mask_fp: str | Path = "", - handle_large_polygons: str = "nothing", - pp_test_threshold: float = 0.005, + _log.info("Found no polygons within polygons.") + return polygons_gdf + + +def filter_by_length( + polygons_gdf: gpd.GeoDataFrame, length_threshold_km: float = 150 ) -> gpd.GeoDataFrame: """ - Apply filters to the primary and secondary threshold waterbody - polygons. + Filter out polygons whose length is greater than the length threshold. Parameters ---------- - primary_threshold_polygons : gpd.GeoDataFrame, optional - Waterbody polygons generated using the primary threshold. - secondary_threshold_polygons : gpd.GeoDataFrame, optional - Waterbody polygons generated using the secondary threshold. - min_polygon_size : float, optional - Minimum area of a waterbody polygon to be included in the output polygons, by default 4500 - max_polygon_size : float, optional - Maximum area of a waterbody polygon to be included in the output polygons, by default math.inf - land_sea_mask_fp : str | Path, optional - Vector file path to the polygons to use to filter out ocean waterbody polygons, by default "" - urban_mask_fp : str | Path, optional - Vector file path to the polygons to use to filter out CBDs, by default "" - major_rivers_mask_fp : str | Path, optional - Vector file path to the polygons to use to filter out major river waterbody polygons, by default "" - handle_large_polygons : str, optional - Method to use to split large water body polygons, by default "nothing" - pp_test_threshold : float, optional - Polsby-Popper test value to use when splitting large polygons using the method specified in `handle_large_polygons`, by default 0.005 + polygons_gdf : gpd.GeoDataFrame + Polygons to filter. + length_threshold_km : float, optional + Length threshold in kilometers by which to filter out large polygons, by default 150 Returns ------- gpd.GeoDataFrame - Filtered set of waterbody polygons. + Polygons with large polygons filtered out. 
""" - _log.info(f"Primary threshold polygons count {len(primary_threshold_polygons)}.") - _log.info(f"Secondary threshold polygons count {len(secondary_threshold_polygons)}.") - - ( - area_filtered_primary_threshold_polygons, - area_filtered_secondary_threshold_polygons, - ) = filter_by_area( - primary_threshold_polygons=primary_threshold_polygons, - secondary_threshold_polygons=secondary_threshold_polygons, - min_polygon_size=min_polygon_size, - max_polygon_size=max_polygon_size, - ) - - ( - inland_primary_threshold_polygons, - inland_secondary_threshold_polygons, - ) = filter_using_land_sea_mask( - primary_threshold_polygons=area_filtered_primary_threshold_polygons, - secondary_threshold_polygons=area_filtered_secondary_threshold_polygons, - land_sea_mask_fp=land_sea_mask_fp, - ) - - ( - cbd_filtered_primary_threshold_polygons, - cbd_filtered_secondary_threshold_polygons, - ) = filter_using_urban_mask( - primary_threshold_polygons=inland_primary_threshold_polygons, - secondary_threshold_polygons=inland_secondary_threshold_polygons, - urban_mask_fp=urban_mask_fp, - ) + length_threshold_m = length_threshold_km * 1000 - merged_polygons = merge_primary_and_secondary_threshold_polygons( - primary_threshold_polygons=cbd_filtered_primary_threshold_polygons, - secondary_threshold_polygons=cbd_filtered_secondary_threshold_polygons, - ) + filtered_polygons_gdf = polygons_gdf[polygons_gdf["length_m"] <= length_threshold_m] - major_rivers_filtered_polygons = filter_using_major_rivers_mask( - waterbody_polygons=merged_polygons, major_rivers_mask_fp=major_rivers_mask_fp + _log.info( + f"Filtered out {len(polygons_gdf) - len(filtered_polygons_gdf)} polygons out of {len(polygons_gdf)} polygons." ) - large_polygons_handled = split_large_polygons( - waterbody_polygons=major_rivers_filtered_polygons, - pp_thresh=pp_test_threshold, - method=handle_large_polygons, - ) - - # Reapply the size filtering, just to check that all of the split and filtered waterbodies are - # still in the size range we want. - area_filtered_large_polygons_handled, _ = filter_by_area( - primary_threshold_polygons=large_polygons_handled, - secondary_threshold_polygons=None, - min_polygon_size=min_polygon_size, - max_polygon_size=max_polygon_size, - ) + filtered_polygons_gdf = gpd.GeoDataFrame(data=filtered_polygons_gdf) - # Return a GeoDataFrame with the geometry column only. - filtered_polygons = gpd.GeoDataFrame( - geometry=area_filtered_large_polygons_handled["geometry"], - crs=area_filtered_large_polygons_handled.crs, - ) - - return filtered_polygons - - -def filter_hydrosheds_land_mask(hydrosheds_land_mask: xr.DataArray) -> xr.DataArray: - """ - Function to filter the HydroSHEDs Land Mask into a boolean mask. - """ - # Indicator values: 1 = land, 2 = ocean sink, 3 = inland sink, 255 is no data. 
- boolean_mask = (hydrosheds_land_mask != 255) & (hydrosheds_land_mask != 2) - return boolean_mask + return filtered_polygons_gdf diff --git a/deafrica_waterbodies/group_polygons.py b/deafrica_waterbodies/group_polygons.py new file mode 100644 index 00000000..5c954442 --- /dev/null +++ b/deafrica_waterbodies/group_polygons.py @@ -0,0 +1,138 @@ +import logging +import os + +import fsspec +import geopandas as gpd +import numpy as np +import pandas as pd + +from deafrica_waterbodies.id_field import guess_id_field +from deafrica_waterbodies.io import check_dir_exists, check_if_s3_uri + +_log = logging.getLogger(__name__) + + +def get_intersecting_polygons_ids( + region: gpd.GeoDataFrame, polygons_gdf: gpd.GeoDataFrame +) -> gpd.GeoDataFrame: + """ + Get the IDs of the polygons that intersect with a region. + + Parameters + ---------- + region : gpd.GeoDataFrame + A single row GeoDataFrame of the product region of interest. + polygons_gdf : gpd.GeoDataFrame + A set of polygons to filter by intersection with the region. + + Returns + ------- + gpd.GeoDataFrame + The single row GeoDataFrame of the product region of interest with a + column containing the ids of the polygons that intersect with the region. + """ + assert len(region) == 1 + + intersecting_polygons_ids = gpd.sjoin( + polygons_gdf, region, how="inner", predicate="intersects" + ).index.to_list() + region["intersecting_polygons_ids"] = ",".join(intersecting_polygons_ids) + + return region + + +def export_polygons( + region: pd.Series, polygons_gdf: gpd.GeoDataFrame, output_directory: str +) -> str: + """ + Export the set of polygons for a region as a parquet file. + + Parameters + ---------- + region : pd.Series + The row in a DataFrame representing a region. + polygons_gdf : gpd.GeoDataFrame + The set of polygons to select from. + output_directory : str + The output directory to write the output parquet file to. + + Returns + ------- + str + The file path of the output parquet file. + """ + region_id = region.name + polygon_ids = region.intersecting_polygons_ids.split(",") + + output_fp = os.path.join(output_directory, f"{region_id}.parquet") + + polygons_gdf.loc[polygon_ids].reset_index().to_parquet(output_fp) + + _log.info(f"Polygons for region {region_id} written to {output_fp}") + + return output_fp + + +def split_polygons_by_region( + polygons_gdf: gpd.GeoDataFrame, + output_directory: str, + product: str = "wofs_ls", +) -> dict: + """ + Split a set of polygons by the regions in a DE Africa's product regions + GeoJSON file. + + Parameters + ---------- + polygons_gdf : gpd.GeoDataFrame + The set of polygons to split by region. + output_directory : str + The directory to write the parquet files for the GeoDataFrames + from the split by regions. + product : str, optional + The DE Africa product to use to get the regions and region codes, by default "wofs_ls" + + Returns + ------- + dict + A dictionary of the region codes and the file path to the polygons that + intersect with the region. + """ + id_field = guess_id_field(polygons_gdf) + _log.info(f"Guessed ID field: {id_field}") + + # Set the ID field as the index. + polygons_gdf.set_index(id_field, inplace=True) + + # Load the regions file. + product_regions_fp = f"https://explorer.digitalearth.africa/api/regions/{product}" + product_regions = gpd.read_file(product_regions_fp).to_crs(polygons_gdf.crs) + product_regions.set_index("region_code", inplace=True) + + # Split each row in the product_regions into a GeoDataFrame of its own. 
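
Note that `filter_by_length` above expects a precomputed `length_m` column rather than measuring the geometries itself. A minimal sketch, assuming the polygon perimeter is an acceptable stand-in for how `length_m` is populated upstream:

import geopandas as gpd
from shapely.geometry import box

from deafrica_waterbodies.filters import filter_by_length

polygons = gpd.GeoDataFrame(
    geometry=[box(0, 0, 1000, 1000), box(0, 0, 100_000, 2000)], crs="EPSG:6933"
)
# Assumed here: "length_m" is the polygon perimeter in metres.
polygons["length_m"] = polygons.geometry.length

# Keep polygons whose length is at most 150 km; the 100 km x 2 km strip
# (204 km perimeter) is dropped.
short_enough = filter_by_length(polygons, length_threshold_km=150)
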
+ regions = np.array_split(product_regions, len(product_regions)) + assert len(regions) == len(product_regions) + + # For each region get the IDs for the polygons that intersect with the region. + regions_ = [get_intersecting_polygons_ids(region, polygons_gdf) for region in regions] + + # Filter to remove regions with no intersecting polygons. + filtered_regions = [region for region in regions_ if region.iloc[0].intersecting_polygons_ids] + + filtered_regions_gdf = pd.concat(filtered_regions, ignore_index=False) + + if not check_dir_exists(output_directory): + if check_if_s3_uri(output_directory): + fs = fsspec.filesystem("s3") + else: + fs = fsspec.filesystem("file") + + fs.mkdirs(output_directory, exist_ok=True) + _log.info(f"Created directory {output_directory}") + + # Export each regions' polygons as a parquet file. + filtered_regions_gdf["polygon_file_paths"] = filtered_regions_gdf.apply( + lambda row: export_polygons(row, polygons_gdf, output_directory), axis=1 + ) + + return filtered_regions_gdf["polygon_file_paths"].to_dict() diff --git a/deafrica_waterbodies/io.py b/deafrica_waterbodies/io.py index 3a1260fd..2280bccc 100644 --- a/deafrica_waterbodies/io.py +++ b/deafrica_waterbodies/io.py @@ -183,8 +183,8 @@ def upload_file_to_s3( def write_waterbodies_to_file( waterbodies_gdf: gpd.GeoDataFrame, - product_version: str, output_directory: str | Path, + file_name_prefix: str = "waterbodies", ): """ Function to write waterbody polygons to an ESRI Shapefile. @@ -193,13 +193,12 @@ def write_waterbodies_to_file( ---------- waterbodies_gdf : gpd.GeoDataFrame The waterbody polygons. - product_version: str, - The DE Africa Waterbodies service product version. output_directory : str | Path, S3 URI or File URI of the directory to write the waterbody polygons to. - + file_name_prefix: str, optional + Prefix to use when naming the output file(s). """ - output_fn = f"waterbodiesv{product_version.replace('.', '-')[0]}.shp" + output_fn = f"{file_name_prefix}.shp" output_fp = os.path.join(output_directory, output_fn) if check_if_s3_uri(output_directory): @@ -208,7 +207,7 @@ def write_waterbodies_to_file( # Get the bucket name and object prefix. output_bucket_name = urllib.parse.urlparse(output_directory).netloc - output_object_prefix = urllib.parse.urlparse(output_directory).path.lstrip("/") + output_object_prefix = urllib.parse.urlparse(output_directory).path.lstrip("/").rstrip("/") # Make a temporary folder locally. fs = fsspec.filesystem("file") @@ -283,3 +282,36 @@ def find_parquet_files(path: str | Path, pattern: str = ".*") -> [str]: pq_file_paths = [f"s3://{file}" for file in pq_file_paths] return pq_file_paths + + +def convert_shapefile_2_parquet(shapefile_fp: str | Path): + """ + Convert a shapefile to a parquet file. + + Parameters + ---------- + shapefile_fp : str | Path + File path or S3 URI of the shapefile to convert. + + """ + shapefile_fp = str(shapefile_fp) + + # Get the parent directory of the shapefile. + dir_name = os.path.dirname(shapefile_fp) + # Get the file name of the shapefile without the file extenstion. + base_name = os.path.splitext(os.path.basename(shapefile_fp))[0] + + # Get the parquet file path. + parquet_fp = os.path.join(dir_name, f"{base_name}.parquet") + + # Read the shapefile. + try: + shapefile_gdf = gpd.read_file(shapefile_fp) + except Exception as error: + _log.exception(f"Could not read file {shapefile_fp}") + _log.error(error) + raise error + + # Save the GeoDataFrame to a parquet file. 
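
`split_polygons_by_region` above groups a waterbody GeoDataFrame by DE Africa product regions and writes one parquet file per region. An illustrative call (the input path and the presence of a unique-id column that `guess_id_field` can detect are assumptions):

import geopandas as gpd

from deafrica_waterbodies.group_polygons import split_polygons_by_region

# Placeholder input: merged waterbody polygons with a unique id column (e.g. "UID").
polygons_gdf = gpd.read_parquet("s3://example-bucket/waterbodies/filtered_polygons.parquet")

# Writes one <region_code>.parquet file per intersecting region and returns a
# {region_code: file_path} mapping.
region_to_file_path = split_polygons_by_region(
    polygons_gdf=polygons_gdf,
    output_directory="s3://example-bucket/waterbodies/polygons_split_by_region",
    product="wofs_ls",
)
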
+ shapefile_gdf.to_parquet(parquet_fp) + _log.info(f"Saved to {parquet_fp}") diff --git a/deafrica_waterbodies/make_polygons.py b/deafrica_waterbodies/make_polygons.py index 65225827..1ef433e0 100644 --- a/deafrica_waterbodies/make_polygons.py +++ b/deafrica_waterbodies/make_polygons.py @@ -9,111 +9,69 @@ import logging from pathlib import Path -from typing import Callable +from types import ModuleType import datacube import datacube.model import geopandas as gpd import numpy as np import pandas as pd +import scipy.ndimage as ndi import shapely -from datacube.testutils.io import rio_slurp_xarray +import xarray as xr from deafrica_tools.spatial import xr_vectorize +from skimage import measure, morphology +from skimage.segmentation import watershed -from deafrica_waterbodies.filters import filter_by_intersection, filter_hydrosheds_land_mask +from deafrica_waterbodies.filters import filter_by_intersection _log = logging.getLogger(__name__) -def check_wetness_thresholds(minimum_wet_thresholds: list) -> str: +def set_wetness_thresholds( + detection_threshold: int | float = 0.1, extent_threshold: int | float = 0.05 +) -> list: """ - Function to validate the wetness thresholds. + Function to set and validate the minimum frequency for water a pixel must have + to be included. Parameters ---------- - minimum_wet_thresholds : list - A list containing the primary and secondary thresholds, with the secondary - threshold listed first. + detection_threshold : int | float + Threshold used to set the location of the waterbody polygons. + extent_threshold : int | float + Threshold used to set the shape/extent of the waterbody polygons. Returns ------- - str - Validation message - + list + A list containing the extent and detection thresholds with the extent + threshold listed first. """ - # Test whether the wetness threshold has been correctly set. - - if minimum_wet_thresholds[0] > minimum_wet_thresholds[-1]: - _log.error("Primary threshold value is less than the secondary threshold.") - error_msg = ( - "We will be running a hybrid wetness threshold. " - "Please ensure that the primary threshold has a higher value than the " - "secondary threshold. \n" + # Check for correct value type. + assert detection_threshold is not None + assert extent_threshold is not None + assert isinstance(detection_threshold, float) or isinstance(detection_threshold, int) + assert isinstance(extent_threshold, float) or isinstance(extent_threshold, int) + + # Check values. + assert 0 <= detection_threshold <= 1 + assert 0 <= extent_threshold <= 1 + + if extent_threshold > detection_threshold: + _log.error( + f"Detection threshold {detection_threshold} is less than the extent threshold {extent_threshold}." ) + error_msg = """We will be running a hybrid wetness threshold. + Please ensure that the detection threshold has a higher value than the extent threshold.""" raise ValueError(error_msg) else: - print_msg = ( - "We will be running a hybrid wetness threshold. \n" - f"**You have set {minimum_wet_thresholds[-1]} as the " - "primary threshold, which will define the location of the waterbody " - f"polygons \n with {minimum_wet_thresholds[0]} set as the supplementary " - "threshold, which will define the extent/shape of the waterbody polygons.**" + _log.info( + f"""We will be running a hybrid wetness threshold. + You have set {detection_threshold} as the location threshold, which will define the location of the waterbody polygons. 
+ You have set {extent_threshold} as the extent threshold, which will define the extent/shape of the waterbody polygons.""" ) - return print_msg - - -def merge_polygons_at_dataset_boundaries(waterbody_polygons: gpd.GeoDataFrame) -> gpd.GeoDataFrame: - """ - Function to merge waterbody polygons located at WOfS All Time Summary dataset boundaries. - - Parameters - ---------- - waterbody_polygons : gpd.GeoDataFrame - The waterbody polygons. - - Returns - ------- - gpd.GeoDataFrame - Waterbody polygons with polygons located at WOfS All Time Summary dataset boundaries merged. - """ - # Get the dataset extents/regions for the WOfS All Time Summary product. - ds_extents = gpd.read_file( - "https://explorer.digitalearth.africa/api/regions/wofs_ls_summary_alltime" - ).to_crs(waterbody_polygons.crs) - - # Add a 1 pixel (30 m) buffer to the dataset extents. - buffered_30m_ds_extents_geom = ds_extents.boundary.buffer( - 30, cap_style="flat", join_style="mitre" - ) - buffered_30m_ds_extents = gpd.GeoDataFrame( - geometry=buffered_30m_ds_extents_geom, crs=waterbody_polygons.crs - ) - - # Get the polygons at the dataset boundaries. - boundary_polygons, not_boundary_polygons = filter_by_intersection( - gpd_data=waterbody_polygons, - gpd_filter=buffered_30m_ds_extents, - invert_mask=False, - return_inverse=True, - ) - - # Now combine overlapping polygons in boundary_polygons. - merged_boundary_polygons_geoms = shapely.ops.unary_union(boundary_polygons["geometry"]) - - # `Explode` the multipolygon back out into individual polygons. - merged_boundary_polygons = gpd.GeoDataFrame( - crs=waterbody_polygons.crs, geometry=[merged_boundary_polygons_geoms] - ) - merged_boundary_polygons = merged_boundary_polygons.explode(index_parts=True).reset_index( - drop=True - ) - - # Then combine our merged_boundary_polygons with the not_boundary_polygons. - all_polygons = gpd.GeoDataFrame( - pd.concat([not_boundary_polygons, merged_boundary_polygons], ignore_index=True, sort=True) - ).set_geometry("geometry") - - return all_polygons + return [extent_threshold, detection_threshold] def merge_polygons_at_tile_boundaries( @@ -126,7 +84,7 @@ def merge_polygons_at_tile_boundaries( ---------- waterbody_polygons : gpd.GeoDataFrame The waterbody polygons. - tile_exents_gdf: gpd.GeoDataFrame + tile_extents_gdf: gpd.GeoDataFrame The extents of the tiles used to generate the waterbody polygons. Returns @@ -172,70 +130,75 @@ def merge_polygons_at_tile_boundaries( return all_polygons -def get_polygons_from_dataset( - dataset_id: str, - dask_chunks: dict[str, int] = {"x": 3200, "y": 3200, "time": 1}, - resolution: tuple[int, int] = (-30, 30), - output_crs: str = "EPSG:6933", - min_valid_observations: int = 128, - primary_threshold: float = 0.1, - secondary_threshold: float = 0.05, - dc: datacube.Datacube | None = None, -) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: +def load_wofs_frequency( + tile: tuple[tuple[int, int], datacube.api.grid_workflow.Tile], + grid_workflow: datacube.api.GridWorkflow, + plugin: ModuleType, + dask_chunks: dict[str, int] = { + "x": 3200, + "y": 3200, + "time": 1, + }, # based on Landsat WOfS scene size + min_valid_observations: int = 60, + min_wet_thresholds: list[int | float] = [0.05, 0.1], + land_sea_mask_fp: str | Path = "", +) -> tuple[xr.DataArray, xr.DataArray]: """ - Generate water body polygons by thresholding a WOfS All Time Summary dataset. + Load the WOfS All-Time Summary frequency measurement for a tile and threshold the data + using the extent and the detection thresholds. 
Parameters ---------- - dataset_id : str - The dataset id of a WOfs All Time summary dataset for which to + tile : tuple[tuple[int,int], datacube.api.grid_workflow.Tile] + The WOfS All Time Summary Tile object for which to generate waterbody polygons for. + grid_workflow: datacube.api.GridWorkflow, + Grid Workflow used to generate the tiles and to be used to load the Tile object. + plugin: ModuleType + A validated plugin to load filtering masks with. dask_chunks : dict, optional dask_chunks to use to load WOfS data, by default {"x": 3200, "y": 3200, "time": 1} - resolution : tuple[int, int], optional - Resolution to use to load WOfS data, by default (-30, 30) - output_crs : str, optional - CRS to load data and for the output waterbody polygons, by default "EPSG:6933" min_valid_observations : int, optional - Threshold to use to mask out pixels based on the number of valid WOfS observations for each pixel, by default 128 - primary_threshold : float, optional - Threshold to use to determine the location of the waterbody polygons, by default 0.1 - secondary_threshold : float, optional - Threshold to use to determine the extent / shape of the waterbodies polygons, by default 0.05 - dc : datacube.Datacube | None, optional - Datacube connection, by default None + Threshold to use to mask out pixels based on the number of valid WOfS + observations for each pixel, by default 60 + min_wet_thresholds: list[int | float], optional + A list containing the extent threshold and the detection threshold, with + the extent threshold listed first, by default [0.05, 0.1] + land_sea_mask_fp: str | Path, optional + File path to raster to use to mask ocean pixels in WOfS data, by default "" Returns ------- - tuple[gpd.GeoDataFrame, gpd.GeoDataFrame] - A tuple containing GeoDataFrames of waterbody polygons generated from thresholding WOfS All Time Summary data - using the primary and secondary thresholds. + tuple[valid_detection: xr.DataArray, valid_extent: xr.DataArray] + WOfS All Time Summary frequency measurement thresholded using the detection and extent + thresholds. """ - # Set up the primary and secondary thresholds. - minimum_wet_thresholds = [secondary_threshold, primary_threshold] + if land_sea_mask_fp: + assert plugin is not None - # Create a datacube query object. - query = dict( - dask_chunks=dask_chunks, - resolution=resolution, - output_crs=output_crs, - ) + # Set up the detection and extent thresholds. + extent_threshold = min_wet_thresholds[0] + detection_threshold = min_wet_thresholds[-1] - # Connect to the datacube. - if dc is None: - dc = datacube.Datacube(app="GenerateWaterbodyPolygons") - - # Get the dataset. - dataset = dc.index.datasets.get(dataset_id) + # Get the tile id and tile object. + tile_id = tile[0] # noqa F841 + tile_object = tile[1] - # Generate the waterbody polygons using the primary and secondary thresholds, - # from the dataset. + # Load the WOfS All-Time Summary data for the tile and threshold the data + # using the extent and the detection thresholds. try: - _log.info(f"Generating water body polygons for dataset {dataset_id}") + # Load the data for the tile. + wofs_alltime_summary = grid_workflow.load(tile_object, dask_chunks=dask_chunks).squeeze() + + # Load the land sea mask. + if land_sea_mask_fp: + boolean_land_sea_mask = plugin.load_land_sea_mask( + land_sea_mask_fp=land_sea_mask_fp, wofs_alltime_summary_ds=wofs_alltime_summary + ) - # Load the WOfS All-Time Summary dataset. 
- wofs_alltime_summary = dc.load(datasets=[dataset], **query).squeeze() + # Mask the WOfS All-Time Summary dataset using the boolean land sea mask. + wofs_alltime_summary = wofs_alltime_summary.where(boolean_land_sea_mask) # Set the no-data values to nan. # Masking here is done using the frequency measurement because for multiple @@ -251,456 +214,267 @@ def get_polygons_from_dataset( wofs_alltime_summary.count_clear >= min_valid_observations ) - # Generate the polygons. - generated_polygons = {} - for threshold in minimum_wet_thresholds: - # Mask any pixels whose frequency of water detection is less than the threshold. - wofs_alltime_summary_valid_wetness = wofs_alltime_summary.frequency > threshold + # Threshold using the detection threshold. + detection = wofs_alltime_summary.frequency > detection_threshold + valid_detection = (detection > 0) & wofs_alltime_summary_valid_clear_count - # Now find pixels that meet both the minimum valid observations - # and minimum wet threshold criteria. - wofs_alltime_summary_valid = wofs_alltime_summary_valid_wetness.where( - wofs_alltime_summary_valid_wetness & wofs_alltime_summary_valid_clear_count - ) - - # Convert the raster to polygons. - # We use a mask of '1' to only generate polygons around values of '1' (not NaNs). - polygons_mask = wofs_alltime_summary_valid == 1 - - polygons = xr_vectorize( - wofs_alltime_summary_valid, - mask=polygons_mask, - crs=wofs_alltime_summary.geobox.crs, - ) - - # Combine any overlapping polygons. - merged_polygon_geoms = shapely.ops.unary_union(polygons["geometry"]) - - # Turn the combined multipolygon back into a GeoDataFrame. - try: - merged_polygons = gpd.GeoDataFrame(geometry=list(merged_polygon_geoms.geoms)) - except AttributeError: - merged_polygons = gpd.GeoDataFrame(geometry=[merged_polygon_geoms]) - - # We need to add the crs back onto the GeoDataFrame. - merged_polygons.crs = wofs_alltime_summary.geobox.crs - - generated_polygons[threshold] = merged_polygons + # Threshold the using the extent threshold. + extent = wofs_alltime_summary.frequency > extent_threshold + valid_extent = (extent > 0) & wofs_alltime_summary_valid_clear_count except Exception as error: - _log.exception( - f"\nDataset {str(dataset_id)} did not run. \n" - "This is probably because there are no waterbodies present in this dataset." - ) _log.exception(error) - - primary_threshold_polygons = generated_polygons[primary_threshold] - secondary_threshold_polygons = generated_polygons[secondary_threshold] - - return primary_threshold_polygons, secondary_threshold_polygons + raise error + else: + return valid_detection, valid_extent -def get_polygons_from_dataset_with_land_sea_mask_filtering( - dataset_id: str, - dask_chunks: dict[str, int] = {"x": 3200, "y": 3200, "time": 1}, - resolution: tuple[int, int] = (-30, 30), - output_crs: str = "EPSG:6933", - min_valid_observations: int = 128, - primary_threshold: float = 0.1, - secondary_threshold: float = 0.05, - dc: datacube.Datacube | None = None, - land_sea_mask_fp: str | Path = "", - resampling_method: str = "bilinear", - filter_land_sea_mask: Callable = filter_hydrosheds_land_mask, -) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: +def remove_small_waterbodies(waterbody_raster: np.ndarray, min_size: int = 6) -> np.ndarray: """ - Generate water body polygons by thresholding a WOfS All Time Summary dataset. - Use a raster land/sea mask to mask out ocean pixels from the WOfS data before - vectorizing the polygons. 
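
The new `set_wetness_thresholds` helper replaces `check_wetness_thresholds` and returns the list that `load_wofs_frequency` expects for its `min_wet_thresholds` parameter, extent threshold first. A short usage sketch; the tile, grid workflow and plugin arguments are omitted because they come from the tiling and plugin machinery elsewhere in the package:

from deafrica_waterbodies.make_polygons import set_wetness_thresholds

# Validates that the detection (location) threshold is not lower than the
# extent (shape) threshold and returns [extent_threshold, detection_threshold].
min_wet_thresholds = set_wetness_thresholds(detection_threshold=0.1, extent_threshold=0.05)
assert min_wet_thresholds == [0.05, 0.1]

# These values are then passed straight through, e.g.:
#   load_wofs_frequency(tile, grid_workflow, plugin, min_wet_thresholds=min_wet_thresholds)
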
+ Remove water bodies from the raster that are smaller than the specified number of pixels. Parameters ---------- - dataset_id : str - The dataset id of a WOfs All Time summary dataset for which to - generate waterbody polygons for. - dask_chunks : dict, optional - dask_chunks to use to load WOfS data, by default {"x": 3200, "y": 3200, "time": 1} - resolution : tuple[int, int], optional - Resolution to use to load WOfS data, by default (-30, 30) - output_crs : str, optional - CRS to load data and for the output waterbody polygons, by default "EPSG:6933" - min_valid_observations : int, optional - Threshold to use to mask out pixels based on the number of valid WOfS observations for each pixel, by default 128 - primary_threshold : float, optional - Threshold to use to determine the location of the waterbody polygons, by default 0.1 - secondary_threshold : float, optional - Threshold to use to determine the extent / shape of the waterbodies polygons, by default 0.05 - dc : datacube.Datacube | None, optional - Datacube connection, by default None - land_sea_mask_fp: str | Path, optional - File path to raster to use to mask ocean pixels in WOfS data, by default "" - resampling_method: str, optional - Resampling method to use when loading the land sea mask raster, by default "bilinear" - filter_land_sea_mask: Callable, optional - Function to apply to the land sea mask xr.DataArray to generate a boolean - mask where pixels with a value of True are land pixels and pixels with a - value of False are ocean pixels, by default `filter_hydrosheds_land_mask` + waterbody_raster : np.ndarray + Raster image to filter. + min_size : int, optional + The smallest allowable waterbody size i.e. minimum number of pixels a waterbody must have, by default 6 Returns ------- - tuple[gpd.GeoDataFrame, gpd.GeoDataFrame] - A tuple containing GeoDataFrames of waterbody polygons generated from thresholding WOfS All Time Summary data - using the primary and secondary thresholds. - + np.ndarray + Raster image with small waterbodies removed. """ - # Set up the primary and secondary thresholds. - minimum_wet_thresholds = [secondary_threshold, primary_threshold] - # Create a datacube query object. - query = dict( - dask_chunks=dask_chunks, - resolution=resolution, - output_crs=output_crs, + waterbodies_labelled = morphology.label(waterbody_raster, background=0) + waterbodies_small_removed = morphology.remove_small_objects( + waterbodies_labelled, min_size=min_size, connectivity=1 ) - # Connect to the datacube. - if dc is None: - dc = datacube.Datacube(app="GenerateWaterbodyPolygons") - - # Get the dataset. - dataset = dc.index.datasets.get(dataset_id) - - # Generate the waterbody polygons using the primary and secondary thresholds, - # from the dataset. - try: - _log.info(f"Generating water body polygons for dataset {dataset_id}") + return waterbodies_small_removed - # Load the WOfS All-Time Summary dataset. - wofs_alltime_summary = dc.load(datasets=[dataset], **query).squeeze() - # Load the land sea mask. - if land_sea_mask_fp: - land_sea_mask = rio_slurp_xarray( - fname=land_sea_mask_fp, - gbox=wofs_alltime_summary.geobox, - resampling=resampling_method, - ) - - # Filter the land sea mask. - boolean_land_sea_mask = filter_land_sea_mask(land_sea_mask) - - # Mask the WOfS All-Time Summary dataset using the boolean land sea mask. 
- wofs_alltime_summary = wofs_alltime_summary.where(boolean_land_sea_mask) +def select_waterbodies_for_segmentation( + waterbodies_labelled: np.ndarray, min_size: int = 1000 +) -> np.ndarray: + """ + Select waterbodies larger than the specified number of pixels for segmentation. - # Set the no-data values to nan. - # Masking here is done using the frequency measurement because for multiple - # areas NaN values are present in the frequency measurement but the - # no data value -999 is not present in the count_clear and - # count_wet measurements. - # Note: it seems some pixels with NaN values in the frequency measurement - # have a value of zero in the count_clear and/or the count_wet measurements. - wofs_alltime_summary = wofs_alltime_summary.where(~np.isnan(wofs_alltime_summary.frequency)) + Parameters + ---------- + waterbodies_labelled : np.ndarray + Raster image to filter. + min_size : int, optional + Minimum number of pixels to classify a waterbody as large, by default 1000 - # Mask pixels not observed at least min_valid_observations times. - wofs_alltime_summary_valid_clear_count = ( - wofs_alltime_summary.count_clear >= min_valid_observations - ) + Returns + ------- + np.ndarray + Raster containing the large waterbodies to be segmented. + """ - # Generate the polygons. - generated_polygons = {} - for threshold in minimum_wet_thresholds: - # Mask any pixels whose frequency of water detection is less than the threshold. - wofs_alltime_summary_valid_wetness = wofs_alltime_summary.frequency > threshold + props = measure.regionprops(waterbodies_labelled) - # Now find pixels that meet both the minimum valid observations - # and minimum wet threshold criteria. - wofs_alltime_summary_valid = wofs_alltime_summary_valid_wetness.where( - wofs_alltime_summary_valid_wetness & wofs_alltime_summary_valid_clear_count - ) + labels_to_keep = [] + for region_prop in props: + count = region_prop.num_pixels + label = region_prop.label - # Convert the raster to polygons. - # We use a mask of '1' to only generate polygons around values of '1' (not NaNs). - polygons_mask = wofs_alltime_summary_valid == 1 + if count > min_size: + labels_to_keep.append(label) - polygons = xr_vectorize( - wofs_alltime_summary_valid, - mask=polygons_mask, - crs=wofs_alltime_summary.geobox.crs, - ) + segment_image = np.where(np.isin(waterbodies_labelled, labels_to_keep), 1, 0) - # Combine any overlapping polygons. - merged_polygon_geoms = shapely.ops.unary_union(polygons["geometry"]) + return segment_image - # Turn the combined multipolygon back into a GeoDataFrame. - try: - merged_polygons = gpd.GeoDataFrame(geometry=list(merged_polygon_geoms.geoms)) - except AttributeError: - merged_polygons = gpd.GeoDataFrame(geometry=[merged_polygon_geoms]) - # We need to add the crs back onto the GeoDataFrame. - merged_polygons.crs = wofs_alltime_summary.geobox.crs +def generate_segmentation_markers( + marker_source: np.ndarray, erosion_radius: int = 1, min_size: int = 100 +) -> np.ndarray: + """ + Create watershed segmentation markers. - generated_polygons[threshold] = merged_polygons + Parameters + ---------- + marker_source : np.ndarray + Raster image to generate watershed segmentation markers from. + erosion_radius : int, optional + Radius to use to generate footprint for erosion., by default 1 + min_size : int, optional + The smallest allowable object size, by default 100 - except Exception as error: - _log.exception( - f"\nDataset {str(dataset_id)} did not run. 
\n" - "This is probably because there are no waterbodies present in this dataset." - ) - _log.exception(error) + Returns + ------- + np.ndarray + Watershed segmentation markers. + """ + markers = morphology.erosion(marker_source, footprint=morphology.disk(radius=erosion_radius)) + markers_relabelled = morphology.label(markers, background=0) - primary_threshold_polygons = generated_polygons[primary_threshold] - secondary_threshold_polygons = generated_polygons[secondary_threshold] + markers_acceptable_size = morphology.remove_small_objects( + markers_relabelled, min_size=min_size, connectivity=1 + ) - return primary_threshold_polygons, secondary_threshold_polygons + return markers_acceptable_size -def get_polygons_from_tile( - tile: tuple[tuple[int, int], datacube.api.grid_workflow.Tile], - grid_workflow: datacube.api.GridWorkflow, - dask_chunks: dict[str, int] = {"x": 3200, "y": 3200, "time": 1}, - min_valid_observations: int = 128, - primary_threshold: float = 0.1, - secondary_threshold: float = 0.05, -) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: +def run_watershed( + waterbodies_for_segmentation: np.ndarray, segmentation_markers: np.ndarray +) -> np.ndarray: """ - Generate water body polygons by thresholding a WOfS All Time Summary tile. + Segment large waterbodies. Parameters ---------- - tile : tuple[tuple[int,int], datacube.api.grid_workflow.Tile] - The WOfs All Time summary Tile object for which to - generate waterbody polygons for. - grid_workflow: datacube.api.GridWorkflow, - Grid Workflow used to generate the tiles and to be used to load the Tile object. - dask_chunks : dict, optional - dask_chunks to use to load WOfS data, by default {"x": 3200, "y": 3200, "time": 1} - min_valid_observations : int, optional - Threshold to use to mask out pixels based on the number of valid WOfS observations for each pixel, by default 128 - primary_threshold : float, optional - Threshold to use to determine the location of the waterbody polygons, by default 0.1 - secondary_threshold : float, optional - Threshold to use to determine the extent / shape of the waterbodies polygons, by default 0.05 + waterbodies_for_segmentation : np.ndarray + Raster image containing the large waterbodies to be segmented. + segmentation_markers : np.ndarray + Raster image containing the watershed segmentation markers. Returns ------- - tuple[gpd.GeoDataFrame, gpd.GeoDataFrame] - A tuple containing GeoDataFrames of waterbody polygons generated from thresholding WOfS All Time Summary data - using the primary and secondary thresholds. - + np.ndarray + Raster image with the large waterbodies segmented. """ - # Set up the primary and secondary thresholds. - minimum_wet_thresholds = [secondary_threshold, primary_threshold] + distance = ndi.distance_transform_edt(waterbodies_for_segmentation) + segmented = watershed(-distance, segmentation_markers, mask=waterbodies_for_segmentation) - # Get the tile id and tile object. - tile_id = tile[0] - tile_object = tile[1] + return segmented - # Generate the waterbody polygons using the primary and secondary thresholds, - # from the tile. - try: - _log.info(f"Generating water body polygons for tile {tile_id}") - # Load the data for the tile. - wofs_alltime_summary = grid_workflow.load(tile_object, dask_chunks=dask_chunks).squeeze() - - # Set the no-data values to nan. 
- # Masking here is done using the frequency measurement because for multiple - # areas NaN values are present in the frequency measurement but the - # no data value -999 is not present in the count_clear and - # count_wet measurements. - # Note: it seems some pixels with NaN values in the frequency measurement - # have a value of zero in the count_clear and/or the count_wet measurements. - wofs_alltime_summary = wofs_alltime_summary.where(~np.isnan(wofs_alltime_summary.frequency)) - - # Mask pixels not observed at least min_valid_observations times. - wofs_alltime_summary_valid_clear_count = ( - wofs_alltime_summary.count_clear >= min_valid_observations - ) - - # Generate the polygons. - generated_polygons = {} - for threshold in minimum_wet_thresholds: - # Mask any pixels whose frequency of water detection is less than the threshold. - wofs_alltime_summary_valid_wetness = wofs_alltime_summary.frequency > threshold - - # Now find pixels that meet both the minimum valid observations - # and minimum wet threshold criteria. - wofs_alltime_summary_valid = wofs_alltime_summary_valid_wetness.where( - wofs_alltime_summary_valid_wetness & wofs_alltime_summary_valid_clear_count - ) - - # Convert the raster to polygons. - # We use a mask of '1' to only generate polygons around values of '1' (not NaNs). - polygons_mask = wofs_alltime_summary_valid == 1 +def confirm_extent_contains_detection(extent: np.ndarray, detection: np.ndarray) -> np.ndarray: + """ + Filter the waterbodies in the extent raster to keep only waterbodies that contain a waterbody pixel from the + detection raster. - polygons = xr_vectorize( - wofs_alltime_summary_valid, - mask=polygons_mask, - crs=wofs_alltime_summary.geobox.crs, - ) + Parameters + ---------- + extent : np.ndarray + Raster of the extent of the waterbodies. + detection : np.ndarray + Raster of the location of the waterbodies. - # Combine any overlapping polygons. - merged_polygon_geoms = shapely.ops.unary_union(polygons["geometry"]) + Returns + ------- + np.ndarray + Filtered waterbodies in the extent raster. + """ - # Turn the combined multipolygon back into a GeoDataFrame. - try: - merged_polygons = gpd.GeoDataFrame(geometry=list(merged_polygon_geoms.geoms)) - except AttributeError: - merged_polygons = gpd.GeoDataFrame(geometry=[merged_polygon_geoms]) + def sum_intensity(regionmask, intensity_image): + return np.sum(intensity_image[regionmask]) - # We need to add the crs back onto the GeoDataFrame. - merged_polygons.crs = wofs_alltime_summary.geobox.crs + props = measure.regionprops( + extent, intensity_image=detection, extra_properties=(sum_intensity,) + ) - generated_polygons[threshold] = merged_polygons + labels_to_keep = [] + for region_prop in props: + detection_count = region_prop.sum_intensity + label = region_prop.label - except Exception as error: - _log.exception( - f"\nTile {str(tile_id)} did not run. \n" - "This is probably because there are no waterbodies present in this tile." 
- ) - _log.exception(error) + if detection_count > 0: + labels_to_keep.append(label) - primary_threshold_polygons = generated_polygons[primary_threshold] - secondary_threshold_polygons = generated_polygons[secondary_threshold] + extent_keep = np.where(np.isin(extent, labels_to_keep), extent, 0) - return primary_threshold_polygons, secondary_threshold_polygons + return extent_keep -def get_polygons_from_tile_with_land_sea_mask_filtering( +def process_raster_polygons( tile: tuple[tuple[int, int], datacube.api.grid_workflow.Tile], grid_workflow: datacube.api.GridWorkflow, - dask_chunks: dict[str, int] = {"x": 3200, "y": 3200, "time": 1}, - min_valid_observations: int = 128, - primary_threshold: float = 0.1, - secondary_threshold: float = 0.05, + plugin: ModuleType, + dask_chunks: dict[str, int] = { + "x": 3200, + "y": 3200, + "time": 1, + }, # based on Landsat WOfS scene size + min_valid_observations: int = 60, + min_wet_thresholds: list[int | float] = [0.05, 0.1], land_sea_mask_fp: str | Path = "", - resampling_method: str = "bilinear", - filter_land_sea_mask: Callable = filter_hydrosheds_land_mask, -) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: +) -> gpd.GeoDataFrame: """ - Generate water body polygons by thresholding a WOfS All Time Summary tile. - Use a raster land/sea mask to mask out ocean pixels from the WOfS data before - vectorizing the polygons. + Generate waterbody polygons by thresholding a WOfS All Time Summary tile. Parameters ---------- tile : tuple[tuple[int,int], datacube.api.grid_workflow.Tile] - The WOfs All Time summary Tile object for which to + The WOfS All Time Summary Tile object for which to generate waterbody polygons for. grid_workflow: datacube.api.GridWorkflow, Grid Workflow used to generate the tiles and to be used to load the Tile object. + plugin: ModuleType + A validated plugin to load masks with. dask_chunks : dict, optional dask_chunks to use to load WOfS data, by default {"x": 3200, "y": 3200, "time": 1} min_valid_observations : int, optional - Threshold to use to mask out pixels based on the number of valid WOfS observations for each pixel, by default 128 - primary_threshold : float, optional - Threshold to use to determine the location of the waterbody polygons, by default 0.1 - secondary_threshold : float, optional - Threshold to use to determine the extent / shape of the waterbodies polygons, by default 0.05 + Threshold to use to mask out pixels based on the number of valid WOfS + observations for each pixel, by default 60 + min_wet_thresholds: list[int | float], optional + A list containing the extent threshold and the detection threshold, with + the extent threshold listed first, by default [0.05, 0.1] land_sea_mask_fp: str | Path, optional File path to raster to use to mask ocean pixels in WOfS data, by default "" - resampling_method: str, optional - Resampling method to use when loading the land sea mask raster, by default "bilinear" - filter_land_sea_mask: Callable, optional - Function to apply to the land sea mask xr.DataArray to generate a boolean - mask where pixels with a value of True are land pixels and pixels with a - value of False are ocean pixels, by default `filter_hydrosheds_land_mask` - Returns ------- - tuple[gpd.GeoDataFrame, gpd.GeoDataFrame] - A tuple containing GeoDataFrames of waterbody polygons generated from thresholding WOfS All Time Summary data - using the primary and secondary thresholds. - + gpd.GeoDataFrame + waterbody polygons. """ - # Set up the primary and secondary thresholds. 
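A small worked example of the consistency check implemented in confirm_extent_contains_detection above: an extent region that contains no detection pixel is dropped (arrays are illustrative only):

import numpy as np
from skimage import measure

extent = np.array([[1, 1, 0, 2],
                   [1, 1, 0, 2],
                   [0, 0, 0, 2]])      # two labelled extent regions
detection = np.array([[0, 1, 0, 0],
                      [0, 0, 0, 0],
                      [0, 0, 0, 0]])   # a detection pixel only inside region 1

def sum_intensity(regionmask, intensity_image):
    return np.sum(intensity_image[regionmask])

props = measure.regionprops(extent, intensity_image=detection,
                            extra_properties=(sum_intensity,))
keep = [p.label for p in props if p.sum_intensity > 0]
filtered = np.where(np.isin(extent, keep), extent, 0)  # region 2 is removed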
- minimum_wet_thresholds = [secondary_threshold, primary_threshold] - - # Get the tile id and tile object. - tile_id = tile[0] - tile_object = tile[1] + # Load and threshold the WOfS All Time Summary tile. + xr_detection, xr_extent = load_wofs_frequency( + tile=tile, + grid_workflow=grid_workflow, + plugin=plugin, + dask_chunks=dask_chunks, + min_valid_observations=min_valid_observations, + min_wet_thresholds=min_wet_thresholds, + land_sea_mask_fp=land_sea_mask_fp, + ) - # Generate the waterbody polygons using the primary and secondary thresholds, - # from the tile. + # Get the crs. try: - _log.info(f"Generating water body polygons for tile {tile_id}") - - # Load the data for the tile. - wofs_alltime_summary = grid_workflow.load(tile_object, dask_chunks=dask_chunks).squeeze() - - # Load the land sea mask. - if land_sea_mask_fp: - land_sea_mask = rio_slurp_xarray( - fname=land_sea_mask_fp, - gbox=wofs_alltime_summary.geobox, - resampling=resampling_method, - ) - - # Filter the land sea mask. - boolean_land_sea_mask = filter_land_sea_mask(land_sea_mask) - - # Mask the WOfS All-Time Summary dataset using the boolean land sea mask. - wofs_alltime_summary = wofs_alltime_summary.where(boolean_land_sea_mask) - - # Set the no-data values to nan. - # Masking here is done using the frequency measurement because for multiple - # areas NaN values are present in the frequency measurement but the - # no data value -999 is not present in the count_clear and - # count_wet measurements. - # Note: it seems some pixels with NaN values in the frequency measurement - # have a value of zero in the count_clear and/or the count_wet measurements. - wofs_alltime_summary = wofs_alltime_summary.where(~np.isnan(wofs_alltime_summary.frequency)) - - # Mask pixels not observed at least min_valid_observations times. - wofs_alltime_summary_valid_clear_count = ( - wofs_alltime_summary.count_clear >= min_valid_observations - ) - - # Generate the polygons. - generated_polygons = {} - for threshold in minimum_wet_thresholds: - # Mask any pixels whose frequency of water detection is less than the threshold. - wofs_alltime_summary_valid_wetness = wofs_alltime_summary.frequency > threshold + output_crs = xr_detection.geobox.crs + except Exception as error: + _log.exception(error) + output_crs = xr_extent.geobox.crs - # Now find pixels that meet both the minimum valid observations - # and minimum wet threshold criteria. - wofs_alltime_summary_valid = wofs_alltime_summary_valid_wetness.where( - wofs_alltime_summary_valid_wetness & wofs_alltime_summary_valid_clear_count - ) + # Remove any objects of size 5 or less, as measured by connectivity=1 + np_extent_small_removed = remove_small_waterbodies(xr_extent.values.astype(int), min_size=6) - # Convert the raster to polygons. - # We use a mask of '1' to only generate polygons around values of '1' (not NaNs). 
- polygons_mask = wofs_alltime_summary_valid == 1 + # Identify waterbodies to apply segmentation to + np_extent_segment = select_waterbodies_for_segmentation(np_extent_small_removed, min_size=1000) + np_extent_nosegment = np.where(np_extent_segment > 0, 0, np_extent_small_removed) - polygons = xr_vectorize( - wofs_alltime_summary_valid, - mask=polygons_mask, - crs=wofs_alltime_summary.geobox.crs, - ) + # Create watershed segmentation markers by taking the detection threshold pixels and eroding them by 1 + # Includes removal of any markers smaller than 100 pixels + segmentation_markers = generate_segmentation_markers( + xr_detection.values.astype(int), erosion_radius=1, min_size=100 + ) - # Combine any overlapping polygons. - merged_polygon_geoms = shapely.ops.unary_union(polygons["geometry"]) + # Run segmentation + np_segmented_extent = run_watershed(np_extent_segment, segmentation_markers) - # Turn the combined multipolygon back into a GeoDataFrame. - try: - merged_polygons = gpd.GeoDataFrame(geometry=list(merged_polygon_geoms.geoms)) - except AttributeError: - merged_polygons = gpd.GeoDataFrame(geometry=[merged_polygon_geoms]) + # Combine segmented and non segmented back together + np_combined_extent = np.where(np_segmented_extent > 0, np_segmented_extent, np_extent_nosegment) - # We need to add the crs back onto the GeoDataFrame. - merged_polygons.crs = wofs_alltime_summary.geobox.crs + # Only keep extent areas that contain a detection pixel + np_combined_extent_contains_detection = confirm_extent_contains_detection( + np_combined_extent, xr_detection.values.astype(int) + ) - generated_polygons[threshold] = merged_polygons + # Relabel and remove small objects + np_combined_clean_label = remove_small_waterbodies( + np_combined_extent_contains_detection, min_size=6 + ) - except Exception as error: - _log.exception( - f"\nTile {str(tile_id)} did not run. \n" - "This is probably because there are no waterbodies present in this tile." 
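When the segmented and unsegmented rasters are recombined below, pixels take the watershed label where one exists and otherwise keep the label of the untouched small waterbody, e.g. (toy arrays, not from the changeset):

import numpy as np

segmented = np.array([[1, 1, 0, 0],
                      [2, 2, 0, 0]])       # labels produced by the watershed
not_segmented = np.array([[0, 0, 0, 5],
                          [0, 0, 0, 5]])   # small waterbodies left unsegmented

combined = np.where(segmented > 0, segmented, not_segmented)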
- ) - _log.exception(error) + # Convert back to xarray + xr_combined_extent = xr.DataArray( + np_combined_clean_label, coords=xr_extent.coords, dims=xr_extent.dims, attrs=xr_extent.attrs + ) - primary_threshold_polygons = generated_polygons[primary_threshold] - secondary_threshold_polygons = generated_polygons[secondary_threshold] + # Vectorize + vector_combined_extent = xr_vectorize( + xr_combined_extent, crs=output_crs, mask=xr_combined_extent.values > 0 + ) - return primary_threshold_polygons, secondary_threshold_polygons + return vector_combined_extent diff --git a/deafrica_waterbodies/make_timeseries.py b/deafrica_waterbodies/make_timeseries.py index 6b3028f1..7d47b0af 100644 --- a/deafrica_waterbodies/make_timeseries.py +++ b/deafrica_waterbodies/make_timeseries.py @@ -1,34 +1,29 @@ -import collections import datetime import logging import os -import urllib from pathlib import Path -import boto3 import datacube import dateutil +import fsspec import geopandas as gpd import numpy as np import pandas as pd from datacube.utils.geometry import Geometry from deafrica_tools.datahandling import wofs_fuser from deafrica_tools.spatial import xr_rasterize -from mypy_boto3_s3.client import S3Client +from odc.stats.model import DateTimeRange from tqdm.auto import tqdm from deafrica_waterbodies.id_field import guess_id_field -from deafrica_waterbodies.io import ( - check_dir_exists, - check_file_exists, - check_if_s3_uri, -) +from deafrica_waterbodies.io import check_dir_exists, check_file_exists, check_if_s3_uri _log = logging.getLogger(__name__) def get_polygon_ids_for_missing_timeseries( - polygons_gdf: gpd.GeoDataFrame, output_directory: str | Path, s3_client: S3Client | None = None + polygons_gdf: gpd.GeoDataFrame, + output_directory: str | Path, ) -> list[str]: """ Get IDs for polygons whose timeseries .csv file does not exist @@ -41,9 +36,6 @@ def get_polygon_ids_for_missing_timeseries( output_directory : str File URI or S3 URI of the directory containing the waterbody timeseries files. - s3_client : S3Client - A low-level client representing Amazon Simple Storage Service (S3), by default None. - Returns ------- list[str] @@ -53,10 +45,6 @@ def get_polygon_ids_for_missing_timeseries( # Support pathlib paths. output_directory = str(output_directory) - # Get the service client. - if s3_client is None: - s3_client = boto3.client("s3") - polygon_ids = polygons_gdf.index.to_list() # Check if output_dir exists. @@ -75,7 +63,7 @@ def get_polygon_ids_for_missing_timeseries( def get_last_observation_date_from_csv( - csv_file_path: str | Path, s3_client: S3Client | None = None + csv_file_path: str | Path, ) -> pd.Timestamp: """ Get the date of the last observation from a water body polygon's @@ -85,32 +73,24 @@ def get_last_observation_date_from_csv( ---------- csv_file_path : str | Path S3 URI or File URI of the timeseries csv file for a waterbody polygon. - s3_client : S3Client | None - A low-level client representing Amazon Simple Storage Service (S3), by default None. - Returns ------- pd.Timestamp Date of the last observation from a water body polygon's timeseries file. """ - # Get the service client. - if s3_client is None: - s3_client = boto3.client("s3") - # Check if the file exists. if check_file_exists(csv_file_path): # Read file using pandas. # Should work for s3 files also. - timeseries_df = pd.read_csv(csv_file_path) - - # Convert to datetime. 
- timeseries_df["Observation Date"] = pd.to_datetime(timeseries_df["Observation Date"]) + df = pd.read_csv(csv_file_path) - # Sort in acending order - timeseries_df.sort_values(by="Observation Date", ascending=True, inplace=True) - - last_date = timeseries_df["Observation Date"].to_list()[-1] + if "date" not in df.columns: + df.sort_index(ascending=True, inplace=True) + last_date = df.index.to_list()[-1] + else: + df.sort_values(["date"], ascending=True, inplace=True) + last_date = df["date"].to_list()[-1] return last_date else: @@ -124,10 +104,8 @@ def generate_timeseries_from_wofs_ls( use_id: str, missing_only: bool = False, time_span: str = "all", - start_date: datetime.datetime | None = None, - end_date: datetime.datetime | None = None, + temporal_range: str = None, subset_polygons_ids: list[str] = [], - include_uncertainity: bool = True, ): """ Function to generate a timeseries csv file for each waterbody polygon in the @@ -152,28 +130,32 @@ def generate_timeseries_from_wofs_ls( time_span : str, optional Time span to generate the timeseries for. Valid options are `"all"`, `"custom"`, or `"append"`, by default "all" - start_date : datetime.datetime | None, optional - Start date for the time range to generate the timeseries for, if `time_span` - is set to `"custom"`, by default None - end_date : datetime.datetime | None, optional - End date for the time range to generate the timeseries for, if `time_span` - is set to `"custom"`, by default None + temporal_range: str | None, optional + Time range to generate the timeseries for, if `time_span` is set to + `"custom"`. Example '2020-05--P1M' for the month of May 2020, by default + None subset_polygons_ids : list[str], optional A list of ids of the waterbodies to generate the timeseries for from the waterbodies in `waterbodies_vector_file`. - include_uncertainity: bool, optional - Option to include uncertainities in the output timeseries. If you - specify `include_uncertainity=True` then you will only filter out - timesteps with 100% invalid pixels. If `include_uncertainity=False` - you will filter out timesteps with more than 10% invalid pixels. """ - # Get the service client. - s3_client = boto3.client("s3") + # Support pathlib paths. + waterbodies_vector_file = str(waterbodies_vector_file) + output_directory = str(output_directory) + + # Create the output directory if it does not exist. + if not check_dir_exists(output_directory): + if check_if_s3_uri(output_directory): + fs = fsspec.filesystem("s3") + else: + fs = fsspec.filesystem("file") + + fs.mkdirs(output_directory, exist_ok=True) + _log.info(f"Created directory {output_directory}") # We will be using wofs_ls data. output_crs = "EPSG:6933" resolution = (-30, 30) - # dask_chunks = {"x": 3200, "y": 3200, "time": 1} # TODO: Check if using dask speeds up runtime. + dask_chunks = {"x": 3200, "y": 3200, "time": 1} # Load the waterbody polygons. try: @@ -186,134 +168,151 @@ def generate_timeseries_from_wofs_ls( _log.info(f"Guessed ID field: {id_field}") polygons_gdf.set_index(id_field, inplace=True) + # Reproject to a projected crs. polygons_gdf = polygons_gdf.to_crs(output_crs) + assert polygons_gdf.crs.is_projected + # Select polygons using values in the id column. if subset_polygons_ids: polygons_gdf = polygons_gdf.loc[subset_polygons_ids] - # Get the IDs for the waterbody polygons. + # Get the IDs for the water body polygons with no timeseries csv file in the + # output directory. 
if missing_only: - polygon_ids = get_polygon_ids_for_missing_timeseries( - polygons_gdf, output_directory, s3_client=s3_client - ) + polygon_ids = get_polygon_ids_for_missing_timeseries(polygons_gdf, output_directory) else: polygon_ids = polygons_gdf.index.to_list() - # Time span is mutually exclusive with start_date and end_date. - valid_time_span_options = ["all", "custom", "append"] + if not polygon_ids: + _log.info("No polygons identified with missing timeseries.") + return [] + else: + _log.info(f"Number of polygons to generate timeseries for {len(polygons_gdf)}.") - if time_span not in valid_time_span_options: - _log.error(f"{time_span} is an invalid time span.") - raise ValueError( - f"Please select a valid time span option: {' '.join(valid_time_span_options)}" - ) + # Time span is mutually exclusive with temporal_range. + valid_time_span_options = ["all", "custom", "append"] - # Checks. - if time_span == "all": - if start_date or end_date: - _log.error("Time span set to all yet, start and end date specified.") + if time_span not in valid_time_span_options: + _log.error(f"{time_span} is an invalid time span.") raise ValueError( - "If a time span is set to 'all' do not pass a start date nor an end date." + f"Please select a valid time span option: {' '.join(valid_time_span_options)}" ) - else: - start_date_str = "1984" - end_date_str = datetime.datetime.now().strftime("%Y-%m-%d") - elif time_span == "append": - # Start date will be defined in polygons_id loop. - end_date_str = datetime.datetime.now().strftime("%Y-%m-%d") - elif time_span == "custom": - start_date_str = start_date.strftime("%Y-%m-%d") - end_date_str = end_date.strftime("%Y-%m-%d") - - # For logging purposes only. - if time_span != "append": - _log.info(f"Generating timeseries for the time range: {start_date_str} to {end_date_str}.") - - if include_uncertainity: - # Only filter out timesteps with 100% invalid pixels. - invalid_percent_threshold = 100 - else: - # Filter out timesteps with less than 90% valid pixels. - invalid_percent_threshold = 10 - # Connect to the datacube - dc = datacube.Datacube(app="deafricawaterbodies-timeseries") - - with tqdm(total=len(polygon_ids)) as bar: - for poly_id in polygon_ids: - # Polygon's timeseries file path. - - # This is specific for DE Africa waterbodies which are expected - # to have a url pointing to the expected timeseries file for the - # polygon. + # Checks. + if time_span == "all": + if temporal_range: + _log.error("Time span set to `all` yet temporal range specified.") + raise ValueError("If time span is set to 'all' do not pass a temporal range.") + else: + start_date_str = "1984" + end_date_str = datetime.datetime.now().strftime("%Y-%m-%d") + elif time_span == "append": + # Start date will be defined in polygons_id loop. 
+ end_date_str = datetime.datetime.now().strftime("%Y-%m-%d") + elif time_span == "custom": try: - timeseries_url = polygons_gdf.loc[poly_id].timeseries - path = urllib.parse.urlparse(timeseries_url).path - csv_file = os.path.split(path)[-1] - except AttributeError: - csv_file = f"{poly_id}.csv" - - poly_timeseries_fp = os.path.join(output_directory, poly_id[:4], csv_file) - - if time_span == "append": - last_observation_date = get_last_observation_date_from_csv( - poly_timeseries_fp, s3_client - ) - start_date = last_observation_date + dateutil.relativedelta.relativedelta(days=1) - start_date_str = start_date.strftime("%Y-%m-%d") + temporal_range_ = DateTimeRange(temporal_range) + except ValueError: + _log.exception(f"Failed to parse supplied temporal_range: '{temporal_range}'") + else: + start_date_str = temporal_range_.start.strftime("%Y-%m-%d") + end_date_str = temporal_range_.end.strftime("%Y-%m-%d") - time_range = (start_date_str, end_date_str) - _log.debug( - f"Generating timeseries for {poly_id} for the time range: {time_range[0]} to {time_range[1]}." + # For logging purposes only. + if time_span != "append": + _log.info( + f"Generating timeseries for the time range: {start_date_str} to {end_date_str}." ) - poly_geom = polygons_gdf.loc[poly_id].geometry - poly_gdf = gpd.GeoDataFrame(geometry=[poly_geom], crs=output_crs) - poly_geopolygon = Geometry(geom=poly_geom, crs=output_crs) - - # Load the Water Observations from Space. - wofls_ds = dc.load( - product="wofs_ls", - geopolygon=poly_geopolygon, - time=time_range, - resolution=resolution, - output_crs=output_crs, - resampling="nearest", - group_by="solar_day", - fuse_func=wofs_fuser, - ) - wofls_da = wofls_ds.water + # Connect to the datacube + dc = datacube.Datacube(app="deafricawaterbodies-timeseries") + + generated_timeseries_fps = [] + with tqdm(total=len(polygon_ids)) as bar: + for poly_id in polygon_ids: + # Parent directory for csv files. + poly_timeseries_parent_dir = os.path.join(output_directory, poly_id[:4]) + if not check_dir_exists(poly_timeseries_parent_dir): + if check_if_s3_uri(poly_timeseries_parent_dir): + fs = fsspec.filesystem("s3") + else: + fs = fsspec.filesystem("file") + fs.mkdirs(poly_timeseries_parent_dir, exist_ok=True) + _log.info(f"Created directory {poly_timeseries_parent_dir}") + + # Polygon's timeseries file path. + poly_timeseries_fp = os.path.join(poly_timeseries_parent_dir, f"{poly_id}.csv") - # If no data is found. - if not wofls_ds: + if time_span == "append": + try: + last_observation_date = get_last_observation_date_from_csv( + poly_timeseries_fp + ) + except FileNotFoundError: + start_date_str = "1984" + _log.info( + f"Could not find last observation date for polygon {poly_id}, defaulting to using the start date {start_date_str}." + ) + else: + start_date = last_observation_date + dateutil.relativedelta.relativedelta( + days=1 + ) + start_date_str = start_date.strftime("%Y-%m-%d") + + time_range = (start_date_str, end_date_str) _log.info( - f"There is no new data for {poly_id} for the time range: {time_range[0]} to {time_range[1]}." + f"Generating timeseries for {poly_id} for the time range: {time_range[0]} to {time_range[1]}." ) - continue - else: - # Mask the loaded WOfS data using the rasterized waterbody polygon, - # if the height and width of the bounding box of the waterbody polygon - # are large than the length of a pixel. - pixel_length = resolution[1] # should be a positive number. 
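The custom time span is now supplied as an odc-stats style temporal range string rather than separate start and end dates. A quick sketch of how such a string resolves (the exact end timestamp is whatever DateTimeRange yields for the period):

from odc.stats.model import DateTimeRange

temporal_range = DateTimeRange("2020-05--P1M")              # May 2020, as in the docstring example
start_date_str = temporal_range.start.strftime("%Y-%m-%d")  # e.g. "2020-05-01"
end_date_str = temporal_range.end.strftime("%Y-%m-%d")      # e.g. "2020-05-31"
time_range = (start_date_str, end_date_str)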
- if ( - poly_geopolygon.boundingbox.height > pixel_length - and poly_geopolygon.boundingbox.width > pixel_length - ): - poly_mask = xr_rasterize(poly_gdf, wofls_da) - wofls_da_masked = wofls_da.where(poly_mask, np.nan) - else: - wofls_da_masked = wofls_da - - # Get the area of water in the waterbody for each timestep. - - timesteps = list(wofls_da_masked.time.values) - poly_timeseries_data_dict = collections.defaultdict(list) - for timestep in timesteps: - wofl = wofls_da_masked.sel(time=timestep) + poly_geom = polygons_gdf.loc[poly_id].geometry + poly_gdf = gpd.GeoDataFrame(geometry=[poly_geom], crs=output_crs) + poly_geopolygon = Geometry(geom=poly_geom, crs=output_crs) + + # Load the Water Observations from Space. + wofls_ds = dc.load( + product="wofs_ls", + geopolygon=poly_geopolygon, + time=time_range, + resolution=resolution, + output_crs=output_crs, + dask_chunks=dask_chunks, + resampling="nearest", + group_by="solar_day", + fuse_func=wofs_fuser, + ) - # Number of pixels in the timestep for the water body. - pixel_count = np.count_nonzero(np.isnan(wofl)) + # If no data is found. + if not wofls_ds: + _log.info( + f"There is no data for {poly_id} for the time range: {time_range[0]} to {time_range[1]}." + ) + continue + else: + wofls_da = wofls_ds.water + # Mask the loaded WOfS data using the rasterized waterbody polygon, + # if the height and width of the bounding box of the waterbody polygon + # are large than the length of a pixel. + pixel_length = abs(resolution[0]) # should be a positive number. + if ( + poly_geopolygon.boundingbox.height > pixel_length + and poly_geopolygon.boundingbox.width > pixel_length + ): + poly_mask = xr_rasterize(poly_gdf, wofls_da) + wofls_da_masked = wofls_da.where(poly_mask, np.nan) + else: + _log.info( + f"Water body polygon bounding box length and width are smaller than pixel length {pixel_length} metres." + ) + wofls_da_masked = wofls_da + + # Compute the array at this point. + wofls_da_masked = wofls_da_masked.compute() + + # Get the area of each pixel. + pixel_area = pixel_length**2 + + # Get the number of pixels for the waterbody. + pixel_count = (~np.isnan(wofls_da_masked)).sum(["x", "y"]) # Apply WOfS bitmasking to the Water Observation Feature Layers # See: the Applying WOfS Bitmasking notebook in the @@ -321,73 +320,102 @@ def generate_timeseries_from_wofs_ls( # digitalearthafrica/deafrica-sandbox-notebooks Github repository. # Number of pixels observed to be valid (clear) and dry. - valid_and_dry_count = np.count_nonzero(wofl == 0) + valid_and_dry_count = (wofls_da_masked == 0).sum(["x", "y"]) + # Percentage of pixels observed to be valid (clear) and dry. + valid_and_dry_percentage = (valid_and_dry_count / pixel_count) * 100.0 + # Area covered by valid (clear) and dry pixels. + valid_and_dry_area = valid_and_dry_count * pixel_area # Number of pixels observed to be valid (clear) and wet. - valid_and_wet_count = np.count_nonzero(wofl == 128) + valid_and_wet_count = (wofls_da_masked == 128).sum(["x", "y"]) + # Percentage of pixels observed to be valid (clear) and wet. + valid_and_wet_percentage = (valid_and_wet_count / pixel_count) * 100.0 + # Area covered by valid (clear) and wet pixels. + valid_and_wet_area = valid_and_wet_count * pixel_area # Number of valid (clear) pixels. valid_count = valid_and_dry_count + valid_and_wet_count # Number of invalid (not clear) pixels. invalid_count = pixel_count - valid_count - - # Convert the counts into percentages. 
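The per-timestep statistics are now whole-array xarray reductions over x and y instead of a Python loop over timesteps. A minimal sketch on a synthetic masked WOfS stack (0 = clear and dry, 128 = clear and wet, any other value = not clear, NaN = outside the rasterized polygon), including the later step of blanking percentages when more than 10% of pixels are invalid, written here with .where for brevity:

import numpy as np
import xarray as xr

water = xr.DataArray(
    np.array([[[0.0, 128.0], [128.0, np.nan]],
              [[64.0, 64.0], [0.0, np.nan]]]),   # 2 timesteps x 2 x 2 pixels
    dims=("time", "y", "x"),
)
pixel_area = 30**2  # m^2 for a (-30, 30) resolution

pixel_count = (~np.isnan(water)).sum(["x", "y"])
wet_count = (water == 128).sum(["x", "y"])
dry_count = (water == 0).sum(["x", "y"])
invalid_count = pixel_count - (wet_count + dry_count)

pc_wet = wet_count / pixel_count * 100.0
pc_invalid = invalid_count / pixel_count * 100.0
area_wet_m2 = wet_count * pixel_area

# Blank out the wet percentage where too many pixels were not clearly observed.
pc_wet = pc_wet.where(pc_invalid <= 10.0)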
- try: - valid_and_wet_percentage = (valid_and_wet_count / pixel_count) * 100 - except ZeroDivisionError: - valid_and_wet_percentage = 0 - try: - valid_and_dry_percentage = (valid_and_dry_count / pixel_count) * 100 - except ZeroDivisionError: - valid_and_dry_percentage = 0 - try: - invalid_percentage = (invalid_count / pixel_count) * 100 - except ZeroDivisionError: - invalid_percentage = 0 - - # Filter the timesteps based on the invalid pixel percentage - # threshold. - # If above threshold, set timeseries values for the timestep - # as empty string. - if invalid_percentage >= invalid_percent_threshold: - valid_and_wet_percentage = "" - valid_and_wet_count = "" - valid_and_dry_percentage = "" - valid_and_dry_count = "" - invalid_percentage = "" - invalid_count = "" - - # Convert the timestep date from numpy.datetime64 to string. - observation_date = pd.to_datetime(timestep) - observation_date_str = observation_date.strftime("%Y-%m-%d") - - poly_timeseries_data_dict["Observation Date"].extend([observation_date_str]) - # poly_timeseries_data_dict["Total pixel count"].extend([pixel_count]) - poly_timeseries_data_dict["Wet pixel percentage"].extend( - [valid_and_wet_percentage] + # Area covered by invalid (not clear) pixels. + invalid_area = invalid_count * pixel_area + # Percentage of invalid pixels. + invalid_percentage = (invalid_count / pixel_count) * 100.0 + + # Create dataframes from the xarray.DataArrays. + valid_and_wet_percentage_df = valid_and_wet_percentage.to_dataframe( + name="pc_wet" + ).drop(columns="spatial_ref", errors="ignore") + valid_and_wet_count_df = valid_and_wet_count.to_dataframe(name="px_wet").drop( + columns="spatial_ref", errors="ignore" + ) + valid_and_wet_area_df = valid_and_wet_area.to_dataframe( + name="area_wet_m2" + ).drop(columns="spatial_ref", errors="ignore") + valid_and_dry_percentage_df = valid_and_dry_percentage.to_dataframe( + name="pc_dry" + ).drop(columns="spatial_ref", errors="ignore") + valid_and_dry_count_df = valid_and_dry_count.to_dataframe(name="px_dry").drop( + columns="spatial_ref", errors="ignore" + ) + valid_and_dry_area_df = valid_and_dry_area.to_dataframe( + name="area_dry_m2" + ).drop(columns="spatial_ref", errors="ignore") + invalid_percentage_df = invalid_percentage.to_dataframe(name="pc_invalid").drop( + columns="spatial_ref", errors="ignore" ) - poly_timeseries_data_dict["Wet pixel count"].extend([valid_and_wet_count]) - poly_timeseries_data_dict["Dry pixel percentage"].extend( - [valid_and_dry_percentage] + invalid_count_df = invalid_count.to_dataframe(name="px_invalid").drop( + columns="spatial_ref", errors="ignore" ) - poly_timeseries_data_dict["Dry pixel count"].extend([valid_and_dry_count]) - poly_timeseries_data_dict["Invalid pixel percentage"].extend( - [invalid_percentage] + invalid_area_df = invalid_area.to_dataframe(name="area_invalid_m2").drop( + columns="spatial_ref", errors="ignore" ) - poly_timeseries_data_dict["Invalid pixel count"].extend([invalid_count]) - # Convert the timeseries data dictionary for the polygon into - # a DataFrame. - poly_timeseries_df = pd.DataFrame(poly_timeseries_data_dict) + # Merge the individual dataframes into a single dataframe. 
+ timeseries_df = pd.concat( + [ + valid_and_wet_percentage_df, + valid_and_wet_count_df, + valid_and_wet_area_df, + valid_and_dry_percentage_df, + valid_and_dry_count_df, + valid_and_dry_area_df, + invalid_percentage_df, + invalid_count_df, + invalid_area_df, + ], + ignore_index=False, + join="outer", + axis="columns", + ) - if time_span == "append": - # Append the DataFrame to an existing csv file. - poly_timeseries_df.to_csv( - poly_timeseries_fp, mode="a", index=False, header=False + # Set pc_wet and pc_dry values to nan, which will be used if insufficient pixels are observed. + timeseries_df["pc_wet"] = timeseries_df.apply( + lambda row: np.nan if row.pc_invalid > 10.0 else row.pc_wet, axis=1 ) - else: - # Write the DataFrame to a new csv file. - poly_timeseries_df.to_csv(poly_timeseries_fp, mode="w", index=False) - bar.update(1) - _log.info(f"Done! Generated timeseries for {len(polygon_ids)}.") + timeseries_df["pc_dry"] = timeseries_df.apply( + lambda row: np.nan if row.pc_invalid > 10.0 else row.pc_dry, axis=1 + ) + + # Parse the datetime index. + timeseries_df.index = timeseries_df.index.strftime("%Y-%m-%d") + + # Sort by date. + timeseries_df.sort_index(ascending=True, inplace=True) + + if time_span == "append": + # Append the DataFrame to an existing csv file. + timeseries_df.to_csv( + poly_timeseries_fp, mode="a", index=False, header=False + ) + _log.info(f"Timeseries appended to csv file {poly_timeseries_fp}") + else: + # Write the DataFrame to a new csv file. + timeseries_df.to_csv(poly_timeseries_fp, mode="w", index=False) + _log.info(f"Timeseries written to file {poly_timeseries_fp}") + + generated_timeseries_fps.append(poly_timeseries_fp) + bar.update(1) + _log.info(f"Done! Generated timeseries for {len(polygon_ids)} polygons.") + return generated_timeseries_fps diff --git a/deafrica_waterbodies/plugins/ocean_filtering_using_goas.py b/deafrica_waterbodies/plugins/ocean_filtering_using_goas.py new file mode 100644 index 00000000..4611cfc1 --- /dev/null +++ b/deafrica_waterbodies/plugins/ocean_filtering_using_goas.py @@ -0,0 +1,60 @@ +""" +Ocean filtering using Marine Regions Global Oceans and Seas v01 +""" +import os + +import geopandas as gpd +import numpy as np +import xarray as xr +from deafrica_tools.spatial import xr_rasterize + +from deafrica_waterbodies.plugins.utils import erode_land_sea_mask + +# File extensions to recognise as Parquet files. +PARQUET_EXTENSIONS = {".pq", ".parquet"} + + +def load_land_sea_mask( + land_sea_mask_fp: str, + wofs_alltime_summary_ds: xr.DataArray, + buffer_dist_m: float = 500, +) -> xr.DataArray: + """ + Load the Marine Regions Global Oceans and Seas v01 from the file path + provided. Rasterize the vector data to match the loaded datacube WOfS + All Time Summary data and transform the raster into + a boolean mask where 0/False are ocean pixels and 1/True are land pixels. + Erode the land pixels by the `buffer_dist_m` buffer distance. + + Parameters + ---------- + land_sea_mask_fp : str + File path to the Marine Regions Global Oceans and Seas v01 vector data. + wofs_alltime_summary_ds : xr.DataArray + Loaded datacube WOfS All Time Summary data to match to. + buffer_dist_m : float + Distance in meters to erode the land by in the land/sea mask. + + Returns + ------- + xr.DataArray + A boolean land and sea mask from the Marine Regions Global Oceans and Seas v01 data. 
+ """ + + _, file_extension = os.path.splitext(land_sea_mask_fp) + if file_extension not in PARQUET_EXTENSIONS: + land_sea_mask_gdf = gpd.read_file(land_sea_mask_fp).to_crs( + wofs_alltime_summary_ds.geobox.crs + ) + else: + land_sea_mask_gdf = gpd.read_parquet(land_sea_mask_fp).to_crs( + wofs_alltime_summary_ds.geobox.crs + ) + + land_sea_mask_ds = xr_rasterize(land_sea_mask_gdf, wofs_alltime_summary_ds) + boolean_land_sea_mask = np.logical_not(land_sea_mask_ds) + + # Erode the land in the land sea mask + eroded_boolean_land_sea_mask = erode_land_sea_mask(boolean_land_sea_mask, buffer_dist_m) + + return eroded_boolean_land_sea_mask diff --git a/deafrica_waterbodies/plugins/ocean_filtering_using_hydrosheds.py b/deafrica_waterbodies/plugins/ocean_filtering_using_hydrosheds.py new file mode 100644 index 00000000..439df95b --- /dev/null +++ b/deafrica_waterbodies/plugins/ocean_filtering_using_hydrosheds.py @@ -0,0 +1,57 @@ +""" +Ocean filtering using HydroSHEDS Land Mask +""" +import xarray as xr +from datacube.testutils.io import rio_slurp_xarray + +from deafrica_waterbodies.plugins.utils import erode_land_sea_mask + + +def transform_hydrosheds_land_mask(hydrosheds_land_mask: xr.DataArray) -> xr.DataArray: + """ + Function to transform the HydroSHEDs Land Mask into a boolean mask where + 0/False are ocean pixels and 1/True are land pixels. + """ + # Indicator values: 1 = land, 2 = ocean sink, 3 = inland sink, 255 is no data. + boolean_mask = (hydrosheds_land_mask != 255) & (hydrosheds_land_mask != 2) + + return boolean_mask + + +def load_land_sea_mask( + land_sea_mask_fp: str, + wofs_alltime_summary_ds: xr.DataArray, + buffer_dist_m: float = 500, +) -> xr.DataArray: + """ + Load and reproject the HydroSHEDS Land Mask raster from the file path provided to + match the loaded datacube WOfS All Time Summary data. Transform the loaded raster into + a boolean mask where 0/False are ocean pixels and 1/True are land pixels and erode the land + pixels by the `buffer_dist_m` buffer distance. + + Parameters + ---------- + land_sea_mask_fp : str + File path to the HydroSHEDS Land Mask raster. + wofs_alltime_summary_ds : xr.DataArray + Loaded datacube WOfS All Time Summary data to match to + buffer_dist_m : float + Distance in meters to erode the land by in the land/sea mask. + Returns + ------- + xr.DataArray + A boolean land and sea mask from the HydroSHEDs Land Mask. + """ + land_sea_mask_ds = rio_slurp_xarray( + fname=land_sea_mask_fp, + gbox=wofs_alltime_summary_ds.geobox, + resampling="bilinear", + ) + + # Filter the land sea mask. + boolean_land_sea_mask = transform_hydrosheds_land_mask(land_sea_mask_ds) + + # Erode the land in the land sea mask + eroded_boolean_land_sea_mask = erode_land_sea_mask(boolean_land_sea_mask, buffer_dist_m) + + return eroded_boolean_land_sea_mask diff --git a/deafrica_waterbodies/plugins/utils.py b/deafrica_waterbodies/plugins/utils.py new file mode 100644 index 00000000..1c4e8f6f --- /dev/null +++ b/deafrica_waterbodies/plugins/utils.py @@ -0,0 +1,73 @@ +""" +Matthew Alger, Vanessa Newey, Alex Leith +Geoscience Australia +2021 +""" +import importlib.util +from pathlib import Path +from types import ModuleType + +import skimage +import xarray as xr + + +def run_plugin(plugin_path: str | Path) -> ModuleType: + """Run a Python plugin from a path. + + Arguments + --------- + plugin_path : str | Path + Path to Python plugin file. 
+ + Returns + ------- + module + """ + plugin_path = str(plugin_path) + + spec = importlib.util.spec_from_file_location("deafrica_waterbodies.plugin", plugin_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def validate_plugin(plugin: ModuleType): + """Check that a plugin declares required globals.""" + # Check globals. + required_globals = [ + "load_land_sea_mask", + ] + for name in required_globals: + if not hasattr(plugin, name): + raise ValueError(f"Plugin missing {name}") + + # Check that functions are runnable. + required_functions = ["load_land_sea_mask"] + for name in required_functions: + assert hasattr(getattr(plugin, name), "__call__") + + +def erode_land_sea_mask(boolean_land_sea_mask: xr.DataArray, buffer_dist_m: float) -> xr.DataArray: + """ + Shrink the land in the land/sea mask. + + Parameters + ---------- + boolean_land_sea_mask : xr.DataArray + Boolean mask where 0/False are ocean pixels and 1/True are land pixels. + buffer_dist_m : float + Distance in meters to erode the land by in the land/sea mask. + + Returns + ------- + xr.DataArray + Eroded land sea mask where 0/False are ocean pixels and 1/True are land pixels. + """ + buffer_pixels = buffer_dist_m / abs(boolean_land_sea_mask.geobox.resolution[0]) + + eroded_boolean_land_sea_mask = xr.apply_ufunc( + skimage.morphology.binary_erosion, + boolean_land_sea_mask, + skimage.morphology.disk(buffer_pixels), + ) + return eroded_boolean_land_sea_mask diff --git a/deafrica_waterbodies/tiling.py b/deafrica_waterbodies/tiling.py index 22f1a1ff..152da9ca 100644 --- a/deafrica_waterbodies/tiling.py +++ b/deafrica_waterbodies/tiling.py @@ -29,22 +29,20 @@ def check_tile_intersects_polygons( tuple Tile id if the extent of the geobox of a tile intersects with the polygons. """ - tile_id = tile[0] - tile_extent = tile[1].geobox.extent if polygons_gdf is not None: # Reproject the extent of the geobox of a tile to match the polygons. - tile_extent = tile_extent.to_crs(polygons_gdf.crs) + tile_extent = tile[1].geobox.extent.to_crs(polygons_gdf.crs) # Get the shapely geometry of the reprojected extent of the tile's geobox. tile_extent_geom = tile_extent.geom # Check if the extent intersects with any of the polygons. check_intersection = polygons_gdf.geometry.intersects(tile_extent_geom).any() if check_intersection: - return tile_id + return tile[0] else: return () else: - return tile_id + return tile[0] def filter_tiles( @@ -105,8 +103,7 @@ def tile_wofs_ls_summary_alltime( # wofs_ls_summary_alltime grid. # Regular grid. - grid = "africa_30" - grid, gridspec = parse_gridspec_with_name(grid) + grid, gridspec = parse_gridspec_with_name(s="africa_30") # Multiply the tile size. tile_size = tuple(tile_size_factor * elem for elem in gridspec.tile_size) @@ -134,11 +131,11 @@ def tile_wofs_ls_summary_alltime( return tiles, gw -def get_tiles_ids( +def get_wofs_ls_summary_alltime_tiles( aoi_gdf: gpd.GeoDataFrame | None, tile_size_factor: float = 2, num_workers: int = 8 -) -> list[tuple[int, int]]: +) -> tuple[dict, datacube.api.GridWorkflow]: """ - Get the tile ids of the WOfS All Time Summary whose extents intersect + Get the tile of the WOfS All Time Summary whose extents intersect with any of the area of interest polygons. 
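With the plugin mechanism above, the choice of ocean mask becomes a small Python module selected at run time. A minimal sketch of loading and checking one of the plugins introduced in this change (assumes the repository layout added here and a matching local file path):

from deafrica_waterbodies.plugins.utils import run_plugin, validate_plugin

plugin = run_plugin("deafrica_waterbodies/plugins/ocean_filtering_using_hydrosheds.py")
validate_plugin(plugin)  # raises ValueError if load_land_sea_mask is missing
# plugin.load_land_sea_mask(...) is then called when loading the WOfS frequency data.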
Parameters @@ -148,18 +145,26 @@ def get_tiles_ids( tile_size_factor : float, optional Number of times to increase the regular tile size when tiling the wofs_ls_summary_alltime product by, by default 2 - num_workers : int, datasetsoptional + num_workers : int, optional Number of worker processes to use when filtering tiles, by default 8 Returns ------- - list[tuple[int, int]] - Tile ids of the WOfS All Time Summary tiles whose extents intersect + tuple[dict, datacube.api.GridWorkflow] + WOfS All Time Summary tiles whose extents intersect with any of the area of interest polygons. + GridWorkflow to use to load the tiles. """ - tiles = tile_wofs_ls_summary_alltime(tile_size_factor=tile_size_factor) + tiles, grid_workflow = tile_wofs_ls_summary_alltime(tile_size_factor=tile_size_factor) # Filter the tiles to the area of interest. - filtered_tile_ids = filter_tiles(tiles, aoi_gdf, num_workers) - - return filtered_tile_ids + if aoi_gdf is not None: + filtered_tile_ids = filter_tiles(tiles=tiles, polygons_gdf=aoi_gdf, num_workers=num_workers) + filtered_tiles = {k: v for k, v in tiles.items() if k in filtered_tile_ids} + _log.info(f"Filtered out {len(tiles) - len(filtered_tiles)} tiles.") + _log.info( + f"Number of wofs_ls_summary_alltime tiles covering the area of interest: {len(filtered_tiles)}" + ) + return filtered_tiles, grid_workflow + else: + return tiles, grid_workflow diff --git a/figures/AgricultureArea.JPG b/figures/AgricultureArea.JPG deleted file mode 100644 index b3747abb..00000000 Binary files a/figures/AgricultureArea.JPG and /dev/null differ diff --git a/figures/AllThresholds.JPG b/figures/AllThresholds.JPG deleted file mode 100644 index 51dc25a4..00000000 Binary files a/figures/AllThresholds.JPG and /dev/null differ diff --git a/figures/CoastLimitations.JPG b/figures/CoastLimitations.JPG deleted file mode 100644 index 22593864..00000000 Binary files a/figures/CoastLimitations.JPG and /dev/null differ diff --git a/figures/Cotton.JPG b/figures/Cotton.JPG deleted file mode 100644 index 9d36fd88..00000000 Binary files a/figures/Cotton.JPG and /dev/null differ diff --git a/figures/DEAWaterbodiesESRIBasemap.jpeg b/figures/DEAWaterbodiesESRIBasemap.jpeg deleted file mode 100644 index cac7b502..00000000 Binary files a/figures/DEAWaterbodiesESRIBasemap.jpeg and /dev/null differ diff --git a/figures/DEAfricaWaterbodiesESRIBasemap.png b/figures/DEAfricaWaterbodiesESRIBasemap.png new file mode 100644 index 00000000..26637fe7 Binary files /dev/null and b/figures/DEAfricaWaterbodiesESRIBasemap.png differ diff --git a/figures/ExpectedRunningContainers.png b/figures/ExpectedRunningContainers.png new file mode 100644 index 00000000..15e1bdb6 Binary files /dev/null and b/figures/ExpectedRunningContainers.png differ diff --git a/figures/HighTideCoastline.JPG b/figures/HighTideCoastline.JPG deleted file mode 100644 index 798465b1..00000000 Binary files a/figures/HighTideCoastline.JPG and /dev/null differ diff --git a/figures/HybridThreshold.JPG b/figures/HybridThreshold.JPG deleted file mode 100644 index a8d4b40f..00000000 Binary files a/figures/HybridThreshold.JPG and /dev/null differ diff --git a/figures/ITEMAustralianShapefile.JPG b/figures/ITEMAustralianShapefile.JPG deleted file mode 100644 index d58092b1..00000000 Binary files a/figures/ITEMAustralianShapefile.JPG and /dev/null differ diff --git a/figures/LakeFrome.JPG b/figures/LakeFrome.JPG deleted file mode 100644 index 33ee0756..00000000 Binary files a/figures/LakeFrome.JPG and /dev/null differ diff --git a/figures/MissingPolygons.PNG 
b/figures/MissingPolygons.PNG deleted file mode 100644 index b6356196..00000000 Binary files a/figures/MissingPolygons.PNG and /dev/null differ diff --git a/figures/No0.01.JPG b/figures/No0.01.JPG deleted file mode 100644 index 29da8165..00000000 Binary files a/figures/No0.01.JPG and /dev/null differ diff --git a/figures/No0.02.JPG b/figures/No0.02.JPG deleted file mode 100644 index 7d105b3e..00000000 Binary files a/figures/No0.02.JPG and /dev/null differ diff --git a/figures/OnRiverDam.JPG b/figures/OnRiverDam.JPG deleted file mode 100644 index f0d98e0b..00000000 Binary files a/figures/OnRiverDam.JPG and /dev/null differ diff --git a/figures/PPtestlessthan005.JPG b/figures/PPtestlessthan005.JPG deleted file mode 100644 index 921d3a71..00000000 Binary files a/figures/PPtestlessthan005.JPG and /dev/null differ diff --git a/figures/ThresholdCompare.JPG b/figures/ThresholdCompare.JPG deleted file mode 100644 index a06f3e2a..00000000 Binary files a/figures/ThresholdCompare.JPG and /dev/null differ diff --git a/figures/WACoastlineNoise.JPG b/figures/WACoastlineNoise.JPG deleted file mode 100644 index cd679c2e..00000000 Binary files a/figures/WACoastlineNoise.JPG and /dev/null differ diff --git a/figures/WorkflowDiagram.JPG b/figures/WorkflowDiagram.JPG index bd2b1e77..a3bd5175 100644 Binary files a/figures/WorkflowDiagram.JPG and b/figures/WorkflowDiagram.JPG differ diff --git a/figures/dea_logo.jpg b/figures/dea_logo.jpg deleted file mode 100644 index d61f2ac9..00000000 Binary files a/figures/dea_logo.jpg and /dev/null differ diff --git a/figures/dea_logo_wide.jpg b/figures/dea_logo_wide.jpg deleted file mode 100644 index 2d999172..00000000 Binary files a/figures/dea_logo_wide.jpg and /dev/null differ diff --git a/figures/deafrica_logo.jpg b/figures/deafrica_logo.jpg new file mode 100644 index 00000000..a7319191 Binary files /dev/null and b/figures/deafrica_logo.jpg differ diff --git a/figures/deafrica_logo_wide.jpg b/figures/deafrica_logo_wide.jpg new file mode 100644 index 00000000..db263f6f Binary files /dev/null and b/figures/deafrica_logo_wide.jpg differ diff --git a/figures/menindeeLakes.JPG b/figures/menindeeLakes.JPG deleted file mode 100644 index 1876a30b..00000000 Binary files a/figures/menindeeLakes.JPG and /dev/null differ diff --git a/figures/skimageMeasureLabelBlobs.PNG b/figures/skimageMeasureLabelBlobs.PNG deleted file mode 100644 index 09800455..00000000 Binary files a/figures/skimageMeasureLabelBlobs.PNG and /dev/null differ diff --git a/index_tiles.sh b/index_tiles.sh index 51fe8d2e..495a2b0d 100644 --- a/index_tiles.sh +++ b/index_tiles.sh @@ -1,14 +1,11 @@ #!/bin/bash # Add the wofs_ls_summary_alltime datasets. 
-s3-to-dc "s3://deafrica-services/wofs_ls_summary_alltime/1-0-0/x164/y098/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls_summary_alltime' - -# Add the wofs_ls datasets covering the waterbody UID: edumesbb2 -s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/204/048/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' -s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/204/049/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' - -s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/205/048/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' -s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/205/049/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' - -s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/206/048/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' -s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/206/049/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' \ No newline at end of file +s3-to-dc "s3://deafrica-services/wofs_ls_summary_alltime/1-0-0/x194/y117/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls_summary_alltime' + +# Add the wofs_ls datasets covering the waterbody UID: sm9rtw98n +s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/187/038/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' +s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/188/037/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' +s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/188/038/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' +s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/189/037/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' +s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/189/038/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' diff --git a/notebooks/02_GenerateFirstSetofPolygonsusingDatasetIDs.ipynb b/notebooks/02_GenerateFirstSetofPolygonsusingDatasetIDs.ipynb deleted file mode 100644 index dc2af9db..00000000 --- a/notebooks/02_GenerateFirstSetofPolygonsusingDatasetIDs.ipynb +++ /dev/null @@ -1,348 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "727a0f08-2a9c-471c-bbea-346bf528c856", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import os\n", - "\n", - "import click\n", - "import datacube\n", - "import fsspec\n", - "\n", - "from deafrica_waterbodies.cli.logs import logging_setup\n", - "from deafrica_waterbodies.io import check_dir_exists, check_file_exists, check_if_s3_uri\n", - "from deafrica_waterbodies.make_polygons import check_wetness_thresholds, get_polygons_from_dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "25b37fcb-eb02-4989-8794-326ed93a37b1", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# These are the default AWS configurations for the Analysis Sandbox.\n", - "# that are set in the environmnet variables.\n", - "aws_default_config = {\n", - " # \"AWS_NO_SIGN_REQUEST\": \"YES\",\n", - " \"AWS_SECRET_ACCESS_KEY\": \"fake\",\n", - " \"AWS_ACCESS_KEY_ID\": \"fake\",\n", - "}\n", - "\n", - "# To access public bucket, need to remove the AWS credentials in\n", - "# the environment variables or the following error will occur.\n", - "# PermissionError: The AWS Access Key Id you provided does not exist in our records.\n", - "\n", - "for key in aws_default_config.keys():\n", - " if key in os.environ:\n", - " del os.environ[key]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - 
"id": "f958dc31-a953-4b8b-b045-63029ec43d78", - "metadata": {}, - "outputs": [], - "source": [ - "verbose = 1\n", - "primary_threshold: float = 0.1\n", - "secondary_threshold: float = 0.05\n", - "minimum_valid_observations: int = 128\n", - "output_directory = \"s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile\"\n", - "dataset_ids_text_file = \"s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/dataset_ids.txt\"\n", - "overwrite = True" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "906996f9-638a-4937-a3e4-7c561a472192", - "metadata": {}, - "outputs": [], - "source": [ - "# Set up logger.\n", - "logging_setup(verbose=verbose)\n", - "_log = logging.getLogger(__name__)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "cdefc70d-f19d-4a9a-80a6-6810c2eef674", - "metadata": {}, - "outputs": [], - "source": [ - "# Support pathlib paths.\n", - "output_directory = str(output_directory)\n", - "dataset_ids_text_file = str(dataset_ids_text_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d87d2e53-fe36-43f6-81c3-45495c4d7a49", - "metadata": {}, - "outputs": [], - "source": [ - "# Parameters to use when loading datasets.\n", - "dask_chunks = {\"x\": 3200, \"y\": 3200, \"time\": 1}\n", - "resolution = (-30, 30)\n", - "output_crs = \"EPSG:6933\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "3c0a4fca-9707-44ef-812d-bf8a1f42308f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 17:02:05,550] {credentials.py:620} INFO - Found credentials in shared credentials file: ~/.aws/credentials\n" - ] - } - ], - "source": [ - "# Read the dataset ids from the text file.\n", - "if not check_file_exists(dataset_ids_text_file):\n", - " _log.error(f\"Could not find text file {dataset_ids_text_file}!\")\n", - " raise FileNotFoundError(f\"Could not find text file {dataset_ids_text_file}!\")\n", - "else:\n", - " if check_if_s3_uri(dataset_ids_text_file):\n", - " fs = fsspec.filesystem(\"s3\")\n", - " else:\n", - " fs = fsspec.filesystem(\"file\")\n", - " with fs.open(dataset_ids_text_file, \"r\") as file:\n", - " lines = file.readlines()\n", - " dataset_ids = [line.strip() for line in lines]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "20679b58-4de2-4eea-9ab3-85728e6aa896", - "metadata": {}, - "outputs": [], - "source": [ - "# Directory to write generated waterbody polygons to.\n", - "polygons_from_thresholds_dir = os.path.join(output_directory, \"polygons_from_thresholds\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "af2a667f-70fd-43f9-90b9-49a84f39f30c", - "metadata": {}, - "outputs": [], - "source": [ - "# Set the filesystem to use.\n", - "if check_if_s3_uri(polygons_from_thresholds_dir):\n", - " fs = fsspec.filesystem(\"s3\")\n", - "else:\n", - " fs = fsspec.filesystem(\"file\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "d42cefd7-508c-4fdc-aa22-49e9ee5751ea", - "metadata": {}, - "outputs": [], - "source": [ - "# Check if the directory exists. 
If it does not, create it.\n", - "if not check_dir_exists(polygons_from_thresholds_dir):\n", - " fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True)\n", - " _log.info(f\"Created directory {polygons_from_thresholds_dir}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "9f9fec1f-e5df-4190-aaae-910e68acbcd1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 17:02:05,986] {3616587049.py:3} INFO - We will be running a hybrid wetness threshold. \n", - "**You have set 0.1 as the primary threshold, which will define the location of the waterbody polygons \n", - " with 0.05 set as the supplementary threshold, which will define the extent/shape of the waterbody polygons.**\n" - ] - } - ], - "source": [ - "# Check if the wetness thresholds have been set correctly.\n", - "minimum_wet_thresholds = [secondary_threshold, primary_threshold]\n", - "_log.info(check_wetness_thresholds(minimum_wet_thresholds))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "a86b5866-6b56-4a6b-919b-20e11334ad6f", - "metadata": {}, - "outputs": [], - "source": [ - "# Connect to the datacube.\n", - "dc = datacube.Datacube(app=\"GenerateWaterbodyPolygons\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "c0dd33a2-9726-471d-a245-8564bafb594b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 17:02:06,335] {make_polygons.py:177} INFO - Generating water body polygons for dataset cd198bae-43a1-566a-8e8e-01b110bfbaf5\n", - "[2023-10-06 17:02:08,905] {make_polygons.py:177} INFO - Generating water body polygons for dataset fc10a5ae-00d0-5998-bbc0-b7d29f5807fb\n", - "[2023-10-06 17:02:10,824] {make_polygons.py:177} INFO - Generating water body polygons for dataset bb6c330e-f7c9-5164-85a6-a10e5ed36ce8\n", - "[2023-10-06 17:02:13,396] {make_polygons.py:177} INFO - Generating water body polygons for dataset 3180edab-0678-59a6-9cce-70437f6d8e8b\n", - "[2023-10-06 17:02:15,746] {make_polygons.py:177} INFO - Generating water body polygons for dataset f6d24d9a-4399-5d5e-9a0b-b4edfcea710f\n", - "[2023-10-06 17:02:17,801] {make_polygons.py:177} INFO - Generating water body polygons for dataset 68180140-e074-5c12-a5e7-8ed3d0dee5a9\n", - "[2023-10-06 17:02:20,064] {make_polygons.py:177} INFO - Generating water body polygons for dataset 7e5d3cf6-2dd0-5830-b6dc-39c1ecc69713\n", - "[2023-10-06 17:02:22,172] {make_polygons.py:177} INFO - Generating water body polygons for dataset 5c4cf4b6-c649-5205-8152-1b1b8797ef4c\n", - "[2023-10-06 17:02:24,729] {make_polygons.py:177} INFO - Generating water body polygons for dataset 1e8d81b3-241f-507b-bc0b-6accf8fb995d\n", - "[2023-10-06 17:02:26,826] {make_polygons.py:177} INFO - Generating water body polygons for dataset 2a070692-152f-5bce-9746-304ae9f473e6\n", - "[2023-10-06 17:02:28,773] {make_polygons.py:177} INFO - Generating water body polygons for dataset 84e76979-16bb-5a28-8846-59f50b97a6bf\n", - "[2023-10-06 17:02:30,497] {make_polygons.py:177} INFO - Generating water body polygons for dataset b886a2e3-6342-564c-8ded-06cf45a5a115\n", - "[2023-10-06 17:02:33,124] {make_polygons.py:177} INFO - Generating water body polygons for dataset 66c72025-c94c-5a82-bea8-e0cf8edb306e\n", - "[2023-10-06 17:02:37,314] {make_polygons.py:177} INFO - Generating water body polygons for dataset 832547b8-3409-5be3-a9b2-0cfb9e79d43b\n", - "[2023-10-06 17:02:39,147] {make_polygons.py:177} INFO - Generating water body polygons for 
dataset a55e2595-a9f8-5513-90df-ebeb9141371f\n", - "[2023-10-06 17:02:41,056] {make_polygons.py:177} INFO - Generating water body polygons for dataset 4aaef995-885e-55a2-9c0e-4cecf9d0c3f4\n", - "[2023-10-06 17:02:42,907] {make_polygons.py:177} INFO - Generating water body polygons for dataset 12688939-1640-5fef-8155-fedfe3ecd30b\n", - "[2023-10-06 17:02:45,349] {make_polygons.py:177} INFO - Generating water body polygons for dataset b6cb8e82-1879-5706-b93c-ef85f032f470\n", - "[2023-10-06 17:02:47,922] {make_polygons.py:177} INFO - Generating water body polygons for dataset d31ecb6f-6459-5659-a841-f7a84c824af9\n", - "[2023-10-06 17:02:50,067] {make_polygons.py:177} INFO - Generating water body polygons for dataset 992cf08a-0750-5db6-b409-2fe141b84ec2\n", - "[2023-10-06 17:02:51,856] {make_polygons.py:177} INFO - Generating water body polygons for dataset ef01bdfe-abc6-5618-a779-2489df1d5d73\n", - "[2023-10-06 17:02:54,129] {make_polygons.py:177} INFO - Generating water body polygons for dataset f191276a-d1a6-50ac-83a9-de5b279b4229\n", - "[2023-10-06 17:02:58,136] {make_polygons.py:177} INFO - Generating water body polygons for dataset 217fc46c-d098-5639-ad0a-b8d0efbc4276\n", - "[2023-10-06 17:03:14,365] {make_polygons.py:177} INFO - Generating water body polygons for dataset 87916cb1-d351-529c-a42a-07eff309ed8b\n", - "[2023-10-06 17:03:16,399] {make_polygons.py:177} INFO - Generating water body polygons for dataset a27df8f9-e8a5-5ab9-a962-18faf3ce3ae2\n", - "[2023-10-06 17:03:19,051] {make_polygons.py:177} INFO - Generating water body polygons for dataset 74781945-fca1-5be1-a86c-776a1770d8e3\n", - "[2023-10-06 17:03:21,004] {make_polygons.py:177} INFO - Generating water body polygons for dataset d49e6bfd-d0f1-5df8-9da4-ec198150f402\n", - "[2023-10-06 17:03:23,258] {make_polygons.py:177} INFO - Generating water body polygons for dataset b5997e76-bce0-5a43-ac14-5c5ba69ae65a\n", - "[2023-10-06 17:03:29,177] {make_polygons.py:177} INFO - Generating water body polygons for dataset 52f8f266-f8da-5269-aed6-007e26c131e0\n", - "[2023-10-06 17:03:31,254] {make_polygons.py:177} INFO - Generating water body polygons for dataset d72ef170-3533-518b-90ce-8debfc37aedb\n", - "[2023-10-06 17:03:41,076] {make_polygons.py:177} INFO - Generating water body polygons for dataset e413b8ed-5eb2-591f-9a37-8b7825d37dcf\n", - "[2023-10-06 17:03:43,122] {make_polygons.py:177} INFO - Generating water body polygons for dataset 1dcf68e4-01b5-55d1-9b74-0162af9968fa\n", - "[2023-10-06 17:03:44,876] {make_polygons.py:177} INFO - Generating water body polygons for dataset 7762c98a-80d0-548a-ac06-39f763e25c35\n", - "[2023-10-06 17:03:46,760] {make_polygons.py:177} INFO - Generating water body polygons for dataset 2d927c3d-b259-574f-8a88-983e9c835bd5\n", - "[2023-10-06 17:03:48,713] {make_polygons.py:177} INFO - Generating water body polygons for dataset 11abb94d-e3e8-5927-beeb-b5c6fe2f471f\n", - "[2023-10-06 17:03:51,861] {make_polygons.py:177} INFO - Generating water body polygons for dataset 55016bd8-ada1-5327-96eb-b68848f800b3\n", - "[2023-10-06 17:03:53,706] {make_polygons.py:177} INFO - Generating water body polygons for dataset 818adac5-2349-5e72-95f9-34d1ab908668\n", - "[2023-10-06 17:03:55,855] {make_polygons.py:177} INFO - Generating water body polygons for dataset 506157af-33c8-5596-9176-c86db467880a\n", - "[2023-10-06 17:03:57,993] {make_polygons.py:177} INFO - Generating water body polygons for dataset 89ade063-61c0-568b-a8fc-7a641319a1ce\n", - "[2023-10-06 17:04:00,292] {make_polygons.py:177} INFO - Generating 
water body polygons for dataset 92d4584f-574b-57bb-be2b-4e08291c1796\n", - "[2023-10-06 17:04:06,030] {make_polygons.py:177} INFO - Generating water body polygons for dataset ee8c6eca-a29b-5449-8896-289436371bb4\n", - "[2023-10-06 17:04:08,392] {make_polygons.py:177} INFO - Generating water body polygons for dataset 41751ede-7517-578d-aca7-c2d1694c31aa\n", - "[2023-10-06 17:04:11,005] {make_polygons.py:177} INFO - Generating water body polygons for dataset 28435ed4-8593-5fbb-9f01-87a68c6ca593\n", - "[2023-10-06 17:04:12,905] {make_polygons.py:177} INFO - Generating water body polygons for dataset 9ba989f3-b68b-5027-b6e8-aefc20721dea\n", - "[2023-10-06 17:04:15,241] {make_polygons.py:177} INFO - Generating water body polygons for dataset 62c2b19d-7ea4-50c3-b1b6-067d42eb2638\n", - "[2023-10-06 17:04:17,338] {make_polygons.py:177} INFO - Generating water body polygons for dataset 9ee761e1-189e-53b7-81a3-6f0be8c98934\n", - "[2023-10-06 17:04:22,841] {make_polygons.py:177} INFO - Generating water body polygons for dataset 6c8c71d3-0c01-52b9-a208-f96dd673971d\n", - "[2023-10-06 17:04:25,992] {make_polygons.py:177} INFO - Generating water body polygons for dataset 491be0b7-aac0-5aae-8585-c32201050149\n", - "[2023-10-06 17:04:28,021] {make_polygons.py:177} INFO - Generating water body polygons for dataset f1b1b0cb-d8b8-5d13-9c95-161c60a9bfe6\n", - "[2023-10-06 17:04:29,805] {make_polygons.py:177} INFO - Generating water body polygons for dataset 3ad86dbf-4c1d-515a-8e02-2ba7664edcfc\n", - "[2023-10-06 17:04:31,875] {make_polygons.py:177} INFO - Generating water body polygons for dataset 3be3887b-db05-527f-91c3-c9f2d201460e\n", - "[2023-10-06 17:04:34,081] {make_polygons.py:177} INFO - Generating water body polygons for dataset 5c5a792d-7129-5345-89b8-ee159eeb8357\n", - "[2023-10-06 17:04:36,364] {make_polygons.py:177} INFO - Generating water body polygons for dataset dbce025d-59f7-59d3-b2a7-497855215baf\n", - "[2023-10-06 17:04:38,456] {make_polygons.py:177} INFO - Generating water body polygons for dataset d4303ecb-3745-530f-8cec-3dafc6bb4db2\n", - "[2023-10-06 17:04:41,031] {make_polygons.py:177} INFO - Generating water body polygons for dataset c5c569f3-8b68-5570-85b5-7e9a1eb6812f\n", - "[2023-10-06 17:04:43,128] {make_polygons.py:177} INFO - Generating water body polygons for dataset 238265f4-9e39-5b2c-848f-f4409ef34b9d\n", - "[2023-10-06 17:04:45,216] {make_polygons.py:177} INFO - Generating water body polygons for dataset d5e0cd25-cb65-53fd-bb0c-745896452731\n", - "[2023-10-06 17:04:48,039] {make_polygons.py:177} INFO - Generating water body polygons for dataset c8fdb171-55c0-5629-b5dd-059aebdc875e\n", - "[2023-10-06 17:04:50,345] {make_polygons.py:177} INFO - Generating water body polygons for dataset 32e41787-3dde-5db9-bee5-c29f0e3a93ed\n", - "[2023-10-06 17:04:52,196] {make_polygons.py:177} INFO - Generating water body polygons for dataset 02c57798-157c-5827-803d-184e0a369d32\n", - "[2023-10-06 17:04:54,057] {make_polygons.py:177} INFO - Generating water body polygons for dataset 8f9b6055-3e72-563f-83e2-1a50970558f5\n", - "[2023-10-06 17:04:55,927] {make_polygons.py:177} INFO - Generating water body polygons for dataset e24f43f8-517d-52c1-9f00-4a83783900bd\n", - "[2023-10-06 17:04:58,825] {make_polygons.py:177} INFO - Generating water body polygons for dataset 6354d0eb-5489-55ed-81cf-b0449f49bea6\n", - "[2023-10-06 17:05:00,898] {make_polygons.py:177} INFO - Generating water body polygons for dataset aea08a1f-9933-5f44-87d3-d0714c52725b\n", - "[2023-10-06 17:05:03,370] 
{make_polygons.py:177} INFO - Generating water body polygons for dataset f3ffdb67-591d-5da9-9138-817e25487230\n", - "[2023-10-06 17:05:05,596] {make_polygons.py:177} INFO - Generating water body polygons for dataset 36537f53-c09c-5c90-9e5d-e8afcb42f16e\n", - "[2023-10-06 17:05:07,605] {make_polygons.py:177} INFO - Generating water body polygons for dataset 89497671-c686-53da-afce-e6c83afaf3ee\n", - "[2023-10-06 17:05:09,860] {make_polygons.py:177} INFO - Generating water body polygons for dataset d0b13c5b-7421-520f-9493-6e364f19728a\n", - "[2023-10-06 17:05:12,039] {make_polygons.py:177} INFO - Generating water body polygons for dataset 9e9969aa-bbdb-534f-bb77-3d62aeb27223\n", - "[2023-10-06 17:05:14,256] {make_polygons.py:177} INFO - Generating water body polygons for dataset e4ba0b20-10af-58c9-ab42-51a71c2c6dbb\n", - "[2023-10-06 17:05:16,509] {make_polygons.py:177} INFO - Generating water body polygons for dataset a7c4eeae-247c-5668-b248-20497e5e5345\n", - "[2023-10-06 17:05:18,795] {make_polygons.py:177} INFO - Generating water body polygons for dataset e1c2b0df-67fe-5a51-96d3-6a43b1c7dccb\n", - "[2023-10-06 17:05:20,610] {make_polygons.py:177} INFO - Generating water body polygons for dataset 19514ec1-882f-579c-8238-6db6f518a7fc\n", - "[2023-10-06 17:05:24,798] {make_polygons.py:177} INFO - Generating water body polygons for dataset a841d9bf-2afe-5582-aa0a-97803e2689fd\n" - ] - } - ], - "source": [ - "# For each dataset id, threshold the scene to generate the primary and secondary threshold\n", - "# waterbody polygons.\n", - "for dataset_id in dataset_ids:\n", - " primary_threshold_polygons_fp = os.path.join(\n", - " polygons_from_thresholds_dir, f\"{dataset_id}_primary_threshold_polygons.parquet\"\n", - " )\n", - " secondary_threshold_polygons_fp = os.path.join(\n", - " polygons_from_thresholds_dir, f\"{dataset_id}_secondary_threshold_polygons.parquet\"\n", - " )\n", - "\n", - " if not overwrite:\n", - " _log.info(\n", - " f\"Checking existence of {primary_threshold_polygons_fp} and {secondary_threshold_polygons_fp}\"\n", - " )\n", - " exists = check_file_exists(primary_threshold_polygons_fp) and check_file_exists(\n", - " secondary_threshold_polygons_fp\n", - " )\n", - "\n", - " if overwrite or not exists:\n", - " (\n", - " primary_threshold_polygons,\n", - " secondary_threshold_polygons,\n", - " ) = get_polygons_from_dataset(\n", - " dataset_id=dataset_id,\n", - " dask_chunks=dask_chunks,\n", - " resolution=resolution,\n", - " output_crs=output_crs,\n", - " min_valid_observations=minimum_valid_observations,\n", - " primary_threshold=primary_threshold,\n", - " secondary_threshold=secondary_threshold,\n", - " dc=dc,\n", - " )\n", - " # Write the polygons to parquet files.\n", - " primary_threshold_polygons.to_parquet(primary_threshold_polygons_fp)\n", - " secondary_threshold_polygons.to_parquet(secondary_threshold_polygons_fp)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/02_GenerateFirstSetofPolygonsusingDatasetIDswithOceanFiltering.ipynb b/notebooks/02_GenerateFirstSetofPolygonsusingDatasetIDswithOceanFiltering.ipynb deleted file mode 100644 index 930efe14..00000000 
--- a/notebooks/02_GenerateFirstSetofPolygonsusingDatasetIDswithOceanFiltering.ipynb +++ /dev/null @@ -1,368 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "727a0f08-2a9c-471c-bbea-346bf528c856", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import os\n", - "\n", - "import click\n", - "import datacube\n", - "import fsspec\n", - "\n", - "from deafrica_waterbodies.cli.logs import logging_setup\n", - "from deafrica_waterbodies.io import check_dir_exists, check_file_exists, check_if_s3_uri\n", - "from deafrica_waterbodies.make_polygons import check_wetness_thresholds, get_polygons_from_dataset_with_land_sea_mask_filtering" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "25b37fcb-eb02-4989-8794-326ed93a37b1", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# These are the default AWS configurations for the Analysis Sandbox.\n", - "# that are set in the environmnet variables.\n", - "aws_default_config = {\n", - " # \"AWS_NO_SIGN_REQUEST\": \"YES\",\n", - " \"AWS_SECRET_ACCESS_KEY\": \"fake\",\n", - " \"AWS_ACCESS_KEY_ID\": \"fake\",\n", - "}\n", - "\n", - "# To access public bucket, need to remove the AWS credentials in\n", - "# the environment variables or the following error will occur.\n", - "# PermissionError: The AWS Access Key Id you provided does not exist in our records.\n", - "\n", - "for key in aws_default_config.keys():\n", - " if key in os.environ:\n", - " del os.environ[key]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "60a87b0c-6a5b-4405-91fe-64c2fe4c5cc0", - "metadata": {}, - "outputs": [], - "source": [ - "verbose = 1\n", - "primary_threshold: float = 0.1\n", - "secondary_threshold: float = 0.05\n", - "minimum_valid_observations: int = 128\n", - "output_directory = \"s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2\"\n", - "dataset_ids_text_file = \"s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/dataset_ids.txt\"\n", - "overwrite = True\n", - "land_sea_mask_fp = \"data/af_msk_3s.tif\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "4f5f0c14-595a-4a29-beb6-795e395e85ca", - "metadata": {}, - "outputs": [], - "source": [ - "import xarray as xr\n", - "def filter_hydrosheds_land_mask(hydrosheds_land_mask: xr.DataArray) -> xr.DataArray:\n", - " \"\"\"\n", - " Function to filter the HydroSHEDs Land Mask into a boolean mask.\n", - " \"\"\"\n", - " # Indicator values: 1 = land, 2 = ocean sink, 3 = inland sink, 255 is no data.\n", - " boolean_mask = (hydrosheds_land_mask != 255) & (hydrosheds_land_mask != 2)\n", - " return boolean_mask" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "906996f9-638a-4937-a3e4-7c561a472192", - "metadata": {}, - "outputs": [], - "source": [ - "# Set up logger.\n", - "logging_setup(verbose=verbose)\n", - "_log = logging.getLogger(__name__)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "cdefc70d-f19d-4a9a-80a6-6810c2eef674", - "metadata": {}, - "outputs": [], - "source": [ - "# Support pathlib paths.\n", - "output_directory = str(output_directory)\n", - "dataset_ids_text_file = str(dataset_ids_text_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d87d2e53-fe36-43f6-81c3-45495c4d7a49", - "metadata": {}, - "outputs": [], - "source": [ - "# Parameters to use when loading datasets.\n", - "dask_chunks = {\"x\": 3200, \"y\": 3200, \"time\": 1}\n", - "resolution = (-30, 30)\n", - "output_crs = \"EPSG:6933\"" - ] - }, - { - 
"cell_type": "code", - "execution_count": 8, - "id": "3c0a4fca-9707-44ef-812d-bf8a1f42308f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:30:01,977] {credentials.py:620} INFO - Found credentials in shared credentials file: ~/.aws/credentials\n" - ] - } - ], - "source": [ - "# Read the dataset ids from the text file.\n", - "if not check_file_exists(dataset_ids_text_file):\n", - " _log.error(f\"Could not find text file {dataset_ids_text_file}!\")\n", - " raise FileNotFoundError(f\"Could not find text file {dataset_ids_text_file}!\")\n", - "else:\n", - " if check_if_s3_uri(dataset_ids_text_file):\n", - " fs = fsspec.filesystem(\"s3\")\n", - " else:\n", - " fs = fsspec.filesystem(\"file\")\n", - " with fs.open(dataset_ids_text_file, \"r\") as file:\n", - " lines = file.readlines()\n", - " dataset_ids = [line.strip() for line in lines]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "20679b58-4de2-4eea-9ab3-85728e6aa896", - "metadata": {}, - "outputs": [], - "source": [ - "# Directory to write generated waterbody polygons to.\n", - "polygons_from_thresholds_dir = os.path.join(output_directory, \"polygons_from_thresholds\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "af2a667f-70fd-43f9-90b9-49a84f39f30c", - "metadata": {}, - "outputs": [], - "source": [ - "# Set the filesystem to use.\n", - "if check_if_s3_uri(polygons_from_thresholds_dir):\n", - " fs = fsspec.filesystem(\"s3\")\n", - "else:\n", - " fs = fsspec.filesystem(\"file\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "d42cefd7-508c-4fdc-aa22-49e9ee5751ea", - "metadata": {}, - "outputs": [], - "source": [ - "# Check if the directory exists. If it does not, create it.\n", - "if not check_dir_exists(polygons_from_thresholds_dir):\n", - " fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True)\n", - " _log.info(f\"Created directory {polygons_from_thresholds_dir}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "9f9fec1f-e5df-4190-aaae-910e68acbcd1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:30:02,303] {3616587049.py:3} INFO - We will be running a hybrid wetness threshold. 
\n", - "**You have set 0.1 as the primary threshold, which will define the location of the waterbody polygons \n", - " with 0.05 set as the supplementary threshold, which will define the extent/shape of the waterbody polygons.**\n" - ] - } - ], - "source": [ - "# Check if the wetness thresholds have been set correctly.\n", - "minimum_wet_thresholds = [secondary_threshold, primary_threshold]\n", - "_log.info(check_wetness_thresholds(minimum_wet_thresholds))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "a86b5866-6b56-4a6b-919b-20e11334ad6f", - "metadata": {}, - "outputs": [], - "source": [ - "# Connect to the datacube.\n", - "dc = datacube.Datacube(app=\"GenerateWaterbodyPolygons\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "c0dd33a2-9726-471d-a245-8564bafb594b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:30:02,717] {make_polygons.py:319} INFO - Generating water body polygons for dataset cd198bae-43a1-566a-8e8e-01b110bfbaf5\n", - "[2023-10-06 18:30:05,898] {make_polygons.py:319} INFO - Generating water body polygons for dataset fc10a5ae-00d0-5998-bbc0-b7d29f5807fb\n", - "[2023-10-06 18:30:08,574] {make_polygons.py:319} INFO - Generating water body polygons for dataset bb6c330e-f7c9-5164-85a6-a10e5ed36ce8\n", - "[2023-10-06 18:30:11,821] {make_polygons.py:319} INFO - Generating water body polygons for dataset 3180edab-0678-59a6-9cce-70437f6d8e8b\n", - "[2023-10-06 18:30:14,931] {make_polygons.py:319} INFO - Generating water body polygons for dataset f6d24d9a-4399-5d5e-9a0b-b4edfcea710f\n", - "[2023-10-06 18:30:17,925] {make_polygons.py:319} INFO - Generating water body polygons for dataset 68180140-e074-5c12-a5e7-8ed3d0dee5a9\n", - "[2023-10-06 18:30:21,107] {make_polygons.py:319} INFO - Generating water body polygons for dataset 7e5d3cf6-2dd0-5830-b6dc-39c1ecc69713\n", - "[2023-10-06 18:30:23,921] {make_polygons.py:319} INFO - Generating water body polygons for dataset 5c4cf4b6-c649-5205-8152-1b1b8797ef4c\n", - "[2023-10-06 18:30:27,216] {make_polygons.py:319} INFO - Generating water body polygons for dataset 1e8d81b3-241f-507b-bc0b-6accf8fb995d\n", - "[2023-10-06 18:30:30,186] {make_polygons.py:319} INFO - Generating water body polygons for dataset 2a070692-152f-5bce-9746-304ae9f473e6\n", - "[2023-10-06 18:30:33,196] {make_polygons.py:319} INFO - Generating water body polygons for dataset 84e76979-16bb-5a28-8846-59f50b97a6bf\n", - "[2023-10-06 18:30:35,922] {make_polygons.py:319} INFO - Generating water body polygons for dataset b886a2e3-6342-564c-8ded-06cf45a5a115\n", - "[2023-10-06 18:30:39,514] {make_polygons.py:319} INFO - Generating water body polygons for dataset 66c72025-c94c-5a82-bea8-e0cf8edb306e\n", - "[2023-10-06 18:30:44,370] {make_polygons.py:319} INFO - Generating water body polygons for dataset 832547b8-3409-5be3-a9b2-0cfb9e79d43b\n", - "[2023-10-06 18:30:47,063] {make_polygons.py:319} INFO - Generating water body polygons for dataset a55e2595-a9f8-5513-90df-ebeb9141371f\n", - "[2023-10-06 18:30:49,929] {make_polygons.py:319} INFO - Generating water body polygons for dataset 4aaef995-885e-55a2-9c0e-4cecf9d0c3f4\n", - "[2023-10-06 18:30:52,927] {make_polygons.py:319} INFO - Generating water body polygons for dataset 12688939-1640-5fef-8155-fedfe3ecd30b\n", - "[2023-10-06 18:30:56,216] {make_polygons.py:319} INFO - Generating water body polygons for dataset b6cb8e82-1879-5706-b93c-ef85f032f470\n", - "[2023-10-06 18:30:59,742] {make_polygons.py:319} INFO - 
Generating water body polygons for dataset d31ecb6f-6459-5659-a841-f7a84c824af9\n", - "[2023-10-06 18:31:02,690] {make_polygons.py:319} INFO - Generating water body polygons for dataset 992cf08a-0750-5db6-b409-2fe141b84ec2\n", - "[2023-10-06 18:31:05,481] {make_polygons.py:319} INFO - Generating water body polygons for dataset ef01bdfe-abc6-5618-a779-2489df1d5d73\n", - "[2023-10-06 18:31:08,888] {make_polygons.py:319} INFO - Generating water body polygons for dataset f191276a-d1a6-50ac-83a9-de5b279b4229\n", - "[2023-10-06 18:31:13,729] {make_polygons.py:319} INFO - Generating water body polygons for dataset 217fc46c-d098-5639-ad0a-b8d0efbc4276\n", - "[2023-10-06 18:31:29,149] {make_polygons.py:319} INFO - Generating water body polygons for dataset 87916cb1-d351-529c-a42a-07eff309ed8b\n", - "[2023-10-06 18:31:31,774] {make_polygons.py:319} INFO - Generating water body polygons for dataset a27df8f9-e8a5-5ab9-a962-18faf3ce3ae2\n", - "[2023-10-06 18:31:34,932] {make_polygons.py:319} INFO - Generating water body polygons for dataset 74781945-fca1-5be1-a86c-776a1770d8e3\n", - "[2023-10-06 18:31:37,572] {make_polygons.py:319} INFO - Generating water body polygons for dataset d49e6bfd-d0f1-5df8-9da4-ec198150f402\n", - "[2023-10-06 18:31:40,922] {make_polygons.py:319} INFO - Generating water body polygons for dataset b5997e76-bce0-5a43-ac14-5c5ba69ae65a\n", - "[2023-10-06 18:31:47,930] {make_polygons.py:319} INFO - Generating water body polygons for dataset 52f8f266-f8da-5269-aed6-007e26c131e0\n", - "[2023-10-06 18:31:51,273] {make_polygons.py:319} INFO - Generating water body polygons for dataset d72ef170-3533-518b-90ce-8debfc37aedb\n", - "[2023-10-06 18:32:02,369] {make_polygons.py:319} INFO - Generating water body polygons for dataset e413b8ed-5eb2-591f-9a37-8b7825d37dcf\n", - "[2023-10-06 18:32:05,462] {make_polygons.py:319} INFO - Generating water body polygons for dataset 1dcf68e4-01b5-55d1-9b74-0162af9968fa\n", - "[2023-10-06 18:32:08,687] {make_polygons.py:319} INFO - Generating water body polygons for dataset 7762c98a-80d0-548a-ac06-39f763e25c35\n", - "[2023-10-06 18:32:11,754] {make_polygons.py:319} INFO - Generating water body polygons for dataset 2d927c3d-b259-574f-8a88-983e9c835bd5\n", - "[2023-10-06 18:32:14,922] {make_polygons.py:319} INFO - Generating water body polygons for dataset 11abb94d-e3e8-5927-beeb-b5c6fe2f471f\n", - "[2023-10-06 18:32:18,946] {make_polygons.py:319} INFO - Generating water body polygons for dataset 55016bd8-ada1-5327-96eb-b68848f800b3\n", - "[2023-10-06 18:32:21,992] {make_polygons.py:319} INFO - Generating water body polygons for dataset 818adac5-2349-5e72-95f9-34d1ab908668\n", - "[2023-10-06 18:32:25,290] {make_polygons.py:319} INFO - Generating water body polygons for dataset 506157af-33c8-5596-9176-c86db467880a\n", - "[2023-10-06 18:32:28,515] {make_polygons.py:319} INFO - Generating water body polygons for dataset 89ade063-61c0-568b-a8fc-7a641319a1ce\n", - "[2023-10-06 18:32:32,111] {make_polygons.py:319} INFO - Generating water body polygons for dataset 92d4584f-574b-57bb-be2b-4e08291c1796\n", - "[2023-10-06 18:32:39,173] {make_polygons.py:319} INFO - Generating water body polygons for dataset ee8c6eca-a29b-5449-8896-289436371bb4\n", - "[2023-10-06 18:32:42,814] {make_polygons.py:319} INFO - Generating water body polygons for dataset 41751ede-7517-578d-aca7-c2d1694c31aa\n", - "[2023-10-06 18:32:46,334] {make_polygons.py:319} INFO - Generating water body polygons for dataset 28435ed4-8593-5fbb-9f01-87a68c6ca593\n", - "[2023-10-06 18:32:49,374] 
{make_polygons.py:319} INFO - Generating water body polygons for dataset 9ba989f3-b68b-5027-b6e8-aefc20721dea\n", - "[2023-10-06 18:32:52,680] {make_polygons.py:319} INFO - Generating water body polygons for dataset 62c2b19d-7ea4-50c3-b1b6-067d42eb2638\n", - "[2023-10-06 18:32:55,665] {make_polygons.py:319} INFO - Generating water body polygons for dataset 9ee761e1-189e-53b7-81a3-6f0be8c98934\n", - "[2023-10-06 18:33:01,858] {make_polygons.py:319} INFO - Generating water body polygons for dataset 6c8c71d3-0c01-52b9-a208-f96dd673971d\n", - "[2023-10-06 18:33:05,836] {make_polygons.py:319} INFO - Generating water body polygons for dataset 491be0b7-aac0-5aae-8585-c32201050149\n", - "[2023-10-06 18:33:09,031] {make_polygons.py:319} INFO - Generating water body polygons for dataset f1b1b0cb-d8b8-5d13-9c95-161c60a9bfe6\n", - "[2023-10-06 18:33:11,993] {make_polygons.py:319} INFO - Generating water body polygons for dataset 3ad86dbf-4c1d-515a-8e02-2ba7664edcfc\n", - "[2023-10-06 18:33:15,117] {make_polygons.py:319} INFO - Generating water body polygons for dataset 3be3887b-db05-527f-91c3-c9f2d201460e\n", - "[2023-10-06 18:33:18,173] {make_polygons.py:319} INFO - Generating water body polygons for dataset 5c5a792d-7129-5345-89b8-ee159eeb8357\n", - "[2023-10-06 18:33:21,749] {make_polygons.py:319} INFO - Generating water body polygons for dataset dbce025d-59f7-59d3-b2a7-497855215baf\n", - "[2023-10-06 18:33:24,964] {make_polygons.py:319} INFO - Generating water body polygons for dataset d4303ecb-3745-530f-8cec-3dafc6bb4db2\n", - "[2023-10-06 18:33:28,524] {make_polygons.py:319} INFO - Generating water body polygons for dataset c5c569f3-8b68-5570-85b5-7e9a1eb6812f\n", - "[2023-10-06 18:33:31,777] {make_polygons.py:319} INFO - Generating water body polygons for dataset 238265f4-9e39-5b2c-848f-f4409ef34b9d\n", - "[2023-10-06 18:33:34,986] {make_polygons.py:319} INFO - Generating water body polygons for dataset d5e0cd25-cb65-53fd-bb0c-745896452731\n", - "[2023-10-06 18:33:39,256] {make_polygons.py:319} INFO - Generating water body polygons for dataset c8fdb171-55c0-5629-b5dd-059aebdc875e\n", - "[2023-10-06 18:33:42,539] {make_polygons.py:319} INFO - Generating water body polygons for dataset 32e41787-3dde-5db9-bee5-c29f0e3a93ed\n", - "[2023-10-06 18:33:45,468] {make_polygons.py:319} INFO - Generating water body polygons for dataset 02c57798-157c-5827-803d-184e0a369d32\n", - "[2023-10-06 18:33:48,420] {make_polygons.py:319} INFO - Generating water body polygons for dataset 8f9b6055-3e72-563f-83e2-1a50970558f5\n", - "[2023-10-06 18:33:51,261] {make_polygons.py:319} INFO - Generating water body polygons for dataset e24f43f8-517d-52c1-9f00-4a83783900bd\n", - "[2023-10-06 18:33:55,008] {make_polygons.py:319} INFO - Generating water body polygons for dataset 6354d0eb-5489-55ed-81cf-b0449f49bea6\n", - "[2023-10-06 18:33:58,276] {make_polygons.py:319} INFO - Generating water body polygons for dataset aea08a1f-9933-5f44-87d3-d0714c52725b\n", - "[2023-10-06 18:34:01,704] {make_polygons.py:319} INFO - Generating water body polygons for dataset f3ffdb67-591d-5da9-9138-817e25487230\n", - "[2023-10-06 18:34:04,962] {make_polygons.py:319} INFO - Generating water body polygons for dataset 36537f53-c09c-5c90-9e5d-e8afcb42f16e\n", - "[2023-10-06 18:34:08,035] {make_polygons.py:319} INFO - Generating water body polygons for dataset 89497671-c686-53da-afce-e6c83afaf3ee\n", - "[2023-10-06 18:34:11,349] {make_polygons.py:319} INFO - Generating water body polygons for dataset d0b13c5b-7421-520f-9493-6e364f19728a\n", - 
"[2023-10-06 18:34:14,570] {make_polygons.py:319} INFO - Generating water body polygons for dataset 9e9969aa-bbdb-534f-bb77-3d62aeb27223\n", - "[2023-10-06 18:34:17,608] {make_polygons.py:319} INFO - Generating water body polygons for dataset e4ba0b20-10af-58c9-ab42-51a71c2c6dbb\n", - "[2023-10-06 18:34:20,852] {make_polygons.py:319} INFO - Generating water body polygons for dataset a7c4eeae-247c-5668-b248-20497e5e5345\n", - "[2023-10-06 18:34:24,336] {make_polygons.py:319} INFO - Generating water body polygons for dataset e1c2b0df-67fe-5a51-96d3-6a43b1c7dccb\n", - "[2023-10-06 18:34:27,390] {make_polygons.py:319} INFO - Generating water body polygons for dataset 19514ec1-882f-579c-8238-6db6f518a7fc\n", - "[2023-10-06 18:34:32,766] {make_polygons.py:319} INFO - Generating water body polygons for dataset a841d9bf-2afe-5582-aa0a-97803e2689fd\n" - ] - } - ], - "source": [ - "# For each dataset id, threshold the scene to generate the primary and secondary threshold\n", - "# waterbody polygons.\n", - "for dataset_id in dataset_ids:\n", - " primary_threshold_polygons_fp = os.path.join(\n", - " polygons_from_thresholds_dir, f\"{dataset_id}_primary_threshold_polygons.parquet\"\n", - " )\n", - " secondary_threshold_polygons_fp = os.path.join(\n", - " polygons_from_thresholds_dir, f\"{dataset_id}_secondary_threshold_polygons.parquet\"\n", - " )\n", - "\n", - " if not overwrite:\n", - " _log.info(\n", - " f\"Checking existence of {primary_threshold_polygons_fp} and {secondary_threshold_polygons_fp}\"\n", - " )\n", - " exists = check_file_exists(primary_threshold_polygons_fp) and check_file_exists(\n", - " secondary_threshold_polygons_fp\n", - " )\n", - "\n", - " if overwrite or not exists:\n", - " (\n", - " primary_threshold_polygons,\n", - " secondary_threshold_polygons,\n", - " ) = get_polygons_from_dataset_with_land_sea_mask_filtering(\n", - " dataset_id=dataset_id,\n", - " dask_chunks=dask_chunks,\n", - " resolution=resolution,\n", - " output_crs=output_crs,\n", - " min_valid_observations=minimum_valid_observations,\n", - " primary_threshold=primary_threshold,\n", - " secondary_threshold=secondary_threshold,\n", - " dc=dc,\n", - " land_sea_mask_fp=land_sea_mask_fp,\n", - " filter_land_sea_mask=filter_hydrosheds_land_mask\n", - " )\n", - " # Write the polygons to parquet files.\n", - " primary_threshold_polygons.to_parquet(primary_threshold_polygons_fp)\n", - " secondary_threshold_polygons.to_parquet(secondary_threshold_polygons_fp)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/03_MergePolygonsAtTileBoundaries.ipynb b/notebooks/03_MergePolygonsAtTileBoundaries.ipynb deleted file mode 100644 index 28180963..00000000 --- a/notebooks/03_MergePolygonsAtTileBoundaries.ipynb +++ /dev/null @@ -1,303 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "26660d9f-cd3c-4d3c-903e-4180eaec0532", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import os\n", - "\n", - "import click\n", - "import geopandas as gpd\n", - "import pandas as pd\n", - "from deafrica_waterbodies.cli.logs import logging_setup\n", - "from deafrica_waterbodies.io import 
find_parquet_files\n", - "from deafrica_waterbodies.make_polygons import merge_polygons_at_dataset_boundaries" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "6b561b0c-0d09-4114-a788-f7a6813b69ed", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# These are the default AWS configurations for the Analysis Sandbox.\n", - "# that are set in the environmnet variables.\n", - "aws_default_config = {\n", - " # \"AWS_NO_SIGN_REQUEST\": \"YES\",\n", - " \"AWS_SECRET_ACCESS_KEY\": \"fake\",\n", - " \"AWS_ACCESS_KEY_ID\": \"fake\",\n", - "}\n", - "\n", - "# To access public bucket, need to remove the AWS credentials in\n", - "# the environment variables or the following error will occur.\n", - "# PermissionError: The AWS Access Key Id you provided does not exist in our records.\n", - "\n", - "for key in aws_default_config.keys():\n", - " if key in os.environ:\n", - " del os.environ[key]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "d8a96c75-bf14-49c5-a633-5f24cd7ec93f", - "metadata": {}, - "outputs": [], - "source": [ - "verbose = 1\n", - "output_directory = \"s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "a7621524-43f3-4c9f-ab47-91c109d5e96c", - "metadata": {}, - "outputs": [], - "source": [ - "# Set up logger.\n", - "logging_setup(verbose=verbose)\n", - "_log = logging.getLogger(__name__)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d930428c-fa94-46d3-b829-43263dc25e72", - "metadata": {}, - "outputs": [], - "source": [ - "# Support pathlib paths.\n", - "output_directory = str(output_directory)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b64b8f71-6a9d-495a-adb0-7d146df5a0c9", - "metadata": {}, - "outputs": [], - "source": [ - "# Directory containing the water body polygons generated from\n", - "# thresholding WOfS All time summary datasets.\n", - "polygons_from_thresholds_dir = os.path.join(output_directory, \"polygons_from_thresholds\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d199cc4c-5d68-4480-9490-d3d46f34fd1b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:46:36,671] {credentials.py:620} INFO - Found credentials in shared credentials file: ~/.aws/credentials\n", - "[2023-10-06 18:46:36,869] {2643269420.py:3} INFO - Found 74 parquet files for the primary threshold polygons.\n" - ] - } - ], - "source": [ - "# Find all parquet files for the primary threshold.\n", - "primary_threshold_polygons_paths = find_parquet_files(path=polygons_from_thresholds_dir, pattern=\".*primary.*\")\n", - "_log.info(f\"Found {len(primary_threshold_polygons_paths)} parquet files for the primary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4db44721-ab19-487b-9ede-cb8fb36b986d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:46:36,874] {370454188.py:2} INFO - Loading the primary threshold polygons parquet files..\n", - "[2023-10-06 18:46:43,989] {370454188.py:9} INFO - Found 42891 primary threshold polygons.\n" - ] - } - ], - "source": [ - "# Load all the primary threshold polygons into a single GeoDataFrame.\n", - "_log.info(\"Loading the primary threshold polygons parquet files..\")\n", - "primary_threshold_polygons_list = []\n", - "for path in primary_threshold_polygons_paths:\n", - " gdf = 
gpd.read_parquet(path)\n", - " primary_threshold_polygons_list.append(gdf)\n", - "\n", - "primary_threshold_polygons = pd.concat(primary_threshold_polygons_list, ignore_index=True)\n", - "_log.info(f\"Found {len(primary_threshold_polygons)} primary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "700846f1-d3f3-4add-93a3-5245879a7412", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:46:43,994] {592751505.py:1} INFO - Merging primary threshold waterbody polygons located at dataset/scene boundaries...\n", - "[2023-10-06 18:46:48,837] {592751505.py:3} INFO - Primary threshold polygons count 42750.\n" - ] - } - ], - "source": [ - "_log.info(\"Merging primary threshold waterbody polygons located at dataset/scene boundaries...\")\n", - "primary_threshold_polygons_merged = merge_polygons_at_dataset_boundaries(primary_threshold_polygons)\n", - "_log.info(f\"Primary threshold polygons count {len(primary_threshold_polygons_merged)}.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "80ff255c-48fd-47e9-9304-35ca88971bf7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:46:48,843] {2868347883.py:1} INFO - Writing primary threshold polygons merged at dataset boundaries to disk..\n", - "[2023-10-06 18:46:49,192] {2868347883.py:7} INFO - Polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/primary_threshold_polygons_merged_at_ds_boundaries.parquet\n" - ] - } - ], - "source": [ - "_log.info(\"Writing primary threshold polygons merged at dataset boundaries to disk..\")\n", - "primary_threshold_polygons_output_fp = os.path.join(\n", - " output_directory, \"primary_threshold_polygons_merged_at_ds_boundaries.parquet\"\n", - ")\n", - "\n", - "primary_threshold_polygons_merged.to_parquet(primary_threshold_polygons_output_fp)\n", - "_log.info(f\"Polygons written to {primary_threshold_polygons_output_fp}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "3cc50c6c-aeab-4322-8192-422e66b7999f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:46:49,257] {1989755468.py:3} INFO - Found 74 parquet files for the secondary threshold polygons.\n" - ] - } - ], - "source": [ - "# Find all parquet files for the secondary threshold.\n", - "secondary_threshold_polygons_paths = find_parquet_files(path=polygons_from_thresholds_dir, pattern=\".*secondary.*\")\n", - "_log.info(f\"Found {len(secondary_threshold_polygons_paths)} parquet files for the secondary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "c36e5f38-72f4-49d0-bad8-d8ce70491ff5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:46:49,263] {4278796675.py:2} INFO - Loading the secondary threshold polygons parquet files...\n", - "[2023-10-06 18:46:56,650] {4278796675.py:9} INFO - Found 81839 secondary threshold polygons.\n" - ] - } - ], - "source": [ - "# Load all the secondary threshold polygons into a single GeoDataFrame.\n", - "_log.info(\"Loading the secondary threshold polygons parquet files...\")\n", - "secondary_threshold_polygons_list = []\n", - "for path in secondary_threshold_polygons_paths:\n", - " gdf = gpd.read_parquet(path)\n", - " secondary_threshold_polygons_list.append(gdf)\n", - "\n", - "secondary_threshold_polygons = 
pd.concat(secondary_threshold_polygons_list, ignore_index=True)\n", - "_log.info(f\"Found {len(secondary_threshold_polygons)} secondary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "f3586fd3-7f27-4574-8af8-2a689b7f25c7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:46:56,655] {4276071991.py:1} INFO - Merging secondary threshold waterbody polygons located at dataset/scene boundaries...\n", - "[2023-10-06 18:47:07,515] {4276071991.py:3} INFO - Secondary threshold polygons count 81635.\n" - ] - } - ], - "source": [ - "_log.info(\"Merging secondary threshold waterbody polygons located at dataset/scene boundaries...\")\n", - "secondary_threshold_polygons_merged = merge_polygons_at_dataset_boundaries(secondary_threshold_polygons)\n", - "_log.info(f\"Secondary threshold polygons count {len(secondary_threshold_polygons_merged)}.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "7e0dd3b6-c6ef-42e9-aa48-857b7efe69cf", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:47:07,520] {3813904110.py:1} INFO - Writing secondary threshold polygons merged at dataset boundaries to disk..\n", - "[2023-10-06 18:47:08,095] {3813904110.py:8} INFO - Polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/secondary_threshold_polygons_merged_at_ds_boundaries.parquet\n" - ] - } - ], - "source": [ - "_log.info(\"Writing secondary threshold polygons merged at dataset boundaries to disk..\")\n", - "secondary_threshold_polygons_output_fp = os.path.join(\n", - " output_directory, \"secondary_threshold_polygons_merged_at_ds_boundaries.parquet\"\n", - ")\n", - "\n", - "secondary_threshold_polygons_merged.to_parquet(secondary_threshold_polygons_output_fp)\n", - "\n", - "_log.info(f\"Polygons written to {secondary_threshold_polygons_output_fp}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/04_ApplyFilters.ipynb b/notebooks/04_ApplyFilters.ipynb deleted file mode 100644 index 8c8ad449..00000000 --- a/notebooks/04_ApplyFilters.ipynb +++ /dev/null @@ -1,406 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "cdabd764-0960-4bcd-84f5-e007852c3869", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import math\n", - "import os\n", - "\n", - "import click\n", - "import geopandas as gpd\n", - "from deafrica_waterbodies.cli.logs import logging_setup\n", - "from deafrica_waterbodies.filters import (\n", - " filter_by_area,\n", - " filter_using_land_sea_mask,\n", - " filter_using_major_rivers_mask,\n", - " filter_using_urban_mask,\n", - " merge_primary_and_secondary_threshold_polygons,\n", - " split_large_polygons,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3af67100-ff19-440f-b6e5-fe5724244ee5", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# These are the default AWS configurations for the Analysis Sandbox.\n", - "# that are set in the environmnet variables.\n", - 
"aws_default_config = {\n", - " # \"AWS_NO_SIGN_REQUEST\": \"YES\",\n", - " \"AWS_SECRET_ACCESS_KEY\": \"fake\",\n", - " \"AWS_ACCESS_KEY_ID\": \"fake\",\n", - "}\n", - "\n", - "# To access public bucket, need to remove the AWS credentials in\n", - "# the environment variables or the following error will occur.\n", - "# PermissionError: The AWS Access Key Id you provided does not exist in our records.\n", - "\n", - "for key in aws_default_config.keys():\n", - " if key in os.environ:\n", - " del os.environ[key]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "7acc8246-b844-4e92-bc15-204bb54702e8", - "metadata": {}, - "outputs": [], - "source": [ - "verbose = 1\n", - "output_directory = \"s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2\"\n", - "min_polygon_size = 4500 # 5 pixels\n", - "max_polygon_size = math.inf\n", - "land_sea_mask_fp = \"\"\n", - "major_rivers_mask_fp = \"\"\n", - "urban_mask_fp = \"\"\n", - "handle_large_polygons = \"nothing\"\n", - "pp_test_threshold = 0.005" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "32a30849-c86a-4b8a-947c-cc12245d618f", - "metadata": {}, - "outputs": [], - "source": [ - "# Set up logger.\n", - "logging_setup(verbose=verbose)\n", - "_log = logging.getLogger(__name__)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "1670f7c9-b5bf-4c41-b1c7-150da3728985", - "metadata": {}, - "outputs": [], - "source": [ - "# Support pathlib paths.\n", - "output_directory = str(output_directory)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e1a0e9e0-2af9-473c-bdd2-65b4d64032f2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:47:20,768] {7015612.py:2} INFO - Loading primary and secondary threshold polygons...\n", - "[2023-10-06 18:47:21,486] {7015612.py:13} INFO - Primary threshold polygons count 42750.\n", - "[2023-10-06 18:47:21,487] {7015612.py:14} INFO - Secondary threshold polygons count 81635.\n" - ] - } - ], - "source": [ - "# Load the primary and secondary threshold polygons\n", - "_log.info(\"Loading primary and secondary threshold polygons...\")\n", - "\n", - "primary_threshold_polygons_fp = os.path.join(\n", - " output_directory, \"primary_threshold_polygons_merged_at_ds_boundaries.parquet\"\n", - ")\n", - "secondary_threshold_polygons_fp = os.path.join(\n", - " output_directory, \"secondary_threshold_polygons_merged_at_ds_boundaries.parquet\"\n", - ")\n", - "primary_threshold_polygons = gpd.read_parquet(primary_threshold_polygons_fp)\n", - "secondary_threshold_polygons = gpd.read_parquet(secondary_threshold_polygons_fp)\n", - "\n", - "_log.info(f\"Primary threshold polygons count {len(primary_threshold_polygons)}.\")\n", - "_log.info(f\"Secondary threshold polygons count {len(secondary_threshold_polygons)}.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "8d08d600-b0ab-41c8-a2fa-43a6015609d5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:47:21,493] {filters.py:127} INFO - Filtering primary threshold polygons by minimum area 4500 and max area inf...\n", - "[2023-10-06 18:47:21,506] {filters.py:139} INFO - Filtered out 31040 primary threshold polygons.\n", - "[2023-10-06 18:47:21,506] {filters.py:146} INFO - Filtering secondary threshold polygons by max area inf...\n", - "[2023-10-06 18:47:21,530] {filters.py:153} INFO - Filtered out 0 secondary threshold polygons.\n", - "[2023-10-06 18:47:21,837] 
{87560456.py:15} INFO - Area filtered primary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/area_filtered_primary_threshold_polygons.parquet\n", - "[2023-10-06 18:47:22,393] {87560456.py:21} INFO - Area filtered secondary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/area_filtered_secondary_threshold_polygons.parquet\n" - ] - } - ], - "source": [ - "(\n", - " area_filtered_primary_threshold_polygons,\n", - " area_filtered_secondary_threshold_polygons,\n", - ") = filter_by_area(\n", - " primary_threshold_polygons=primary_threshold_polygons,\n", - " secondary_threshold_polygons=secondary_threshold_polygons,\n", - " min_polygon_size=min_polygon_size,\n", - " max_polygon_size=max_polygon_size,\n", - ")\n", - "\n", - "area_filtered_primary_threshold_polygons_fp = os.path.join(\n", - " output_directory, \"area_filtered_primary_threshold_polygons.parquet\"\n", - ")\n", - "area_filtered_primary_threshold_polygons.to_parquet(area_filtered_primary_threshold_polygons_fp)\n", - "_log.info(f\"Area filtered primary threshold polygons written to {area_filtered_primary_threshold_polygons_fp}\")\n", - "\n", - "area_filtered_secondary_threshold_polygons_fp = os.path.join(\n", - " output_directory, \"area_filtered_secondary_threshold_polygons.parquet\"\n", - ")\n", - "area_filtered_secondary_threshold_polygons.to_parquet(area_filtered_secondary_threshold_polygons_fp)\n", - "_log.info(f\"Area filtered secondary threshold polygons written to {area_filtered_secondary_threshold_polygons_fp}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "aa18c51c-3e1a-44a9-80ce-17fcce0422fe", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:47:22,399] {filters.py:225} INFO - Skipping filtering out ocean polygons step.\n", - "[2023-10-06 18:47:22,743] {2722900615.py:12} INFO - Ocean filtered primary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/inland_primary_threshold_polygons.parquet\n", - "[2023-10-06 18:47:23,347] {2722900615.py:16} INFO - Ocean filtered secondary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/inland_secondary_threshold_polygons.parquet\n" - ] - } - ], - "source": [ - "(\n", - " inland_primary_threshold_polygons,\n", - " inland_secondary_threshold_polygons,\n", - ") = filter_using_land_sea_mask(\n", - " primary_threshold_polygons=area_filtered_primary_threshold_polygons,\n", - " secondary_threshold_polygons=area_filtered_secondary_threshold_polygons,\n", - " land_sea_mask_fp=land_sea_mask_fp,\n", - ")\n", - "\n", - "inland_primary_threshold_polygons_fp = os.path.join(output_directory, \"inland_primary_threshold_polygons.parquet\")\n", - "inland_primary_threshold_polygons.to_parquet(inland_primary_threshold_polygons_fp)\n", - "_log.info(f\"Ocean filtered primary threshold polygons written to {inland_primary_threshold_polygons_fp}\")\n", - "\n", - "inland_secondary_threshold_polygons_fp = os.path.join(output_directory, \"inland_secondary_threshold_polygons.parquet\")\n", - "inland_secondary_threshold_polygons.to_parquet(inland_secondary_threshold_polygons_fp)\n", - "_log.info(f\"Ocean filtered secondary threshold polygons written to {inland_secondary_threshold_polygons_fp}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "3f52b2bf-5fda-4eec-bddd-9603fb3775dc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "[2023-10-06 18:47:23,353] {filters.py:294} INFO - Skipping filtering out CBDs step.\n", - "[2023-10-06 18:47:23,662] {2340062003.py:15} INFO - CBDs filtered primary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/cbd_filtered_primary_threshold_polygons.parquet\n", - "[2023-10-06 18:47:24,170] {2340062003.py:21} INFO - CBDs filtered secondary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/cbd_filtered_secondary_threshold_polygons.parquet\n" - ] - } - ], - "source": [ - "(\n", - " cbd_filtered_primary_threshold_polygons,\n", - " cbd_filtered_secondary_threshold_polygons,\n", - ") = filter_using_urban_mask(\n", - " primary_threshold_polygons=inland_primary_threshold_polygons,\n", - " secondary_threshold_polygons=inland_secondary_threshold_polygons,\n", - " urban_mask_fp=urban_mask_fp,\n", - ")\n", - "\n", - "\n", - "cbd_filtered_primary_threshold_polygons_fp = os.path.join(\n", - " output_directory, \"cbd_filtered_primary_threshold_polygons.parquet\"\n", - ")\n", - "cbd_filtered_primary_threshold_polygons.to_parquet(cbd_filtered_primary_threshold_polygons_fp)\n", - "_log.info(f\"CBDs filtered primary threshold polygons written to {cbd_filtered_primary_threshold_polygons_fp}\")\n", - "\n", - "cbd_filtered_secondary_threshold_polygons_fp = os.path.join(\n", - " output_directory, \"cbd_filtered_secondary_threshold_polygons.parquet\"\n", - ")\n", - "cbd_filtered_secondary_threshold_polygons.to_parquet(cbd_filtered_secondary_threshold_polygons_fp)\n", - "_log.info(f\"CBDs filtered secondary threshold polygons written to {cbd_filtered_secondary_threshold_polygons_fp}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "d65e3f21-228c-4856-b065-50157421bce0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:47:24,176] {filters.py:319} INFO - Merging the primary threshold and secondary threshold polygons...\n", - "[2023-10-06 18:49:10,808] {filters.py:340} INFO - Waterbody polygons count after merge: 7164.\n", - "[2023-10-06 18:49:11,201] {:9} INFO - Merged waterbody polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/merged_polygons.parquet\n", - "CPU times: user 1min 46s, sys: 463 ms, total: 1min 46s\n", - "Wall time: 1min 47s\n" - ] - } - ], - "source": [ - "%%time\n", - "# Merge the primary and secondary threshold polygons.\n", - "merged_polygons = merge_primary_and_secondary_threshold_polygons(\n", - " primary_threshold_polygons=cbd_filtered_primary_threshold_polygons,\n", - " secondary_threshold_polygons=cbd_filtered_secondary_threshold_polygons,\n", - ")\n", - "\n", - "merged_polygons_fp = os.path.join(output_directory, \"merged_polygons.parquet\")\n", - "merged_polygons.to_parquet(merged_polygons_fp)\n", - "_log.info(f\"Merged waterbody polygons written to {merged_polygons_fp}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "a2c4dc2a-e6d6-46f2-becf-912802d68344", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:49:11,209] {filters.py:384} INFO - Skipping filtering out major rivers polygons step.\n", - "[2023-10-06 18:49:11,615] {1532569761.py:7} INFO - Major rivers filtered polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/major_rivers_filtered_polygons.parquet\n" - ] - } - ], - "source": [ - "major_rivers_filtered_polygons = 
filter_using_major_rivers_mask(\n", - " waterbody_polygons=merged_polygons, major_rivers_mask_fp=major_rivers_mask_fp\n", - ")\n", - "\n", - "major_rivers_filtered_polygons_fp = os.path.join(output_directory, \"major_rivers_filtered_polygons.parquet\")\n", - "major_rivers_filtered_polygons.to_parquet(major_rivers_filtered_polygons_fp)\n", - "_log.info(f\"Major rivers filtered polygons written to {major_rivers_filtered_polygons_fp}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "c2f56b2e-b820-41cb-a8ef-5ba96291d15a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:49:11,639] {filters.py:458} INFO - You have chosen not to split large polygons. If you meant to use this option, please select one of the following methods: ['erode-dilate-v1', 'erode-dilate-v2'].\n", - "[2023-10-06 18:49:11,640] {2125941619.py:5} INFO - Waterbody polygons count after splitting large polygons 7164.\n", - "[2023-10-06 18:49:12,032] {2125941619.py:9} INFO - Waterbodies with large polygons handled written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/large_polygons_handled_nothing.parquet\n" - ] - } - ], - "source": [ - "# Handle large polygons.\n", - "large_polygons_handled = split_large_polygons(\n", - " waterbody_polygons=major_rivers_filtered_polygons, pp_thresh=pp_test_threshold, method=handle_large_polygons\n", - ")\n", - "_log.info(f\"Waterbody polygons count after splitting large polygons {len(large_polygons_handled)}.\")\n", - "\n", - "large_polygons_handled_fp = os.path.join(output_directory, f\"large_polygons_handled_{handle_large_polygons}.parquet\")\n", - "large_polygons_handled.to_parquet(large_polygons_handled_fp)\n", - "_log.info(f\"Waterbodies with large polygons handled written to {large_polygons_handled_fp}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "f08e4da4-ba78-48bb-85c6-c4e044cc5f14", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:49:12,042] {filters.py:127} INFO - Filtering primary threshold polygons by minimum area 4500 and max area inf...\n", - "[2023-10-06 18:49:12,054] {filters.py:139} INFO - Filtered out 621 primary threshold polygons.\n", - "[2023-10-06 18:49:12,466] {1618256413.py:12} INFO - Area filtered polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/area_filtered_large_polygons_handled.parquet\n" - ] - } - ], - "source": [ - "# Reapply the size filtering, just to check that all of the split and filtered waterbodies are\n", - "# still in the size range we want.\n", - "area_filtered_large_polygons_handled, _ = filter_by_area(\n", - " primary_threshold_polygons=large_polygons_handled,\n", - " secondary_threshold_polygons=None,\n", - " min_polygon_size=min_polygon_size,\n", - " max_polygon_size=max_polygon_size,\n", - ")\n", - "\n", - "area_filtered_large_polygons_handled_fp = os.path.join(output_directory, \"area_filtered_large_polygons_handled.parquet\")\n", - "area_filtered_large_polygons_handled.to_parquet(area_filtered_large_polygons_handled_fp)\n", - "_log.info(f\"Area filtered polygons written to {area_filtered_large_polygons_handled_fp}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "f47fd892-24d0-491b-873e-05ebefb3014d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:49:12,873] {1197084677.py:7} INFO - Filtered waterbody polygons written to 
s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/filtered_polygons.parquet\n" - ] - } - ], - "source": [ - "# Return a GeoDataFrame with the geometry column only.\n", - "filtered_polygons = gpd.GeoDataFrame(\n", - " geometry=area_filtered_large_polygons_handled[\"geometry\"], crs=area_filtered_large_polygons_handled.crs\n", - ")\n", - "filtered_polygons_fp = os.path.join(output_directory, \"filtered_polygons.parquet\")\n", - "filtered_polygons.to_parquet(filtered_polygons_fp)\n", - "_log.info(f\"Filtered waterbody polygons written to {filtered_polygons_fp}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/05_WriteFinalWaterbodies.ipynb b/notebooks/05_WriteFinalWaterbodies.ipynb deleted file mode 100644 index 7814a559..00000000 --- a/notebooks/05_WriteFinalWaterbodies.ipynb +++ /dev/null @@ -1,190 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "41ab3fc2-410a-4d68-b108-de4105a606b8", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import os\n", - "\n", - "import click\n", - "import geopandas as gpd\n", - "\n", - "from deafrica_waterbodies.attributes import (\n", - " add_area_and_perimeter_attributes,\n", - " add_timeseries_attribute,\n", - " assign_unique_ids,\n", - ")\n", - "from deafrica_waterbodies.cli.logs import logging_setup\n", - "from deafrica_waterbodies.io import write_waterbodies_to_file" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "f84dce1e-c5a4-4c38-bad6-a4c97efaf64b", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# These are the default AWS configurations for the Analysis Sandbox.\n", - "# that are set in the environmnet variables.\n", - "aws_default_config = {\n", - " # \"AWS_NO_SIGN_REQUEST\": \"YES\",\n", - " \"AWS_SECRET_ACCESS_KEY\": \"fake\",\n", - " \"AWS_ACCESS_KEY_ID\": \"fake\",\n", - "}\n", - "\n", - "# To access public bucket, need to remove the AWS credentials in\n", - "# the environment variables or the following error will occur.\n", - "# PermissionError: The AWS Access Key Id you provided does not exist in our records.\n", - "\n", - "for key in aws_default_config.keys():\n", - " if key in os.environ:\n", - " del os.environ[key]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4da721f5-a525-48b5-ba43-35ad774c0e15", - "metadata": {}, - "outputs": [], - "source": [ - "verbose = 1\n", - "output_directory = \"s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2\"\n", - "product_version = \"0.0.1\"\n", - "timeseries_bucket = \"deafrica-waterbodies-dev\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2f860555-2a8d-461d-87bc-fe82f5dfd431", - "metadata": {}, - "outputs": [], - "source": [ - "# Set up logger.\n", - "logging_setup(verbose=verbose)\n", - "_log = logging.getLogger(__name__)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "09a1670d-8ee1-4bf3-a7dd-36ef052f8ad4", - "metadata": {}, - "outputs": [], - "source": [ - "# Support pathlib paths.\n", - "output_directory = str(output_directory)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": 
"4a163bc8-097a-4719-9aeb-4c5cd0f5f5e1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:54:46,570] {1292984903.py:1} INFO - Loading filtered waterbody polygons...\n", - "[2023-10-06 18:54:46,986] {1292984903.py:4} INFO - Waterbody polygons count 6543.\n" - ] - } - ], - "source": [ - "_log.info(\"Loading filtered waterbody polygons...\")\n", - "filtered_polygons_fp = os.path.join(output_directory, \"filtered_polygons.parquet\")\n", - "filtered_polygons = gpd.read_parquet(filtered_polygons_fp)\n", - "_log.info(f\"Waterbody polygons count {len(filtered_polygons)}.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "3ee1f7d1-fc1a-48d1-9458-921ff812359d", - "metadata": {}, - "outputs": [], - "source": [ - "waterbodies_gdf = assign_unique_ids(polygons=filtered_polygons)\n", - "waterbodies_gdf = add_area_and_perimeter_attributes(polygons=waterbodies_gdf)\n", - "waterbodies_gdf = add_timeseries_attribute(\n", - " polygons=waterbodies_gdf,\n", - " product_version=product_version,\n", - " timeseries_bucket=timeseries_bucket,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "1aede9ef-3a24-4b89-bcab-f247f6eb66a2", - "metadata": {}, - "outputs": [], - "source": [ - "# Reproject to EPSG:4326\n", - "waterbodies_gdf_4326 = waterbodies_gdf.to_crs(\"EPSG:4326\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c2363caf-d318-4dcf-a50c-9e995503ad43", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 18:54:48,828] {collection.py:558} WARNING - Value 201189599.952600002 of field area_m2 of feature 1322 not successfully written. Possibly due to too larger number with respect to field width\n", - "[2023-10-06 18:54:48,881] {collection.py:558} WARNING - Value 216761399.942699999 of field area_m2 of feature 1455 not successfully written. Possibly due to too larger number with respect to field width\n", - "[2023-10-06 18:54:50,455] {collection.py:558} WARNING - Value 1357780499.61680007 of field area_m2 of feature 1785 not successfully written. Possibly due to too larger number with respect to field width\n", - "[2023-10-06 18:54:50,732] {collection.py:558} WARNING - Value 137478599.9551 of field area_m2 of feature 2681 not successfully written. Possibly due to too larger number with respect to field width\n", - "[2023-10-06 18:54:50,967] {collection.py:558} WARNING - Value 185007599.950700015 of field area_m2 of feature 3620 not successfully written. Possibly due to too larger number with respect to field width\n", - "[2023-10-06 18:54:50,988] {collection.py:558} WARNING - Value 186512399.949900001 of field area_m2 of feature 3682 not successfully written. Possibly due to too larger number with respect to field width\n", - "[2023-10-06 18:54:51,213] {collection.py:558} WARNING - Value 460308599.762600005 of field area_m2 of feature 4519 not successfully written. Possibly due to too larger number with respect to field width\n", - "[2023-10-06 18:54:51,345] {collection.py:558} WARNING - Value 513510299.680899978 of field area_m2 of feature 4817 not successfully written. Possibly due to too larger number with respect to field width\n", - "[2023-10-06 18:54:51,388] {collection.py:558} WARNING - Value 106005599.945600003 of field area_m2 of feature 4961 not successfully written. 
Possibly due to too larger number with respect to field width\n", - "[2023-10-06 18:54:52,416] {io.py:238} INFO - Waterbody polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/waterbodiesv0.shp\n" - ] - } - ], - "source": [ - "# Write to disk.\n", - "write_waterbodies_to_file(\n", - " waterbodies_gdf=waterbodies_gdf_4326,\n", - " product_version=product_version,\n", - " output_directory=output_directory,\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/1_123_GenerateWaterbodiesUsingTiles.ipynb b/notebooks/1_123_GenerateWaterbodiesUsingTiles.ipynb deleted file mode 100644 index 0c7ba965..00000000 --- a/notebooks/1_123_GenerateWaterbodiesUsingTiles.ipynb +++ /dev/null @@ -1,538 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "e6b0a31b-28d9-41d4-b59b-4ccb900e2528", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import os\n", - "\n", - "import click\n", - "import datacube\n", - "import fsspec\n", - "import geopandas as gpd\n", - "import pandas as pd\n", - "from deafrica_waterbodies.cli.logs import logging_setup\n", - "from deafrica_waterbodies.io import (\n", - " check_dir_exists,\n", - " check_file_exists,\n", - " check_if_s3_uri,\n", - " find_parquet_files,\n", - ")\n", - "from deafrica_waterbodies.make_polygons import (\n", - " check_wetness_thresholds,\n", - " get_polygons_from_tile,\n", - " merge_polygons_at_tile_boundaries\n", - ")\n", - "from deafrica_waterbodies.tiling import (\n", - " filter_tiles,\n", - " get_tiles_ids,\n", - " tile_wofs_ls_summary_alltime,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "39c574ee-39c4-4931-b807-5d3f97900cd2", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# These are the default AWS configurations for the Analysis Sandbox.\n", - "# that are set in the environmnet variables.\n", - "aws_default_config = {\n", - " # \"AWS_NO_SIGN_REQUEST\": \"YES\",\n", - " \"AWS_SECRET_ACCESS_KEY\": \"fake\",\n", - " \"AWS_ACCESS_KEY_ID\": \"fake\",\n", - "}\n", - "\n", - "# To access public bucket, need to remove the AWS credentials in\n", - "# the environment variables or the following error will occur.\n", - "# PermissionError: The AWS Access Key Id you provided does not exist in our records.\n", - "\n", - "for key in aws_default_config.keys():\n", - " if key in os.environ:\n", - " del os.environ[key]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "2e50efa3-b5ba-44b3-b164-608d17e6413e", - "metadata": {}, - "outputs": [], - "source": [ - "verbose = 1\n", - "\n", - "aoi_vector_file = \"data/SenegalBasin.geojson\"\n", - "tile_size_factor = 2\n", - "num_workers = 16\n", - "\n", - "primary_threshold: float = 0.1\n", - "secondary_threshold: float = 0.05\n", - "minimum_valid_observations: int = 128\n", - "output_directory = \"s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile3\"\n", - "overwrite = True" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e2dfbcc2-fd39-4900-815c-09a11c448b8d", - "metadata": {}, - "outputs": [], - "source": [ - "# Set up 
logger.\n", - "logging_setup(verbose=verbose)\n", - "_log = logging.getLogger(__name__)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "9d0742d7-6ca2-4286-9313-3efda5173f38", - "metadata": {}, - "outputs": [], - "source": [ - "# Support pathlib Paths.\n", - "aoi_vector_file = str(aoi_vector_file)\n", - "output_directory = str(output_directory)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "18db4ad8-7193-47e4-a4b4-44085924a793", - "metadata": {}, - "outputs": [], - "source": [ - "# Parameters to use when loading datasets.\n", - "dask_chunks = {\"x\": 3200, \"y\": 3200, \"time\": 1}" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "95a0ca66-2695-42c8-968a-523151d514c0", - "metadata": {}, - "outputs": [], - "source": [ - "# Load the area of interest as a GeoDataFrame.\n", - "if aoi_vector_file is not None:\n", - " try:\n", - " aoi_gdf = gpd.read_file(aoi_vector_file)\n", - " except Exception as error:\n", - " _log.exception(f\"Could not read the file {aoi_vector_file}\")\n", - " raise error\n", - "else:\n", - " aoi_gdf = None" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "ea4b4349-4d9c-4e55-aae0-b661ccde8da0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:01:14,843] {tiling.py:113} INFO - New tile size is (192000.0, 192000.0).\n", - "[2023-10-06 21:01:17,564] {tiling.py:132} INFO - Number of wofs_ls_summary_alltime tiles: 1188\n" - ] - } - ], - "source": [ - "# Tile the wofs_ls_summary_alltime product.\n", - "tiles, grid_workflow = tile_wofs_ls_summary_alltime(tile_size_factor)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "f134e90d-9c9a-47ca-86da-687846f2568c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1188it [00:03, 370.18it/s]\n" - ] - } - ], - "source": [ - "# Filter the tiles to the area of interest.\n", - "filtered_tile_ids = filter_tiles(tiles, aoi_gdf, num_workers)\n", - "filtered_tiles = {k: v for k, v in tiles.items() if k in filtered_tile_ids}" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "73816db0-7445-423a-a616-9966187a0ab6", - "metadata": {}, - "outputs": [], - "source": [ - "# Directory to write generated waterbody polygons to.\n", - "polygons_from_thresholds_dir = os.path.join(output_directory, \"polygons_from_thresholds\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "843a0c8b-9bc5-469e-8bc2-9a2a0588265b", - "metadata": {}, - "outputs": [], - "source": [ - "# Set the filesystem to use.\n", - "if check_if_s3_uri(polygons_from_thresholds_dir):\n", - " fs = fsspec.filesystem(\"s3\")\n", - "else:\n", - " fs = fsspec.filesystem(\"file\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "cbe40837-f598-4d7c-a3fd-6c1deb2b0589", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:01:21,159] {credentials.py:620} INFO - Found credentials in shared credentials file: ~/.aws/credentials\n" - ] - } - ], - "source": [ - "# Check if the directory exists. 
If it does not, create it.\n", - "if not check_dir_exists(polygons_from_thresholds_dir):\n", - " fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True)\n", - " _log.info(f\"Created directory {polygons_from_thresholds_dir}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "1c85a8d3-9543-4a57-90b8-3ac5e6cae71c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:01:21,380] {3616587049.py:3} INFO - We will be running a hybrid wetness threshold. \n", - "**You have set 0.1 as the primary threshold, which will define the location of the waterbody polygons \n", - " with 0.05 set as the supplementary threshold, which will define the extent/shape of the waterbody polygons.**\n" - ] - } - ], - "source": [ - "# Check if the wetness thresholds have been set correctly.\n", - "minimum_wet_thresholds = [secondary_threshold, primary_threshold]\n", - "_log.info(check_wetness_thresholds(minimum_wet_thresholds))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "26f37f02-ce60-43c9-9dc4-4c5298b15a81", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:01:21,386] {make_polygons.py:502} INFO - Generating water body polygons for tile (85, 49)\n", - "[2023-10-06 21:01:26,699] {make_polygons.py:502} INFO - Generating water body polygons for tile (85, 50)\n", - "[2023-10-06 21:01:29,594] {make_polygons.py:502} INFO - Generating water body polygons for tile (85, 46)\n", - "[2023-10-06 21:01:34,605] {make_polygons.py:502} INFO - Generating water body polygons for tile (85, 48)\n", - "[2023-10-06 21:01:39,030] {make_polygons.py:502} INFO - Generating water body polygons for tile (86, 47)\n", - "[2023-10-06 21:01:44,092] {make_polygons.py:502} INFO - Generating water body polygons for tile (86, 50)\n", - "[2023-10-06 21:01:47,102] {make_polygons.py:502} INFO - Generating water body polygons for tile (86, 48)\n", - "[2023-10-06 21:01:50,953] {make_polygons.py:502} INFO - Generating water body polygons for tile (86, 49)\n", - "[2023-10-06 21:01:54,239] {make_polygons.py:502} INFO - Generating water body polygons for tile (86, 46)\n", - "[2023-10-06 21:02:02,050] {make_polygons.py:502} INFO - Generating water body polygons for tile (87, 47)\n", - "[2023-10-06 21:02:41,724] {make_polygons.py:502} INFO - Generating water body polygons for tile (82, 49)\n", - "[2023-10-06 21:03:11,508] {make_polygons.py:502} INFO - Generating water body polygons for tile (82, 48)\n", - "[2023-10-06 21:03:16,938] {make_polygons.py:502} INFO - Generating water body polygons for tile (83, 48)\n", - "[2023-10-06 21:03:23,764] {make_polygons.py:502} INFO - Generating water body polygons for tile (83, 49)\n", - "[2023-10-06 21:03:43,410] {make_polygons.py:502} INFO - Generating water body polygons for tile (83, 47)\n", - "[2023-10-06 21:03:48,462] {make_polygons.py:502} INFO - Generating water body polygons for tile (84, 47)\n", - "[2023-10-06 21:03:52,210] {make_polygons.py:502} INFO - Generating water body polygons for tile (84, 48)\n", - "[2023-10-06 21:03:59,143] {make_polygons.py:502} INFO - Generating water body polygons for tile (84, 46)\n", - "[2023-10-06 21:04:03,700] {make_polygons.py:502} INFO - Generating water body polygons for tile (84, 45)\n", - "[2023-10-06 21:04:08,939] {make_polygons.py:502} INFO - Generating water body polygons for tile (84, 50)\n", - "[2023-10-06 21:04:12,047] {make_polygons.py:502} INFO - Generating water body polygons for tile (84, 49)\n", - 
"[2023-10-06 21:04:16,354] {make_polygons.py:502} INFO - Generating water body polygons for tile (85, 47)\n", - "[2023-10-06 21:04:21,000] {make_polygons.py:502} INFO - Generating water body polygons for tile (87, 48)\n" - ] - } - ], - "source": [ - "# Generate the first set of primary and secondary threhsold polygons for each of the tiles.\n", - "for tile in filtered_tiles.items():\n", - " tile_id = tile[0]\n", - " primary_threshold_polygons_fp = os.path.join(\n", - " polygons_from_thresholds_dir, f\"{tile_id[0]}_{tile_id[1]}_primary_threshold_polygons.parquet\"\n", - " )\n", - " secondary_threshold_polygons_fp = os.path.join(\n", - " polygons_from_thresholds_dir, f\"{tile_id[0]}_{tile_id[1]}_secondary_threshold_polygons.parquet\"\n", - " )\n", - "\n", - " if not overwrite:\n", - " _log.info(f\"Checking existence of {primary_threshold_polygons_fp} and {secondary_threshold_polygons_fp}\")\n", - " exists = check_file_exists(primary_threshold_polygons_fp) and check_file_exists(secondary_threshold_polygons_fp)\n", - "\n", - " if overwrite or not exists:\n", - " (\n", - " primary_threshold_polygons,\n", - " secondary_threshold_polygons,\n", - " ) = get_polygons_from_tile(\n", - " tile=tile,\n", - " grid_workflow=grid_workflow,\n", - " dask_chunks=dask_chunks,\n", - " min_valid_observations=minimum_valid_observations,\n", - " primary_threshold=primary_threshold,\n", - " secondary_threshold=secondary_threshold,\n", - " )\n", - " # Write the polygons to parquet files.\n", - " primary_threshold_polygons.to_parquet(primary_threshold_polygons_fp)\n", - " secondary_threshold_polygons.to_parquet(secondary_threshold_polygons_fp)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "27d552b8-32fd-427c-835b-92245f627d92", - "metadata": {}, - "outputs": [], - "source": [ - "# Get the extents for each tile.\n", - "crs = grid_workflow.grid_spec.crs\n", - "filtered_tiles_extents_geoms = [tile[1].geobox.extent.geom for tile in filtered_tiles.items()]\n", - "filtered_tiles_extents_gdf = gpd.GeoDataFrame(geometry=filtered_tiles_extents_geoms, crs=crs)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "5ab0c48e-feeb-47c0-a13b-fbe3ee940ad7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:04:28,249] {2643269420.py:3} INFO - Found 23 parquet files for the primary threshold polygons.\n" - ] - } - ], - "source": [ - "# Find all parquet files for the primary threshold.\n", - "primary_threshold_polygons_paths = find_parquet_files(path=polygons_from_thresholds_dir, pattern=\".*primary.*\")\n", - "_log.info(f\"Found {len(primary_threshold_polygons_paths)} parquet files for the primary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "bfeb0c77-93d9-4cc6-a2d2-2542875abf71", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:04:28,254] {370454188.py:2} INFO - Loading the primary threshold polygons parquet files..\n", - "[2023-10-06 21:04:30,616] {370454188.py:9} INFO - Found 58371 primary threshold polygons.\n" - ] - } - ], - "source": [ - "# Load all the primary threshold polygons into a single GeoDataFrame.\n", - "_log.info(\"Loading the primary threshold polygons parquet files..\")\n", - "primary_threshold_polygons_list = []\n", - "for path in primary_threshold_polygons_paths:\n", - " gdf = gpd.read_parquet(path)\n", - " primary_threshold_polygons_list.append(gdf)\n", - "\n", - "primary_threshold_polygons = 
pd.concat(primary_threshold_polygons_list, ignore_index=True)\n", - "_log.info(f\"Found {len(primary_threshold_polygons)} primary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "5e3fa531-851c-47f9-a9d4-87647b130da4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:04:30,620] {1380469088.py:1} INFO - Merging primary threshold waterbody polygons located at tile boundaries...\n", - "[2023-10-06 21:04:35,788] {1380469088.py:5} INFO - Primary threshold polygons count 58291.\n" - ] - } - ], - "source": [ - "_log.info(\"Merging primary threshold waterbody polygons located at tile boundaries...\")\n", - "primary_threshold_polygons_merged = merge_polygons_at_tile_boundaries(\n", - " primary_threshold_polygons, filtered_tiles_extents_gdf\n", - ")\n", - "_log.info(f\"Primary threshold polygons count {len(primary_threshold_polygons_merged)}.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "70430dbd-8ef5-46cb-8def-c9cf5c0dda35", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:04:35,792] {3770163632.py:1} INFO - Writing primary threshold polygons merged at tile boundaries to disk..\n", - "[2023-10-06 21:04:36,189] {3770163632.py:7} INFO - Polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile3/primary_threshold_polygons_merged_at_tile_boundaries.parquet\n" - ] - } - ], - "source": [ - "_log.info(\"Writing primary threshold polygons merged at tile boundaries to disk..\")\n", - "primary_threshold_polygons_output_fp = os.path.join(\n", - " output_directory, \"primary_threshold_polygons_merged_at_tile_boundaries.parquet\"\n", - ")\n", - "\n", - "primary_threshold_polygons_merged.to_parquet(primary_threshold_polygons_output_fp)\n", - "_log.info(f\"Polygons written to {primary_threshold_polygons_output_fp}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "70440491-216a-42d6-b26e-c943b918a6b1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:04:36,241] {1989755468.py:3} INFO - Found 23 parquet files for the secondary threshold polygons.\n" - ] - } - ], - "source": [ - "# Find all parquet files for the secondary threshold.\n", - "secondary_threshold_polygons_paths = find_parquet_files(path=polygons_from_thresholds_dir, pattern=\".*secondary.*\")\n", - "_log.info(f\"Found {len(secondary_threshold_polygons_paths)} parquet files for the secondary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "096b45e5-caf9-4df4-8d36-b96cfb733d14", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:04:36,247] {4278796675.py:2} INFO - Loading the secondary threshold polygons parquet files...\n", - "[2023-10-06 21:04:38,997] {4278796675.py:9} INFO - Found 113853 secondary threshold polygons.\n" - ] - } - ], - "source": [ - "# Load all the secondary threshold polygons into a single GeoDataFrame.\n", - "_log.info(\"Loading the secondary threshold polygons parquet files...\")\n", - "secondary_threshold_polygons_list = []\n", - "for path in secondary_threshold_polygons_paths:\n", - " gdf = gpd.read_parquet(path)\n", - " secondary_threshold_polygons_list.append(gdf)\n", - "\n", - "secondary_threshold_polygons = pd.concat(secondary_threshold_polygons_list, ignore_index=True)\n", - "_log.info(f\"Found 
{len(secondary_threshold_polygons)} secondary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "b010dca0-9357-4a94-936c-b36e3500e7f1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:04:39,002] {2540286757.py:1} INFO - Merging secondary threshold waterbody polygons located at dataset/scene boundaries...\n", - "[2023-10-06 21:04:52,572] {2540286757.py:5} INFO - Secondary threshold polygons count 113723.\n" - ] - } - ], - "source": [ - "_log.info(\"Merging secondary threshold waterbody polygons located at dataset/scene boundaries...\")\n", - "secondary_threshold_polygons_merged = merge_polygons_at_tile_boundaries(\n", - " secondary_threshold_polygons, filtered_tiles_extents_gdf\n", - ")\n", - "_log.info(f\"Secondary threshold polygons count {len(secondary_threshold_polygons_merged)}.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "703dc6e4-8ce0-48a3-851b-22355daed66c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:04:52,577] {1287271369.py:1} INFO - Writing secondary threshold polygons merged at tile boundaries to disk..\n", - "[2023-10-06 21:04:53,180] {1287271369.py:8} INFO - Polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile3/secondary_threshold_polygons_merged_at_ds_boundaries.parquet\n" - ] - } - ], - "source": [ - "_log.info(\"Writing secondary threshold polygons merged at tile boundaries to disk..\")\n", - "secondary_threshold_polygons_output_fp = os.path.join(\n", - " output_directory, \"secondary_threshold_polygons_merged_at_ds_boundaries.parquet\"\n", - ")\n", - "\n", - "secondary_threshold_polygons_merged.to_parquet(secondary_threshold_polygons_output_fp)\n", - "\n", - "_log.info(f\"Polygons written to {secondary_threshold_polygons_output_fp}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/1_123_GenerateWaterbodiesUsingTileswithOCeanFiltering.ipynb b/notebooks/1_123_GenerateWaterbodiesUsingTileswithOCeanFiltering.ipynb deleted file mode 100644 index 04a654a4..00000000 --- a/notebooks/1_123_GenerateWaterbodiesUsingTileswithOCeanFiltering.ipynb +++ /dev/null @@ -1,559 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "e6b0a31b-28d9-41d4-b59b-4ccb900e2528", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import os\n", - "\n", - "import click\n", - "import datacube\n", - "import fsspec\n", - "import geopandas as gpd\n", - "import pandas as pd\n", - "from deafrica_waterbodies.cli.logs import logging_setup\n", - "from deafrica_waterbodies.io import (\n", - " check_dir_exists,\n", - " check_file_exists,\n", - " check_if_s3_uri,\n", - " find_parquet_files,\n", - ")\n", - "from deafrica_waterbodies.make_polygons import (\n", - " check_wetness_thresholds,\n", - " get_polygons_from_tile_with_land_sea_mask_filtering,\n", - " merge_polygons_at_tile_boundaries\n", - ")\n", - "from deafrica_waterbodies.tiling import (\n", - " filter_tiles,\n", - " get_tiles_ids,\n", - " 
tile_wofs_ls_summary_alltime,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "39c574ee-39c4-4931-b807-5d3f97900cd2", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# These are the default AWS configurations for the Analysis Sandbox.\n", - "# that are set in the environmnet variables.\n", - "aws_default_config = {\n", - " # \"AWS_NO_SIGN_REQUEST\": \"YES\",\n", - " \"AWS_SECRET_ACCESS_KEY\": \"fake\",\n", - " \"AWS_ACCESS_KEY_ID\": \"fake\",\n", - "}\n", - "\n", - "# To access public bucket, need to remove the AWS credentials in\n", - "# the environment variables or the following error will occur.\n", - "# PermissionError: The AWS Access Key Id you provided does not exist in our records.\n", - "\n", - "for key in aws_default_config.keys():\n", - " if key in os.environ:\n", - " del os.environ[key]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "2e50efa3-b5ba-44b3-b164-608d17e6413e", - "metadata": {}, - "outputs": [], - "source": [ - "verbose = 1\n", - "\n", - "aoi_vector_file = \"data/SenegalBasin.geojson\"\n", - "tile_size_factor = 2\n", - "num_workers = 16\n", - "\n", - "primary_threshold: float = 0.1\n", - "secondary_threshold: float = 0.05\n", - "minimum_valid_observations: int = 128\n", - "output_directory = \"s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile4\"\n", - "overwrite = True\n", - "land_sea_mask_fp = \"data/af_msk_3s.tif\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e98a89fc-74fe-483b-89d5-b268c175d59c", - "metadata": {}, - "outputs": [], - "source": [ - "import xarray as xr\n", - "def filter_hydrosheds_land_mask(hydrosheds_land_mask: xr.DataArray) -> xr.DataArray:\n", - " \"\"\"\n", - " Function to filter the HydroSHEDs Land Mask into a boolean mask.\n", - " \"\"\"\n", - " # Indicator values: 1 = land, 2 = ocean sink, 3 = inland sink, 255 is no data.\n", - " boolean_mask = (hydrosheds_land_mask != 255) & (hydrosheds_land_mask != 2)\n", - " return boolean_mask" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "e2dfbcc2-fd39-4900-815c-09a11c448b8d", - "metadata": {}, - "outputs": [], - "source": [ - "# Set up logger.\n", - "logging_setup(verbose=verbose)\n", - "_log = logging.getLogger(__name__)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "9d0742d7-6ca2-4286-9313-3efda5173f38", - "metadata": {}, - "outputs": [], - "source": [ - "# Support pathlib Paths.\n", - "aoi_vector_file = str(aoi_vector_file)\n", - "output_directory = str(output_directory)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "18db4ad8-7193-47e4-a4b4-44085924a793", - "metadata": {}, - "outputs": [], - "source": [ - "# Parameters to use when loading datasets.\n", - "dask_chunks = {\"x\": 3200, \"y\": 3200, \"time\": 1}" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "95a0ca66-2695-42c8-968a-523151d514c0", - "metadata": {}, - "outputs": [], - "source": [ - "# Load the area of interest as a GeoDataFrame.\n", - "if aoi_vector_file is not None:\n", - " try:\n", - " aoi_gdf = gpd.read_file(aoi_vector_file)\n", - " except Exception as error:\n", - " _log.exception(f\"Could not read the file {aoi_vector_file}\")\n", - " raise error\n", - "else:\n", - " aoi_gdf = None" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "ea4b4349-4d9c-4e55-aae0-b661ccde8da0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:08:36,060] {tiling.py:113} INFO 
- New tile size is (192000.0, 192000.0).\n", - "[2023-10-06 21:08:38,617] {tiling.py:132} INFO - Number of wofs_ls_summary_alltime tiles: 1188\n" - ] - } - ], - "source": [ - "# Tile the wofs_ls_summary_alltime product.\n", - "tiles, grid_workflow = tile_wofs_ls_summary_alltime(tile_size_factor)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f134e90d-9c9a-47ca-86da-687846f2568c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1188it [00:03, 390.91it/s]\n" - ] - } - ], - "source": [ - "# Filter the tiles to the area of interest.\n", - "filtered_tile_ids = filter_tiles(tiles, aoi_gdf, num_workers)\n", - "filtered_tiles = {k: v for k, v in tiles.items() if k in filtered_tile_ids}" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "73816db0-7445-423a-a616-9966187a0ab6", - "metadata": {}, - "outputs": [], - "source": [ - "# Directory to write generated waterbody polygons to.\n", - "polygons_from_thresholds_dir = os.path.join(output_directory, \"polygons_from_thresholds\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "843a0c8b-9bc5-469e-8bc2-9a2a0588265b", - "metadata": {}, - "outputs": [], - "source": [ - "# Set the filesystem to use.\n", - "if check_if_s3_uri(polygons_from_thresholds_dir):\n", - " fs = fsspec.filesystem(\"s3\")\n", - "else:\n", - " fs = fsspec.filesystem(\"file\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "cbe40837-f598-4d7c-a3fd-6c1deb2b0589", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:08:41,978] {credentials.py:620} INFO - Found credentials in shared credentials file: ~/.aws/credentials\n", - "[2023-10-06 21:08:42,201] {3896921575.py:4} INFO - Created directory s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile4/polygons_from_thresholds\n" - ] - } - ], - "source": [ - "# Check if the directory exists. If it does not, create it.\n", - "if not check_dir_exists(polygons_from_thresholds_dir):\n", - " fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True)\n", - " _log.info(f\"Created directory {polygons_from_thresholds_dir}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "1c85a8d3-9543-4a57-90b8-3ac5e6cae71c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:08:42,205] {3616587049.py:3} INFO - We will be running a hybrid wetness threshold. 
\n", - "**You have set 0.1 as the primary threshold, which will define the location of the waterbody polygons \n", - " with 0.05 set as the supplementary threshold, which will define the extent/shape of the waterbody polygons.**\n" - ] - } - ], - "source": [ - "# Check if the wetness thresholds have been set correctly.\n", - "minimum_wet_thresholds = [secondary_threshold, primary_threshold]\n", - "_log.info(check_wetness_thresholds(minimum_wet_thresholds))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "26f37f02-ce60-43c9-9dc4-4c5298b15a81", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:08:42,212] {make_polygons.py:627} INFO - Generating water body polygons for tile (85, 49)\n", - "[2023-10-06 21:08:51,413] {make_polygons.py:627} INFO - Generating water body polygons for tile (85, 50)\n", - "[2023-10-06 21:08:57,968] {make_polygons.py:627} INFO - Generating water body polygons for tile (85, 46)\n", - "[2023-10-06 21:09:06,354] {make_polygons.py:627} INFO - Generating water body polygons for tile (85, 48)\n", - "[2023-10-06 21:09:15,018] {make_polygons.py:627} INFO - Generating water body polygons for tile (86, 47)\n", - "[2023-10-06 21:09:24,055] {make_polygons.py:627} INFO - Generating water body polygons for tile (86, 50)\n", - "[2023-10-06 21:09:30,576] {make_polygons.py:627} INFO - Generating water body polygons for tile (86, 48)\n", - "[2023-10-06 21:09:38,003] {make_polygons.py:627} INFO - Generating water body polygons for tile (86, 49)\n", - "[2023-10-06 21:09:45,036] {make_polygons.py:627} INFO - Generating water body polygons for tile (86, 46)\n", - "[2023-10-06 21:09:56,578] {make_polygons.py:627} INFO - Generating water body polygons for tile (87, 47)\n", - "[2023-10-06 21:10:38,041] {make_polygons.py:627} INFO - Generating water body polygons for tile (82, 49)\n", - "[2023-10-06 21:11:08,888] {make_polygons.py:627} INFO - Generating water body polygons for tile (82, 48)\n", - "[2023-10-06 21:11:17,872] {make_polygons.py:627} INFO - Generating water body polygons for tile (83, 48)\n", - "[2023-10-06 21:11:28,670] {make_polygons.py:627} INFO - Generating water body polygons for tile (83, 49)\n", - "[2023-10-06 21:11:52,870] {make_polygons.py:627} INFO - Generating water body polygons for tile (83, 47)\n", - "[2023-10-06 21:12:03,693] {make_polygons.py:627} INFO - Generating water body polygons for tile (84, 47)\n", - "[2023-10-06 21:12:11,475] {make_polygons.py:627} INFO - Generating water body polygons for tile (84, 48)\n", - "[2023-10-06 21:12:23,154] {make_polygons.py:627} INFO - Generating water body polygons for tile (84, 46)\n", - "[2023-10-06 21:12:33,559] {make_polygons.py:627} INFO - Generating water body polygons for tile (84, 45)\n", - "[2023-10-06 21:12:42,260] {make_polygons.py:627} INFO - Generating water body polygons for tile (84, 50)\n", - "[2023-10-06 21:12:49,233] {make_polygons.py:627} INFO - Generating water body polygons for tile (84, 49)\n", - "[2023-10-06 21:12:57,253] {make_polygons.py:627} INFO - Generating water body polygons for tile (85, 47)\n", - "[2023-10-06 21:13:06,321] {make_polygons.py:627} INFO - Generating water body polygons for tile (87, 48)\n" - ] - } - ], - "source": [ - "# Generate the first set of primary and secondary threhsold polygons for each of the tiles.\n", - "for tile in filtered_tiles.items():\n", - " tile_id = tile[0]\n", - " primary_threshold_polygons_fp = os.path.join(\n", - " polygons_from_thresholds_dir, 
f\"{tile_id[0]}_{tile_id[1]}_primary_threshold_polygons.parquet\"\n", - " )\n", - " secondary_threshold_polygons_fp = os.path.join(\n", - " polygons_from_thresholds_dir, f\"{tile_id[0]}_{tile_id[1]}_secondary_threshold_polygons.parquet\"\n", - " )\n", - "\n", - " if not overwrite:\n", - " _log.info(f\"Checking existence of {primary_threshold_polygons_fp} and {secondary_threshold_polygons_fp}\")\n", - " exists = check_file_exists(primary_threshold_polygons_fp) and check_file_exists(secondary_threshold_polygons_fp)\n", - "\n", - " if overwrite or not exists:\n", - " (\n", - " primary_threshold_polygons,\n", - " secondary_threshold_polygons,\n", - " ) = get_polygons_from_tile_with_land_sea_mask_filtering(\n", - " tile=tile,\n", - " grid_workflow=grid_workflow,\n", - " dask_chunks=dask_chunks,\n", - " min_valid_observations=minimum_valid_observations,\n", - " primary_threshold=primary_threshold,\n", - " secondary_threshold=secondary_threshold,\n", - " land_sea_mask_fp=land_sea_mask_fp,\n", - " filter_land_sea_mask=filter_hydrosheds_land_mask,\n", - " )\n", - " # Write the polygons to parquet files.\n", - " primary_threshold_polygons.to_parquet(primary_threshold_polygons_fp)\n", - " secondary_threshold_polygons.to_parquet(secondary_threshold_polygons_fp)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "27d552b8-32fd-427c-835b-92245f627d92", - "metadata": {}, - "outputs": [], - "source": [ - "# Get the extents for each tile.\n", - "crs = grid_workflow.grid_spec.crs\n", - "filtered_tiles_extents_geoms = [tile[1].geobox.extent.geom for tile in filtered_tiles.items()]\n", - "filtered_tiles_extents_gdf = gpd.GeoDataFrame(geometry=filtered_tiles_extents_geoms, crs=crs)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "5ab0c48e-feeb-47c0-a13b-fbe3ee940ad7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:13:18,301] {2643269420.py:3} INFO - Found 23 parquet files for the primary threshold polygons.\n" - ] - } - ], - "source": [ - "# Find all parquet files for the primary threshold.\n", - "primary_threshold_polygons_paths = find_parquet_files(path=polygons_from_thresholds_dir, pattern=\".*primary.*\")\n", - "_log.info(f\"Found {len(primary_threshold_polygons_paths)} parquet files for the primary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "bfeb0c77-93d9-4cc6-a2d2-2542875abf71", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:13:18,305] {370454188.py:2} INFO - Loading the primary threshold polygons parquet files..\n", - "[2023-10-06 21:13:20,706] {370454188.py:9} INFO - Found 58346 primary threshold polygons.\n" - ] - } - ], - "source": [ - "# Load all the primary threshold polygons into a single GeoDataFrame.\n", - "_log.info(\"Loading the primary threshold polygons parquet files..\")\n", - "primary_threshold_polygons_list = []\n", - "for path in primary_threshold_polygons_paths:\n", - " gdf = gpd.read_parquet(path)\n", - " primary_threshold_polygons_list.append(gdf)\n", - "\n", - "primary_threshold_polygons = pd.concat(primary_threshold_polygons_list, ignore_index=True)\n", - "_log.info(f\"Found {len(primary_threshold_polygons)} primary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "5e3fa531-851c-47f9-a9d4-87647b130da4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 
21:13:20,710] {1380469088.py:1} INFO - Merging primary threshold waterbody polygons located at tile boundaries...\n", - "[2023-10-06 21:13:25,592] {1380469088.py:5} INFO - Primary threshold polygons count 58265.\n" - ] - } - ], - "source": [ - "_log.info(\"Merging primary threshold waterbody polygons located at tile boundaries...\")\n", - "primary_threshold_polygons_merged = merge_polygons_at_tile_boundaries(\n", - " primary_threshold_polygons, filtered_tiles_extents_gdf\n", - ")\n", - "_log.info(f\"Primary threshold polygons count {len(primary_threshold_polygons_merged)}.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "70430dbd-8ef5-46cb-8def-c9cf5c0dda35", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:13:25,597] {3770163632.py:1} INFO - Writing primary threshold polygons merged at tile boundaries to disk..\n", - "[2023-10-06 21:13:26,024] {3770163632.py:7} INFO - Polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile4/primary_threshold_polygons_merged_at_tile_boundaries.parquet\n" - ] - } - ], - "source": [ - "_log.info(\"Writing primary threshold polygons merged at tile boundaries to disk..\")\n", - "primary_threshold_polygons_output_fp = os.path.join(\n", - " output_directory, \"primary_threshold_polygons_merged_at_tile_boundaries.parquet\"\n", - ")\n", - "\n", - "primary_threshold_polygons_merged.to_parquet(primary_threshold_polygons_output_fp)\n", - "_log.info(f\"Polygons written to {primary_threshold_polygons_output_fp}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "70440491-216a-42d6-b26e-c943b918a6b1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:13:26,060] {1989755468.py:3} INFO - Found 23 parquet files for the secondary threshold polygons.\n" - ] - } - ], - "source": [ - "# Find all parquet files for the secondary threshold.\n", - "secondary_threshold_polygons_paths = find_parquet_files(path=polygons_from_thresholds_dir, pattern=\".*secondary.*\")\n", - "_log.info(f\"Found {len(secondary_threshold_polygons_paths)} parquet files for the secondary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "096b45e5-caf9-4df4-8d36-b96cfb733d14", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:13:26,065] {4278796675.py:2} INFO - Loading the secondary threshold polygons parquet files...\n", - "[2023-10-06 21:13:28,841] {4278796675.py:9} INFO - Found 113781 secondary threshold polygons.\n" - ] - } - ], - "source": [ - "# Load all the secondary threshold polygons into a single GeoDataFrame.\n", - "_log.info(\"Loading the secondary threshold polygons parquet files...\")\n", - "secondary_threshold_polygons_list = []\n", - "for path in secondary_threshold_polygons_paths:\n", - " gdf = gpd.read_parquet(path)\n", - " secondary_threshold_polygons_list.append(gdf)\n", - "\n", - "secondary_threshold_polygons = pd.concat(secondary_threshold_polygons_list, ignore_index=True)\n", - "_log.info(f\"Found {len(secondary_threshold_polygons)} secondary threshold polygons.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "b010dca0-9357-4a94-936c-b36e3500e7f1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:13:28,845] {2540286757.py:1} INFO - Merging secondary threshold waterbody polygons located at 
dataset/scene boundaries...\n", - "[2023-10-06 21:13:41,709] {2540286757.py:5} INFO - Secondary threshold polygons count 113649.\n" - ] - } - ], - "source": [ - "_log.info(\"Merging secondary threshold waterbody polygons located at dataset/scene boundaries...\")\n", - "secondary_threshold_polygons_merged = merge_polygons_at_tile_boundaries(\n", - " secondary_threshold_polygons, filtered_tiles_extents_gdf\n", - ")\n", - "_log.info(f\"Secondary threshold polygons count {len(secondary_threshold_polygons_merged)}.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "703dc6e4-8ce0-48a3-851b-22355daed66c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 21:13:41,714] {1287271369.py:1} INFO - Writing secondary threshold polygons merged at tile boundaries to disk..\n", - "[2023-10-06 21:13:42,312] {1287271369.py:8} INFO - Polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile4/secondary_threshold_polygons_merged_at_ds_boundaries.parquet\n" - ] - } - ], - "source": [ - "_log.info(\"Writing secondary threshold polygons merged at tile boundaries to disk..\")\n", - "secondary_threshold_polygons_output_fp = os.path.join(\n", - " output_directory, \"secondary_threshold_polygons_merged_at_ds_boundaries.parquet\"\n", - ")\n", - "\n", - "secondary_threshold_polygons_merged.to_parquet(secondary_threshold_polygons_output_fp)\n", - "\n", - "_log.info(f\"Polygons written to {secondary_threshold_polygons_output_fp}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/01_GetDatasetIDs.ipynb b/notebooks/ApplyFilters.ipynb similarity index 50% rename from notebooks/01_GetDatasetIDs.ipynb rename to notebooks/ApplyFilters.ipynb index cc550336..6a75bcd8 100644 --- a/notebooks/01_GetDatasetIDs.ipynb +++ b/notebooks/ApplyFilters.ipynb @@ -3,29 +3,26 @@ { "cell_type": "code", "execution_count": 1, - "id": "727a0f08-2a9c-471c-bbea-346bf528c856", + "id": "35b518b7-59c1-42e5-8eca-4e6831a43018", "metadata": {}, "outputs": [], "source": [ "import logging\n", "\n", - "import click\n", - "import fsspec\n", "import geopandas as gpd\n", + "\n", "from deafrica_waterbodies.cli.logs import logging_setup\n", - "from deafrica_waterbodies.datasets import get_datasets_ids\n", - "from deafrica_waterbodies.io import check_if_s3_uri" + "from deafrica_waterbodies.filters import filter_using_land_sea_mask" ] }, { "cell_type": "code", "execution_count": 2, - "id": "25b37fcb-eb02-4989-8794-326ed93a37b1", + "id": "597d7da8-d56e-472b-96c6-002f430e5e0f", "metadata": {}, "outputs": [], "source": [ "import os\n", - "\n", "# These are the default AWS configurations for the Analysis Sandbox.\n", "# that are set in the environmnet variables.\n", "aws_default_config = {\n", @@ -46,114 +43,94 @@ { "cell_type": "code", "execution_count": 3, - "id": "dd9261b5-77ff-4b4d-884c-609771c2a656", + "id": "6de91524-3642-426e-8c86-711e4487cad4", "metadata": {}, "outputs": [], "source": [ "verbose = 1\n", - "aoi_vector_file = \"data/SenegalBasin.geojson\"\n", - "num_workers = 16\n", - "dataset_ids_text_file = 
\"s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/dataset_ids.txt\"" + "output_directory = \"s3://deafrica-waterbodies-dev/0-0-1/shapefile/\"\n", + "polygons_vector_file = \"s3://deafrica-waterbodies-dev/0-0-1/shapefile/senegalbasinwaterbodiesv0_polygons_within_polygons_removed.parquet\"\n", + "land_sea_mask_fp = \"data/goas_v01.shp\"" ] }, { "cell_type": "code", "execution_count": 4, - "id": "6d698093-e0b7-4c3c-bcca-e630bddcce73", + "id": "98d95ab9-904b-4a2a-b2b3-5cd0dc1428eb", "metadata": {}, "outputs": [], "source": [ "# Set up logger.\n", - "logging_setup(verbose=verbose)\n", + "logging_setup(verbose=1)\n", "_log = logging.getLogger(__name__)" ] }, { "cell_type": "code", "execution_count": 5, - "id": "268c4be3-1301-4d39-bd32-71d64a31bb38", - "metadata": {}, - "outputs": [], - "source": [ - "# Support pathlib Paths.\n", - "aoi_vector_file = str(aoi_vector_file)\n", - "dataset_ids_text_file = str(dataset_ids_text_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "00bf86bf-eb0f-4b7b-9b19-dee84fdac498", + "id": "590920e7-7c91-4c14-8060-53395dc61214", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-19 20:38:52,414] {3539245125.py:2} INFO - Loading polygons...\n", + "[2023-10-19 20:39:00,883] {3539245125.py:11} INFO - Polygons count 35009.\n" + ] + } + ], "source": [ - "# Load the area of interest as a GeoDataFrame.\n", - "if aoi_vector_file is not None:\n", - " try:\n", - " aoi_gdf = gpd.read_file(aoi_vector_file)\n", - " except Exception as error:\n", - " _log.exception(f\"Could not read the file {aoi_vector_file}\")\n", - " raise error\n", + "# Load the polygons\n", + "_log.info(\"Loading polygons...\")\n", + "\n", + "try:\n", + " polygons_gdf = gpd.read_file(polygons_vector_file)\n", + "except Exception as error:\n", + " _log.exception(f\"Could not read file {polygons_vector_file}\")\n", + " _log.error(error)\n", + " raise error\n", "else:\n", - " aoi_gdf = None" + " _log.info(f\"Polygons count {len(polygons_gdf)}.\")" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "858ba36e-e717-44da-ba56-a7dc6ebc90c5", + "execution_count": 6, + "id": "3e3780dc-3cc6-459d-9171-d52c56999b01", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "4461it [00:07, 586.18it/s]\n" + "[2023-10-19 20:39:00,890] {filters.py:159} INFO - Filtering out ocean polygons from the water body polygons...\n", + "[2023-10-19 20:39:56,842] {filters.py:174} INFO - Filtered out 1111 water body polygons.\n" ] } ], "source": [ - "# Get the WOfS All Time Summary scene ids for the scenes whose extent\n", - "# intersects with the area of interest.\n", - "dataset_ids = get_datasets_ids(aoi_gdf=aoi_gdf, num_workers=num_workers)" + "filtered_polygons_gdf = filter_using_land_sea_mask(polygons_gdf, land_sea_mask_fp)" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "a55a9af2-cff4-4843-8230-ab35cf897faa", + "execution_count": 7, + "id": "ca5d977c-157f-4941-8417-7063886284fb", "metadata": {}, "outputs": [], "source": [ - "# Set the filesystem to use.\n", - "if check_if_s3_uri(dataset_ids_text_file):\n", - " fs = fsspec.filesystem(\"s3\")\n", - "else:\n", - " fs = fsspec.filesystem(\"file\")" + "filtered_polygons_fp = os.path.join(output_directory, \"filtered_using_land_sea_mask.parquet\")" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "a3323f56-f442-40d9-a2fd-0927a1d149f4", + "execution_count": 8, + "id": "e128cae9-b2c2-46bb-9339-223d94e26273", 
"metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2023-10-06 16:39:44,855] {credentials.py:620} INFO - Found credentials in shared credentials file: ~/.aws/credentials\n", - "[2023-10-06 16:39:45,022] {2876337735.py:6} INFO - Dataset IDs written to: s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/dataset_ids.txt.\n" - ] - } - ], + "outputs": [], "source": [ - "# Write the dataset ids to the text file.\n", - "with fs.open(dataset_ids_text_file, \"w\") as file:\n", - " for dataset_id in dataset_ids:\n", - " file.write(f\"{dataset_id}\\n\")\n", - "\n", - "_log.info(f\"Dataset IDs written to: {dataset_ids_text_file}.\")" + "filtered_polygons_gdf.to_parquet(filtered_polygons_fp)" ] } ], diff --git a/notebooks/FixHolesInPolygons.ipynb b/notebooks/FixHolesInPolygons.ipynb new file mode 100644 index 00000000..7c6e58e9 --- /dev/null +++ b/notebooks/FixHolesInPolygons.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "cdabd764-0960-4bcd-84f5-e007852c3869", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import math\n", + "import os\n", + "\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "from deafrica_waterbodies.cli.logs import logging_setup\n", + "from deafrica_waterbodies.filters import fill_holes, filter_by_area" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3af67100-ff19-440f-b6e5-fe5724244ee5", + "metadata": {}, + "outputs": [], + "source": [ + "# These are the default AWS configurations for the Analysis Sandbox.\n", + "# that are set in the environmnet variables.\n", + "aws_default_config = {\n", + " # \"AWS_NO_SIGN_REQUEST\": \"YES\",\n", + " \"AWS_SECRET_ACCESS_KEY\": \"fake\",\n", + " \"AWS_ACCESS_KEY_ID\": \"fake\",\n", + "}\n", + "\n", + "# To access public bucket, need to remove the AWS credentials in\n", + "# the environment variables or the following error will occur.\n", + "# PermissionError: The AWS Access Key Id you provided does not exist in our records.\n", + "\n", + "for key in aws_default_config.keys():\n", + " if key in os.environ:\n", + " del os.environ[key]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7acc8246-b844-4e92-bc15-204bb54702e8", + "metadata": {}, + "outputs": [], + "source": [ + "verbose = 1\n", + "output_directory = \"s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental\"\n", + "# output_directory = \"s3://deafrica-waterbodies-dev/0-0-1/shapefile/\"\n", + "large_polygons_threshold = 10**6" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "32a30849-c86a-4b8a-947c-cc12245d618f", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up logger.\n", + "logging_setup(verbose=verbose)\n", + "_log = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1670f7c9-b5bf-4c41-b1c7-150da3728985", + "metadata": {}, + "outputs": [], + "source": [ + "# Support pathlib paths.\n", + "output_directory = str(output_directory)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e1a0e9e0-2af9-473c-bdd2-65b4d64032f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-23 11:19:40,565] {3544693789.py:2} INFO - Loading polygons...\n", + "[2023-10-23 11:19:51,472] {3544693789.py:11} INFO - Raster polygons count 1075799.\n" + ] + } + ], + "source": [ + "# Load the raster polygons\n", + "_log.info(\"Loading polygons...\")\n", + "\n", + 
"raster_polygons_fp = os.path.join(output_directory, \"raster_polygons_merged_at_tile_boundaries.parquet\")\n", + "\n", + "raster_polygons = gpd.read_parquet(raster_polygons_fp)\n", + "\n", + "# Drop the attributes column if it exists.\n", + "raster_polygons.drop(columns=[\"attribute\"], errors=\"ignore\", inplace=True)\n", + "\n", + "_log.info(f\"Raster polygons count {len(raster_polygons)}.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d1fd7ad2-f1c6-4f20-95ab-30a9ebc1bbd8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-23 11:19:51,477] {filters.py:117} INFO - Filtering 1075799 polygons by minimum area 1000000 and max area inf...\n", + "[2023-10-23 11:19:51,796] {filters.py:130} INFO - Filtered out 1056504 polygons.\n", + "[2023-10-23 11:19:51,796] {640283564.py:3} INFO - Count for polygons larger than 1000000 m2: 19295\n" + ] + } + ], + "source": [ + "# Identify the large polygons.\n", + "large_polygons = filter_by_area(raster_polygons, min_polygon_size=large_polygons_threshold, max_polygon_size=math.inf)\n", + "_log.info(f\"Count for polygons larger than {large_polygons_threshold} m2: {len(large_polygons)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5e1dac6a-56d6-40b7-aaa5-e618ea000982", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-23 11:19:51,929] {2675327616.py:4} INFO - 19295 large polygons removed from raster polygons.\n" + ] + } + ], + "source": [ + "# Remove the large polygons from the raster polygons.\n", + "large_polygons_idx = large_polygons.index.values\n", + "raster_polygons_large_removed = raster_polygons.drop(index=large_polygons_idx)\n", + "_log.info(f\"{len(raster_polygons) - len(raster_polygons_large_removed)} large polygons removed from raster polygons.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "54410f6f-3ff7-457a-9e56-156128815a74", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-23 11:19:51,934] {:2} INFO - Filling holes in large polygons...\n", + "CPU times: user 17.8 s, sys: 353 ms, total: 18.2 s\n", + "Wall time: 18.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Fill holes in the large polygons.\n", + "_log.info(\"Filling holes in large polygons...\")\n", + "large_polygons.geometry = large_polygons.geometry.apply(lambda p: fill_holes(p))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d89bc5d1-6a14-4450-b78a-4fd801aa576f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-23 11:20:10,201] {3685114858.py:6} INFO - Polygon count after filling holes in large polygons 1075799.\n" + ] + } + ], + "source": [ + "# Add back in the large polygons with holes filled.\n", + "raster_polygons_with_holes_filled = pd.concat([raster_polygons_large_removed, large_polygons], ignore_index=True)\n", + "\n", + "raster_polygons_with_holes_filled.drop(columns=[\"area_m2\"], errors=\"ignore\", inplace=True)\n", + "\n", + "_log.info(f\"Polygon count after filling holes in large polygons {len(raster_polygons_with_holes_filled)}.\")\n", + "raster_polygons_with_holes_filled.to_parquet(os.path.join(output_directory, \"raster_polygons_with_holes_filled.parquet\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { 
+ "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/GeneratePolygonsUsingTiles.ipynb b/notebooks/GeneratePolygonsUsingTiles.ipynb new file mode 100644 index 00000000..b7a1592b --- /dev/null +++ b/notebooks/GeneratePolygonsUsingTiles.ipynb @@ -0,0 +1,559 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "b607bfb4-872e-4e69-8be2-cbc7a6c686e7", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# These are the default AWS configurations for the Analysis Sandbox.\n", + "# that are set in the environmnet variables.\n", + "aws_default_config = {\n", + " # \"AWS_NO_SIGN_REQUEST\": \"YES\",\n", + " \"AWS_SECRET_ACCESS_KEY\": \"fake\",\n", + " \"AWS_ACCESS_KEY_ID\": \"fake\",\n", + "}\n", + "\n", + "# To access public bucket, need to remove the AWS credentials in\n", + "# the environment variables or the following error will occur.\n", + "# PermissionError: The AWS Access Key Id you provided does not exist in our records.\n", + "\n", + "for key in aws_default_config.keys():\n", + " if key in os.environ:\n", + " del os.environ[key]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c399025b-61f8-40bf-9c0f-5619e8a8b605", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import math\n", + "import os\n", + "from importlib import import_module\n", + "\n", + "import click\n", + "import fsspec\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "from deafrica_waterbodies.attributes import (\n", + " add_polygon_properties,\n", + " add_timeseries_attribute,\n", + " assign_unique_ids,\n", + ")\n", + "from deafrica_waterbodies.cli.logs import logging_setup\n", + "from deafrica_waterbodies.filters import filter_by_area, filter_by_length\n", + "from deafrica_waterbodies.group_polygons import split_polygons_by_region\n", + "from deafrica_waterbodies.io import (\n", + " check_dir_exists,\n", + " check_file_exists,\n", + " check_if_s3_uri,\n", + " find_parquet_files,\n", + " write_waterbodies_to_file,\n", + ")\n", + "from deafrica_waterbodies.make_polygons import (\n", + " merge_polygons_at_tile_boundaries,\n", + " process_raster_polygons,\n", + " set_wetness_thresholds,\n", + ")\n", + "from deafrica_waterbodies.plugins.utils import run_plugin, validate_plugin\n", + "from deafrica_waterbodies.tiling import get_wofs_ls_summary_alltime_tiles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c92d45c-5adc-4fc9-8c40-b216f3a005a9", + "metadata": {}, + "outputs": [], + "source": [ + "verbose = 1\n", + "aoi_vector_file=None\n", + "tile_size_factor=4\n", + "num_workers = 8\n", + "detection_threshold = 0.1\n", + "extent_threshold = 0.05\n", + "min_valid_observations = 60\n", + "raster_processing_plugin_name = \"ocean_filtering_using_hydrosheds\"\n", + "output_directory = \"s3://deafrica-waterbodies-dev/waterbodies/v0.0.2\"\n", + "overwrite = \"True\"\n", + "min_polygon_size = 4500\n", + "max_polygon_size = math.inf\n", + "timeseries_directory = (\n", + " \"s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/surface_area_change/\"\n", + ")\n", + "file_name_prefix = \"waterbodies\"\n", + "land_sea_mask_fp = 
\"/g/data/deafrica-waterbodies/masks/af_msk_3s.tif\"\n", + "group_by_wofs_ls_regions = True\n", + "length_threshold_km = 150" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7132d9d-f150-48d5-90f7-b83ada1758c0", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up logger.\n", + "logging_setup(verbose=verbose)\n", + "_log = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "184184d8-1921-4f70-b9d6-3c7795a5e78e", + "metadata": {}, + "outputs": [], + "source": [ + "# Parameters to use when loading datasetspolygons_split_by_region_dir.\n", + "# Chunks selected based on size of WOfs scene.\n", + "dask_chunks = {\"x\": 3200, \"y\": 3200, \"time\": 1}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f092862a-38d5-43b1-8717-e150bc2602e6", + "metadata": {}, + "outputs": [], + "source": [ + "# Support pathlib Paths.\n", + "if aoi_vector_file is not None:\n", + " aoi_vector_file = str(aoi_vector_file)\n", + "\n", + "output_directory = str(output_directory)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4905772e-acaa-4452-a686-e9e23f1f9192", + "metadata": {}, + "outputs": [], + "source": [ + "# Directory to write outputs from intermediate steps\n", + "intermediate_outputs_dir = os.path.join(output_directory, \"intermediate_outputs\")\n", + "# Directory to write generated first set of waterbody polygons to.\n", + "polygons_from_thresholds_dir = os.path.join(\n", + " intermediate_outputs_dir, \"polygons_from_thresholds\"\n", + ")\n", + "# Directory to write final output.\n", + "final_outputs_dir = os.path.join(output_directory, \"historical_extent\")\n", + "# Directory to store polygons split by region.\n", + "polygons_split_by_region_dir = os.path.join(\n", + " output_directory, \"historical_extent_split_by_wofs_region\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42cc868e-9542-48a3-947a-0938ec225fe4", + "metadata": {}, + "outputs": [], + "source": [ + "# Set the filesystem to use.\n", + "if check_if_s3_uri(output_directory):\n", + " fs = fsspec.filesystem(\"s3\")\n", + "else:\n", + " fs = fsspec.filesystem(\"file\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c9169aa-aa7d-47ac-b5c9-60e132a49667", + "metadata": {}, + "outputs": [], + "source": [ + "if not check_dir_exists(intermediate_outputs_dir):\n", + " fs.mkdirs(intermediate_outputs_dir, exist_ok=True)\n", + " _log.info(f\"Created directory {intermediate_outputs_dir}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41e53818-e4c4-45cb-bfa4-8ef3c238e048", + "metadata": {}, + "outputs": [], + "source": [ + "if not check_dir_exists(polygons_from_thresholds_dir):\n", + " fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True)\n", + " _log.info(f\"Created directory {polygons_from_thresholds_dir}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4eded234-9771-415d-b4fc-e19936c1f341", + "metadata": {}, + "outputs": [], + "source": [ + "if not check_dir_exists(final_outputs_dir):\n", + " fs.mkdirs(final_outputs_dir, exist_ok=True)\n", + " _log.info(f\"Created directory {final_outputs_dir}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8aec2828-a15e-47cd-8d5b-c6e28346d0d9", + "metadata": {}, + "outputs": [], + "source": [ + "if group_by_wofs_ls_regions:\n", + " if not check_dir_exists(polygons_split_by_region_dir):\n", + " fs.mkdirs(polygons_split_by_region_dir, exist_ok=True)\n", 
+ " _log.info(f\"Created directory {polygons_split_by_region_dir}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75aa5e30-f4a3-4e88-8e55-4fcab4de580f", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the area of interest as a GeoDataFrame.\n", + "if aoi_vector_file is not None:\n", + " try:\n", + " aoi_gdf = gpd.read_file(aoi_vector_file)\n", + " except Exception as error:\n", + " _log.exception(f\"Could not read the file {aoi_vector_file}\")\n", + " raise error\n", + "else:\n", + " aoi_gdf = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7569db1-2ad2-40dc-8b58-97c2fdc0c574", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the tiles fo the wofs_ls_summary_alltime product.\n", + "tiles, grid_workflow = get_wofs_ls_summary_alltime_tiles(\n", + " aoi_gdf=aoi_gdf, tile_size_factor=tile_size_factor, num_workers=num_workers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a97737d0-e14d-456a-9167-3960106552d4", + "metadata": {}, + "outputs": [], + "source": [ + "# Set the wetness thresholds.\n", + "min_wet_thresholds = set_wetness_thresholds(\n", + " detection_threshold=detection_threshold, extent_threshold=extent_threshold\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c06fc4c1-8ad0-46db-b5e8-45bf695ad7e6", + "metadata": {}, + "outputs": [], + "source": [ + "# Set filters to apply during raster processing.\n", + "if raster_processing_plugin_name is not None:\n", + " # Read the plugin as a Python module.\n", + " module = import_module(\n", + " f\"deafrica_waterbodies.plugins.{raster_processing_plugin_name}\"\n", + " )\n", + " plugin_file = module.__file__\n", + " plugin = run_plugin(plugin_file)\n", + " _log.info(f\"Using plugin {plugin_file}\")\n", + " validate_plugin(plugin)\n", + "else:\n", + " plugin = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b3954c2-b769-4a8c-ac55-b3f1c9242ccb", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate the first set of polygons for each of the tiles.\n", + "for tile in tiles.items():\n", + " tile_id = tile[0]\n", + " raster_polygons_fp = os.path.join(\n", + " polygons_from_thresholds_dir,\n", + " f\"{tile_id[0]}_{tile_id[1]}_raster_polygons.parquet\",\n", + " )\n", + "\n", + " if not overwrite:\n", + " _log.info(f\"Checking existence of {raster_polygons_fp}\")\n", + " exists = check_file_exists(raster_polygons_fp)\n", + " if exists:\n", + " _log.info(\n", + " f\"{raster_polygons_fp} exists! 
\\n Skipping generating water body polygons for {tile_id}.\"\n", + " )\n", + "\n", + " if overwrite or not exists:\n", + " try:\n", + " _log.info(f\"Generating water body polygons for tile {tile_id}.\")\n", + " raster_polygons = process_raster_polygons(\n", + " tile=tile,\n", + " grid_workflow=grid_workflow,\n", + " plugin=plugin,\n", + " dask_chunks=dask_chunks,\n", + " min_valid_observations=min_valid_observations,\n", + " min_wet_thresholds=min_wet_thresholds,\n", + " land_sea_mask_fp=land_sea_mask_fp,\n", + " )\n", + " if raster_polygons.empty:\n", + " _log.info(f\"Tile {str(tile_id)} contains no water body polygons.\")\n", + " else:\n", + " # Drop the attributes column if it exists.\n", + " raster_polygons.drop(\n", + " columns=[\"attribute\"], errors=\"ignore\", inplace=True\n", + " )\n", + " # Write the polygons to parquet files.\n", + " raster_polygons.to_parquet(raster_polygons_fp)\n", + " _log.info(\n", + " f\"Tile {str(tile_id)} water body polygons written to {raster_polygons_fp}\"\n", + " )\n", + " except Exception as error:\n", + " _log.exception(f\"\\nTile {str(tile_id)} did not run. \\n\")\n", + " _log.exception(error)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0768855d-de9f-4397-bdec-7c5264449645", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the extent for each tile.\n", + "crs = grid_workflow.grid_spec.crs\n", + "tile_ids = [tile[0] for tile in tiles.items()]\n", + "tile_extents_geoms = [tile[1].geobox.extent.geom for tile in tiles.items()]\n", + "tile_extents_gdf = gpd.GeoDataFrame(\n", + " {\"tile_id\": tile_ids, \"geometry\": tile_extents_geoms}, crs=crs\n", + ")\n", + "\n", + "tile_extents_fp = os.path.join(intermediate_outputs_dir, \"tile_boundaries.parquet\")\n", + "\n", + "tile_extents_gdf.to_parquet(tile_extents_fp)\n", + "_log.info(f\"Tile boundaries written to {tile_extents_fp}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37caa0c7-73fa-4c5c-8c89-b9c597aa4bd2", + "metadata": {}, + "outputs": [], + "source": [ + "# Find all parquet files for the first set of polygons.\n", + "raster_polygon_paths = find_parquet_files(\n", + " path=polygons_from_thresholds_dir, pattern=\".*raster_polygons.*\"\n", + ")\n", + "_log.info(f\"Found {len(raster_polygon_paths)} parquet files for the raster polygons.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc8b2839-476e-45e1-9353-35f27bc3bc39", + "metadata": {}, + "outputs": [], + "source": [ + "# Load all polygons into a single GeoDataFrame.\n", + "_log.info(\"Loading the raster polygons parquet files..\")\n", + "raster_polygon_polygons_list = []\n", + "for path in raster_polygon_paths:\n", + " gdf = gpd.read_parquet(path)\n", + " raster_polygon_polygons_list.append(gdf)\n", + "\n", + "raster_polygons = pd.concat(raster_polygon_polygons_list, ignore_index=True)\n", + "_log.info(f\"Found {len(raster_polygons)} raster polygons.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fa2c1c1-b91c-4123-8001-45e26846d9d5", + "metadata": {}, + "outputs": [], + "source": [ + "_log.info(\"Merging raster waterbody polygons located at tile boundaries...\")\n", + "raster_polygons_merged = merge_polygons_at_tile_boundaries(\n", + " raster_polygons, tile_extents_gdf\n", + ")\n", + "# Drop the attributes column if it exists.\n", + "raster_polygons_merged.drop(columns=[\"attribute\"], errors=\"ignore\", inplace=True)\n", + "_log.info(\n", + " f\"Raster polygons count after merging polygons at tile boundaries 
{len(raster_polygons_merged)}.\"\n", + ")\n", + "\n", + "_log.info(\"Writing raster polygons merged at tile boundaries to disk..\")\n", + "raster_polygons_merged_fp = os.path.join(\n", + " intermediate_outputs_dir, \"raster_polygons_merged_at_tile_boundaries.parquet\"\n", + ")\n", + "raster_polygons_merged.to_parquet(raster_polygons_merged_fp)\n", + "_log.info(f\"Polygons written to {raster_polygons_merged_fp}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aca91459-e7ae-4c69-9b74-52b1d87b1e94", + "metadata": {}, + "outputs": [], + "source": [ + "# Delete to conserve memeory\n", + "del raster_polygons\n", + "del tile_extents_gdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea283da0-cbd9-41ee-bacc-0529b5167c49", + "metadata": {}, + "outputs": [], + "source": [ + "# Filter the polygons by area.\n", + "area_filtered_raster_polygons = filter_by_area(\n", + " raster_polygons_merged,\n", + " min_polygon_size=min_polygon_size,\n", + " max_polygon_size=max_polygon_size,\n", + ")\n", + "area_filtered_raster_polygons.to_parquet(\n", + " os.path.join(intermediate_outputs_dir, \"area_filtered_raster_polygons.parquet\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91fa202b-4702-4910-8d03-7233fbbc01ec", + "metadata": {}, + "outputs": [], + "source": [ + "waterbodies_gdf = assign_unique_ids(\n", + " polygons=area_filtered_raster_polygons, precision=10\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27132243-4a0b-40f1-8d90-005c0e42b6d1", + "metadata": {}, + "outputs": [], + "source": [ + "waterbodies_gdf = add_polygon_properties(polygons=waterbodies_gdf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a65a786-54e3-4c05-be9a-0e6b941b3442", + "metadata": {}, + "outputs": [], + "source": [ + "waterbodies_gdf = add_timeseries_attribute(\n", + " polygons=waterbodies_gdf,\n", + " timeseries_directory=timeseries_directory,\n", + " region_code=\"af-south-1\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c6cb3e4-c8de-4154-8d07-222c0fec33d8", + "metadata": {}, + "outputs": [], + "source": [ + "# Reproject to EPSG:4326\n", + "waterbodies_gdf_4326 = waterbodies_gdf.to_crs(\"EPSG:4326\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "320e6d92-7f8a-4c8e-81f5-c724c6b8eb7e", + "metadata": {}, + "outputs": [], + "source": [ + "# Write to disk.\n", + "write_waterbodies_to_file(\n", + " waterbodies_gdf=waterbodies_gdf_4326,\n", + " output_directory=final_outputs_dir,\n", + " file_name_prefix=file_name_prefix,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa359c39-de0c-4ad6-ad88-dd516ad62972", + "metadata": {}, + "outputs": [], + "source": [ + "waterbodies_gdf_4326.to_parquet(\n", + " os.path.join(final_outputs_dir, f\"{file_name_prefix}.parquet\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71f6e840-511c-4a81-9b0e-51c406639828", + "metadata": {}, + "outputs": [], + "source": [ + "if group_by_wofs_ls_regions:\n", + " waterbodies_gdf_4326 = filter_by_length(\n", + " polygons_gdf=waterbodies_gdf_4326, length_threshold_km=length_threshold_km\n", + ")\n", + " \n", + " split_by_region_fps = split_polygons_by_region(\n", + " polygons_gdf=waterbodies_gdf_4326,\n", + " output_directory=polygons_split_by_region_dir,\n", + " product=\"wofs_ls\",\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/HandleLargePolygons.ipynb b/notebooks/HandleLargePolygons.ipynb new file mode 100644 index 00000000..23147933 --- /dev/null +++ b/notebooks/HandleLargePolygons.ipynb @@ -0,0 +1,238 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "cdabd764-0960-4bcd-84f5-e007852c3869", + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "import logging\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + " \n", + "from deafrica_waterbodies.cli.logs import logging_setup\n", + "from deafrica_waterbodies.filters import split_large_polygons, filter_by_area" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3af67100-ff19-440f-b6e5-fe5724244ee5", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "# These are the default AWS configurations for the Analysis Sandbox.\n", + "# that are set in the environmnet variables.\n", + "aws_default_config = {\n", + " # \"AWS_NO_SIGN_REQUEST\": \"YES\",\n", + " \"AWS_SECRET_ACCESS_KEY\": \"fake\",\n", + " \"AWS_ACCESS_KEY_ID\": \"fake\",\n", + "}\n", + "\n", + "# To access public bucket, need to remove the AWS credentials in\n", + "# the environment variables or the following error will occur.\n", + "# PermissionError: The AWS Access Key Id you provided does not exist in our records.\n", + "\n", + "for key in aws_default_config.keys():\n", + " if key in os.environ:\n", + " del os.environ[key]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7acc8246-b844-4e92-bc15-204bb54702e8", + "metadata": {}, + "outputs": [], + "source": [ + "verbose = 1\n", + "output_directory = \"s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental\"\n", + "# output_directory = \"s3://deafrica-waterbodies-dev/0-0-1/shapefile/\"\n", + "handle_large_polygons = \"erode-dilate-v1\"\n", + "pp_test_threshold = 0.005\n", + "large_polygons_threshold = 10**8" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "32a30849-c86a-4b8a-947c-cc12245d618f", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up logger.\n", + "logging_setup(verbose=verbose)\n", + "_log = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1670f7c9-b5bf-4c41-b1c7-150da3728985", + "metadata": {}, + "outputs": [], + "source": [ + "# Support pathlib paths.\n", + "output_directory = str(output_directory)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e1a0e9e0-2af9-473c-bdd2-65b4d64032f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-18 20:57:38,824] {98882911.py:2} INFO - Loading raster polygons...\n", + "[2023-10-18 20:57:39,321] {98882911.py:8} INFO - Raster polygons count 53977.\n" + ] + } + ], + "source": [ + "# Load the raster polygons\n", + "_log.info(\"Loading raster polygons...\")\n", + "\n", + "raster_polygons_fp = os.path.join(output_directory, \"raster_polygons_with_holes_filled.parquet\")\n", + "\n", + "raster_polygons = gpd.read_parquet(raster_polygons_fp)\n", + "\n", + "# Drop the attributes column if it exists.\n", + "raster_polygons.drop(columns=[\"attribute\"], 
errors=\"ignore\", inplace=True)\n", + "\n", + "_log.info(f\"Raster polygons count {len(raster_polygons)}.\")" + ] + }, + { + "cell_type": "markdown", + "id": "24a66e88-c7de-45e4-84ab-7f025592c6cb", + "metadata": {}, + "source": [ + "### Run split on large polygons" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "55cf3ada-4911-4340-a12e-1cabd284f4c0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-18 20:57:42,803] {filters.py:117} INFO - Filtering 53977 polygons by minimum area 100000000 and max area inf...\n", + "[2023-10-18 20:57:42,824] {filters.py:130} INFO - Filtered out 53961 polygons.\n", + "[2023-10-18 20:57:42,824] {2569074451.py:3} INFO - Count for polygons larger than 100000000 m2: 16\n" + ] + } + ], + "source": [ + "# Identify the large polygons.\n", + "large_polygons = filter_by_area(raster_polygons, min_polygon_size=large_polygons_threshold, max_polygon_size=math.inf)\n", + "_log.info(f\"Count for polygons larger than {large_polygons_threshold} m2: {len(large_polygons)}\")\n", + "large_polygons.to_parquet(os.path.join(output_directory, \"large_polygons.parquet\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9d2f5aae-5c16-421f-94f5-bc5c30866dae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-18 20:57:53,816] {36683088.py:5} INFO - 16 large polygons removed from raster polygons.\n" + ] + } + ], + "source": [ + "# Remove the large polygons from the raster polygons.\n", + "large_polygons_idx = large_polygons.index.values\n", + "raster_polygons_large_removed = raster_polygons.drop(index=large_polygons_idx)\n", + "raster_polygons_large_removed.drop(columns=[\"area_m2\"], errors=\"ignore\", inplace=True)\n", + "_log.info(f\"{len(raster_polygons) - len(raster_polygons_large_removed)} large polygons removed from raster polygons.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ea668664-51e3-4dd5-9d7c-92897361391b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-18 20:57:56,801] {filters.py:605} INFO - Splitting large polygons using the `erode-dilate-v1` method, using the threshold 0.005.\n", + "[2023-10-18 20:57:56,811] {filters.py:416} INFO - Splitting 7 polygons.\n", + "[2023-10-18 20:59:41,346] {filters.py:436} INFO - Polygon count after splitting using erode-dilate-v1 method: 1142\n", + "CPU times: user 1min 44s, sys: 174 ms, total: 1min 44s\n", + "Wall time: 1min 44s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Split the large polygons.\n", + "large_polygons_handled = split_large_polygons(waterbody_polygons=large_polygons, pp_test_threshold=pp_test_threshold, method=handle_large_polygons)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ae891373-f157-44df-b47c-d3265e27c5cd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-18 20:59:41,358] {2280308349.py:3} INFO - Polygon count after handling large polygons 55103.\n" + ] + } + ], + "source": [ + "# Add back in the newly split polygons.\n", + "raster_polygons_large_poly_split = pd.concat([raster_polygons_large_removed, large_polygons_handled], ignore_index=True)\n", + "_log.info(f\"Polygon count after handling large polygons {len(raster_polygons_large_poly_split)}.\")\n", + "raster_polygons_large_poly_split.to_parquet(os.path.join(output_directory, 
\"raster_polygons_large_polygons_handled.parquet\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/RemovePolygonswithinOtherPolygons.ipynb b/notebooks/RemovePolygonswithinOtherPolygons.ipynb new file mode 100644 index 00000000..dbb9d29b --- /dev/null +++ b/notebooks/RemovePolygonswithinOtherPolygons.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "35b518b7-59c1-42e5-8eca-4e6831a43018", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import os\n", + "\n", + "import geopandas as gpd\n", + "from deafrica_waterbodies.cli.logs import logging_setup\n", + "from deafrica_waterbodies.id_field import guess_id_field\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "597d7da8-d56e-472b-96c6-002f430e5e0f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# These are the default AWS configurations for the Analysis Sandbox.\n", + "# that are set in the environmnet variables.\n", + "aws_default_config = {\n", + " # \"AWS_NO_SIGN_REQUEST\": \"YES\",\n", + " \"AWS_SECRET_ACCESS_KEY\": \"fake\",\n", + " \"AWS_ACCESS_KEY_ID\": \"fake\",\n", + "}\n", + "\n", + "# To access public bucket, need to remove the AWS credentials in\n", + "# the environment variables or the following error will occur.\n", + "# PermissionError: The AWS Access Key Id you provided does not exist in our records.\n", + "\n", + "for key in aws_default_config.keys():\n", + " if key in os.environ:\n", + " del os.environ[key]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "98d95ab9-904b-4a2a-b2b3-5cd0dc1428eb", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up logger.\n", + "logging_setup(verbose=1)\n", + "_log = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "590920e7-7c91-4c14-8060-53395dc61214", + "metadata": {}, + "outputs": [], + "source": [ + "polygons_vector_file = \"s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental/continentalwaterbodies.parquet\"\n", + "use_id = \"UID\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "55339faf-00bf-4ab7-9456-a0e26d1527fd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-19 18:55:19,696] {3451265644.py:9} INFO - Polygon count 11654\n" + ] + } + ], + "source": [ + "# Read the vector file.\n", + "try:\n", + " polygons_gdf = gpd.read_file(polygons_vector_file)\n", + "except Exception as error:\n", + " _log.exception(f\"Could not read file {polygons_vector_file}\")\n", + " _log.error(error)\n", + " raise error\n", + "else:\n", + " _log.info(f\"Polygon count {len(polygons_gdf)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "473dc11c-b833-463a-ac83-978fd7691525", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-19 18:55:19,702] {id_field.py:64} INFO - 
Values in the column UID are unique.\n" + ] + } + ], + "source": [ + "id_field = guess_id_field(polygons_gdf, use_id)\n", + "\n", + "# Set the ID field as the index.\n", + "polygons_gdf.set_index(id_field, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ad96502b-1ca9-4c7b-b2a6-1374588a2431", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 20.4 s, sys: 1.29 s, total: 21.7 s\n", + "Wall time: 21.7 s\n" + ] + } + ], + "source": [ + "%%time\n", + "polygons_to_delete = []\n", + "for row in polygons_gdf.itertuples():\n", + " row_id = row.Index\n", + " row_geom = row.geometry\n", + "\n", + " polygons_to_check_against = polygons_gdf.loc[polygons_gdf.index != row_id]\n", + "\n", + " # Check if the row geometry is within any of the other polygons.\n", + " if polygons_to_check_against.geometry.contains(row_geom).any():\n", + " polygons_to_delete.append(row_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1ef3db64-2cf9-4325-afcd-958a8d4719e1", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the parent directory of the polygons vector file.\n", + "dir_name = os.path.dirname(polygons_vector_file)\n", + "# Get the file name of the polygons vector file without the file extenstion.\n", + "base_name = os.path.splitext(os.path.basename(polygons_vector_file))[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7a7c2ad6-604b-4485-ab6e-2f2d10611033", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-19 18:55:41,381] {2775308144.py:2} INFO - Found 1 polygons within polygons.\n" + ] + } + ], + "source": [ + "polygons_to_delete_gdf = polygons_gdf.loc[polygons_gdf.index.isin(polygons_to_delete)]\n", + "_log.info(f\"Found {len(polygons_to_delete_gdf)} polygons within polygons.\")\n", + "\n", + "polygons_to_delete_fp = os.path.join(dir_name, f\"{base_name}_polygons_to_delete.parquet\")\n", + "polygons_to_delete_gdf.to_parquet(polygons_to_delete_fp)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "066a49c1-126e-460d-b787-d8ca07a657f1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-19 18:55:41,409] {421957777.py:2} INFO - Polygon count after handling polygons within polygons 11653.\n" + ] + } + ], + "source": [ + "polygons_within_polygons_removed = polygons_gdf.loc[~polygons_gdf.index.isin(polygons_to_delete)]\n", + "_log.info(f\"Polygon count after handling polygons within polygons {len(polygons_within_polygons_removed)}.\")\n", + "\n", + "polygons_within_polygons_removed_fp = os.path.join(dir_name, f\"{base_name}_polygons_within_polygons_removed.parquet\")\n", + "polygons_within_polygons_removed.to_parquet(polygons_within_polygons_removed_fp)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 3e7355b7..74241756 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,9 +25,11 @@ classifiers=[ ] requires-python = ">=3.6.0" dependencies = [ - 'boto3', + 'aiobotocore[awscli,boto3]', + # 
'boto3', + # 'botocore', + # 'types-aiobotocore[essential]', 'boto3-stubs[sqs,s3]', - 'botocore', 'click', 'datacube', 'deafrica-tools >= 2.1.2', @@ -36,6 +38,7 @@ dependencies = [ 'geopandas', 'numpy', 'odc-dscache', + 'odc-stats', 'pandas', 'python-dateutil', 'python-geohash', diff --git a/requirements.in b/requirements.in index 5106a8ab..a054f600 100644 --- a/requirements.in +++ b/requirements.in @@ -1,7 +1,9 @@ # Dependencies listed in pyproject.toml -boto3 +aiobotocore[awscli,boto3] >= 2.7.0 +#boto3 +#botocore +#types-aiobotocore[essential] boto3-stubs[sqs,s3] -botocore click datacube deafrica-tools >= 2.1.2 --extra-index-url="https://packages.dea.ga.gov.au" @@ -10,6 +12,7 @@ gdal geopandas numpy odc-dscache --extra-index-url="https://packages.dea.ga.gov.au" +odc-stats pandas python-dateutil python-geohash @@ -21,6 +24,7 @@ isort flake8 # Testing pytest +coverage jupyterlab moto[all] # Required by index_tiles.sh diff --git a/requirements.txt b/requirements.txt index 5e5d65be..4afffed1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,13 +12,16 @@ affine==2.4.0 # eodatasets3 # odc-algo # odc-geo + # odc-stac # rasterio # rasterstats -aiobotocore[boto3]==2.6.0 +aiobotocore[awscli,boto3]==2.7.0 # via + # -r requirements.in + # aiobotocore # deafrica-tools # odc-cloud -aiohttp==3.8.5 +aiohttp==3.8.6 # via # aiobotocore # dask-gateway @@ -27,7 +30,7 @@ aioitertools==0.11.0 # via aiobotocore aiosignal==1.3.1 # via aiohttp -annotated-types==0.5.0 +annotated-types==0.6.0 # via pydantic anyio==4.0.0 # via jupyter-server @@ -35,9 +38,9 @@ argon2-cffi==23.1.0 # via jupyter-server argon2-cffi-bindings==21.2.0 # via argon2-cffi -arrow==1.2.3 +arrow==1.3.0 # via isoduration -asttokens==2.4.0 +asttokens==2.4.1 # via stack-data async-lru==2.0.4 # via jupyterlab @@ -55,58 +58,58 @@ attrs==23.1.0 # rasterio # referencing # sarif-om -aws-sam-translator==1.76.0 +aws-sam-translator==1.79.0 # via cfn-lint -aws-xray-sdk==2.12.0 +aws-xray-sdk==2.12.1 # via moto -babel==2.12.1 +awscli==1.29.64 + # via aiobotocore +babel==2.13.1 # via jupyterlab-server -backcall==0.2.0 - # via ipython beautifulsoup4==4.12.2 # via nbconvert -black==23.9.1 +black==23.11.0 # via # -r requirements.in # deafrica-tools -bleach==6.0.0 +bleach==6.1.0 # via nbconvert -bokeh==3.2.2 +bokeh==3.3.1 # via dask -boltons==23.0.0 +boltons==23.1.1 # via eodatasets3 -boto3==1.28.17 +boto3==1.28.64 # via - # -r requirements.in # aiobotocore # aws-sam-translator # datacube # eodatasets3 # moto # odc-cloud -boto3-stubs[s3,sqs]==1.28.57 +boto3-stubs[s3,sqs]==1.28.83 # via -r requirements.in -botocore==1.31.17 +botocore==1.31.64 # via - # -r requirements.in # aiobotocore # aws-xray-sdk + # awscli # boto3 # datacube # eodatasets3 # moto # odc-cloud + # odc-stats # s3transfer -botocore-stubs==1.31.57 +botocore-stubs==1.31.83 # via boto3-stubs bottleneck==1.3.7 # via datacube -branca==0.6.0 +branca==0.7.0 # via # deafrica-tools # folium # ipyleaflet -cachetools==5.3.1 +cachetools==5.3.2 # via # datacube # odc-geo @@ -123,18 +126,19 @@ cffi==1.16.0 # via # argon2-cffi-bindings # cryptography -cfn-lint==0.80.3 +cfn-lint==0.83.1 # via moto -cftime==1.6.2 +cftime==1.6.3 # via netcdf4 -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 # via # aiohttp # requests -ciso8601==2.3.0 +ciso8601==2.3.1 # via # datacube # eodatasets3 + # odc-stats click==8.1.7 # via # -r requirements.in @@ -148,6 +152,8 @@ click==8.1.7 # eodatasets3 # fiona # odc-apps-dc-tools + # odc-dscache + # odc-stats # rasterio # rasterstats click-plugins==1.1.1 @@ -159,28 +165,32 
@@ cligj==0.7.2 # fiona # rasterio # rasterstats -cloudpickle==2.2.1 +cloudpickle==3.0.0 # via # dask # dask-glm # datacube # distributed -comm==0.1.4 +colorama==0.4.4 + # via awscli +comm==0.2.0 # via # ipykernel # ipywidgets -contourpy==1.1.1 +contourpy==1.2.0 # via # bokeh # matplotlib -cryptography==41.0.4 +coverage==7.3.2 + # via -r requirements.in +cryptography==41.0.5 # via # moto # python-jose # sshpubkeys -cycler==0.12.0 +cycler==0.12.1 # via matplotlib -dask[array,complete,dataframe,diagnostics,distributed]==2023.9.2 +dask[array,complete,dataframe,diagnostics,distributed]==2023.10.1 # via # dask-gateway # dask-glm @@ -190,25 +200,29 @@ dask[array,complete,dataframe,diagnostics,distributed]==2023.9.2 # deafrica-tools # distributed # odc-algo + # odc-stac + # odc-stats dask-gateway==2023.9.0 # via deafrica-tools -dask-glm==0.3.0 +dask-glm==0.3.1 # via dask-ml dask-image==2023.8.1 # via odc-algo dask-ml==2023.3.24 # via deafrica-tools -datacube[performance,s3]==1.8.15 +datacube[performance,s3]==1.8.17 # via # -r requirements.in # deafrica-tools # eodatasets3 # odc-algo # odc-apps-dc-tools + # odc-dscache + # odc-stats # odc-ui datadog==0.47.0 # via odc-apps-dc-tools -deafrica-tools==2.1.2 +deafrica-tools==2.3.0 # via -r requirements.in debugpy==1.8.0 # via ipykernel @@ -220,7 +234,7 @@ defusedxml==0.7.1 # nbconvert deprecat==2.1.1 # via datacube -distributed==2023.9.2 +distributed==2023.10.1 # via # dask # dask-gateway @@ -228,37 +242,43 @@ distributed==2023.9.2 # dask-ml # datacube # odc-algo + # odc-stats docker==6.1.3 # via moto +docutils==0.16 + # via awscli ecdsa==0.18.0 # via # moto # python-jose # sshpubkeys eodatasets3==0.29.7 - # via odc-apps-dc-tools + # via + # odc-apps-dc-tools + # odc-stats exceptiongroup==1.1.3 # via # anyio # cattrs # ipython # pytest -executing==1.2.0 +executing==2.0.1 # via stack-data -fastjsonschema==2.18.0 +fastjsonschema==2.18.1 # via nbformat -fiona==1.9.4.post1 +fiona==1.9.5 # via # deafrica-tools # geopandas + # odc-stats # rasterstats flake8==6.0.0 # via # -r requirements.in # deafrica-tools -folium==0.14.0 +folium==0.15.0 # via deafrica-tools -fonttools==4.43.0 +fonttools==4.44.0 # via matplotlib fqdn==1.5.1 # via jsonschema @@ -266,21 +286,23 @@ frozenlist==1.4.0 # via # aiohttp # aiosignal -fsspec==2023.9.2 +fsspec==2023.10.0 # via # -r requirements.in # dask # deafrica-tools # odc-apps-dc-tools + # odc-stats gdal==3.6.3 # via # -r requirements.in # deafrica-tools -geoalchemy2==0.14.1 + # odc-stats +geoalchemy2==0.14.2 # via datacube geographiclib==2.0 # via geopy -geojson==3.0.1 +geojson==3.1.0 # via deafrica-tools geopandas==0.14.0 # via @@ -290,9 +312,9 @@ geopy==2.4.0 # via deafrica-tools graphql-core==3.2.3 # via moto -greenlet==2.0.2 +greenlet==3.0.1 # via sqlalchemy -h5py==3.9.0 +h5py==3.10.0 # via eodatasets3 hdstats==0.2.1 # via deafrica-tools @@ -300,28 +322,25 @@ idna==3.4 # via # anyio # jsonschema - # moto # requests # yarl -imageio==2.31.4 +imageio==2.32.0 # via # pims # scikit-image importlib-metadata==6.8.0 - # via - # dask - # moto -importlib-resources==6.1.0 + # via dask +importlib-resources==6.1.1 # via odc-apps-dc-tools iniconfig==2.0.0 # via pytest -ipykernel==6.25.2 +ipykernel==6.26.0 # via jupyterlab ipyleaflet==0.17.4 # via # deafrica-tools # odc-ui -ipython==8.16.0 +ipython==8.17.2 # via # deafrica-tools # ipykernel @@ -339,7 +358,7 @@ isort==5.12.0 # via # -r requirements.in # deafrica-tools -jedi==0.19.0 +jedi==0.19.1 # via ipython jinja2==3.1.2 # via @@ -370,14 +389,12 @@ jsondiff==2.0.0 jsonpatch==1.33 # via 
cfn-lint jsonpickle==3.0.2 - # via - # aws-xray-sdk - # jschema-to-python + # via jschema-to-python jsonpointer==2.4 # via # jsonpatch # jsonschema -jsonschema[format-nongpl]==4.19.1 +jsonschema[format-nongpl]==4.19.2 # via # aws-sam-translator # cfn-lint @@ -389,7 +406,7 @@ jsonschema[format-nongpl]==4.19.1 # openapi-schema-validator # openapi-spec-validator # pystac -jsonschema-spec==0.2.4 +jsonschema-path==0.3.1 # via openapi-spec-validator jsonschema-specifications==2023.7.1 # via @@ -397,12 +414,12 @@ jsonschema-specifications==2023.7.1 # openapi-schema-validator junit-xml==1.9 # via cfn-lint -jupyter-client==8.3.1 +jupyter-client==8.6.0 # via # ipykernel # jupyter-server # nbclient -jupyter-core==5.3.2 +jupyter-core==5.5.0 # via # ipykernel # jupyter-client @@ -411,11 +428,11 @@ jupyter-core==5.3.2 # nbclient # nbconvert # nbformat -jupyter-events==0.7.0 +jupyter-events==0.9.0 # via jupyter-server jupyter-lsp==2.2.0 # via jupyterlab -jupyter-server==2.7.3 +jupyter-server==2.10.0 # via # jupyter-lsp # jupyterlab @@ -425,24 +442,26 @@ jupyter-server-terminals==0.4.4 # via jupyter-server jupyter-ui-poll==0.2.2 # via odc-ui -jupyterlab==4.0.6 +jupyterlab==4.0.8 # via -r requirements.in jupyterlab-pygments==0.2.2 # via nbconvert -jupyterlab-server==2.25.0 +jupyterlab-server==2.25.1 # via jupyterlab jupyterlab-widgets==3.0.9 # via ipywidgets kiwisolver==1.4.5 # via matplotlib -lark==1.1.7 +lark==1.1.8 # via datacube lazy-loader==0.3 # via scikit-image lazy-object-proxy==1.9.0 # via openapi-spec-validator -llvmlite==0.41.0 +llvmlite==0.41.1 # via numba +lmdb==1.4.1 + # via odc-dscache locket==1.0.0 # via # distributed @@ -458,7 +477,7 @@ markupsafe==2.1.3 # jinja2 # nbconvert # werkzeug -matplotlib==3.8.0 +matplotlib==3.8.1 # via # deafrica-tools # odc-ui @@ -469,9 +488,9 @@ matplotlib-inline==0.1.6 # ipython mccabe==0.7.0 # via flake8 -mistune==3.0.1 +mistune==3.0.2 # via nbconvert -moto[all]==4.2.4 +moto[all]==4.2.7 # via -r requirements.in mpmath==1.3.0 # via sympy @@ -481,19 +500,21 @@ multidict==6.0.4 # via # aiohttp # yarl +multipart==0.2.4 + # via moto multipledispatch==1.0.0 # via # dask-glm # dask-ml mypy-boto3-s3==1.28.55 # via boto3-stubs -mypy-boto3-sqs==1.28.36 +mypy-boto3-sqs==1.28.82 # via boto3-stubs mypy-extensions==1.0.0 # via black -nbclient==0.8.0 +nbclient==0.9.0 # via nbconvert -nbconvert==7.8.0 +nbconvert==7.11.0 # via jupyter-server nbformat==5.9.2 # via @@ -502,17 +523,17 @@ nbformat==5.9.2 # nbconvert nest-asyncio==1.5.8 # via ipykernel -netcdf4==1.6.4 +netcdf4==1.6.5 # via # datacube # pytmd -networkx==3.1 +networkx==3.2.1 # via # cfn-lint # scikit-image notebook-shim==0.2.3 # via jupyterlab -numba==0.58.0 +numba==0.58.1 # via # dask-ml # sparse @@ -520,7 +541,7 @@ numexpr==2.8.7 # via # deafrica-tools # odc-algo -numpy==1.25.2 +numpy==1.26.1 # via # -r requirements.in # bokeh @@ -543,12 +564,13 @@ numpy==1.25.2 # numexpr # odc-algo # odc-geo + # odc-stac + # odc-stats # odc-ui # pandas # pims # pyarrow # pytmd - # pywavelets # rasterio # rasterstats # scikit-image @@ -563,26 +585,41 @@ numpy==1.25.2 odc-algo==0.2.3 # via # deafrica-tools + # odc-stats # odc-ui -odc-apps-dc-tools==0.2.13 +odc-apps-dc-tools==0.2.14 # via -r requirements.in odc-cloud[async]==0.2.3 - # via odc-apps-dc-tools + # via + # odc-apps-dc-tools + # odc-stats +odc-dscache==0.2.2 + # via + # -r requirements.in + # odc-stats odc-geo==0.4.1 - # via deafrica-tools + # via + # deafrica-tools + # odc-stac odc-io==0.2.1 - # via odc-apps-dc-tools + # via + # odc-apps-dc-tools + # odc-stats 
+odc-stac==0.3.7 + # via odc-stats +odc-stats==1.0.36 + # via -r requirements.in odc-ui==0.2.1.dev3676 # via deafrica-tools -openapi-schema-validator==0.6.1 +openapi-schema-validator==0.6.2 # via openapi-spec-validator -openapi-spec-validator==0.6.0 +openapi-spec-validator==0.7.1 # via moto overrides==7.4.0 # via jupyter-server -owslib==0.29.2 +owslib==0.29.3 # via deafrica-tools -packaging==23.1 +packaging==23.2 # via # black # bokeh @@ -604,7 +641,7 @@ packaging==23.1 # scikit-image # setuptools-scm # xarray -pandas==2.1.1 +pandas==2.1.2 # via # -r requirements.in # bokeh @@ -614,6 +651,9 @@ pandas==2.1.1 # datacube # deafrica-tools # geopandas + # odc-dscache + # odc-stac + # odc-stats # odc-ui # seaborn # xarray @@ -624,17 +664,15 @@ parso==0.8.3 partd==1.4.1 # via dask pathable==0.4.3 - # via jsonschema-spec + # via jsonschema-path pathspec==0.11.2 # via black -pbr==5.11.1 +pbr==6.0.0 # via # jschema-to-python # sarif-om pexpect==4.8.0 # via ipython -pickleshare==0.7.5 - # via ipython pillow==10.0.1 # via # bokeh @@ -643,31 +681,33 @@ pillow==10.0.1 # scikit-image pims==0.6.1 # via dask-image -platformdirs==3.10.0 +platformdirs==3.11.0 # via # black # jupyter-core pluggy==1.3.0 # via pytest -prometheus-client==0.17.1 +prometheus-client==0.18.0 # via jupyter-server prompt-toolkit==3.0.39 # via ipython -psutil==5.9.5 +psutil==5.9.6 # via # distributed # ipykernel -psycopg2==2.9.8 - # via datacube +psycopg2==2.9.9 + # via + # datacube + # odc-dscache ptyprocess==0.7.0 # via # pexpect # terminado pure-eval==0.2.2 # via stack-data -py-partiql-parser==0.3.7 +py-partiql-parser==0.4.1 # via moto -pyarrow==13.0.0 +pyarrow==14.0.1 # via dask pyasn1==0.5.0 # via @@ -700,17 +740,19 @@ pyproj==3.6.1 # geopandas # odc-geo # pytmd -pystac[validation]==1.8.4 +pystac[validation]==1.9.0 # via # eodatasets3 # odc-apps-dc-tools + # odc-stac + # odc-stats # pystac-client # rio-stac pystac-client==0.7.5 # via # deafrica-tools # odc-apps-dc-tools -pytest==7.4.2 +pytest==7.4.3 # via # -r requirements.in # deafrica-tools @@ -732,10 +774,12 @@ python-dateutil==2.8.2 python-geohash==0.8.5 # via -r requirements.in python-jose[cryptography]==3.3.0 - # via moto + # via + # moto + # python-jose python-json-logger==2.0.7 # via jupyter-events -python-rapidjson==1.11 +python-rapidjson==1.13 # via eodatasets3 pytmd==2.0.8 # via deafrica-tools @@ -744,17 +788,16 @@ pytz==2023.3.post1 # deafrica-tools # owslib # pandas -pywavelets==1.4.1 - # via scikit-image pyyaml==6.0.1 # via + # awscli # bokeh # cfn-lint # dask # dask-gateway # datacube # distributed - # jsonschema-spec + # jsonschema-path # jupyter-events # moto # odc-apps-dc-tools @@ -765,12 +808,14 @@ pyzmq==25.1.1 # ipykernel # jupyter-client # jupyter-server -rasterio==1.3.8 +rasterio==1.3.9 # via # datacube # deafrica-tools # eodatasets3 # odc-algo + # odc-stac + # odc-stats # odc-ui # rasterstats # rio-stac @@ -779,10 +824,10 @@ rasterstats==0.19.0 referencing==0.30.2 # via # jsonschema - # jsonschema-spec + # jsonschema-path # jsonschema-specifications # jupyter-events -regex==2023.8.8 +regex==2023.10.3 # via cfn-lint requests==2.31.0 # via @@ -790,14 +835,14 @@ requests==2.31.0 # deafrica-tools # docker # folium - # jsonschema-spec + # jsonschema-path # jupyterlab-server # moto # owslib # pystac-client # responses # urlpath -responses==0.23.3 +responses==0.24.0 # via moto rfc3339-validator==0.1.4 # via @@ -810,27 +855,31 @@ rfc3986-validator==0.1.1 # jupyter-events rio-stac==0.8.1 # via odc-apps-dc-tools -rpds-py==0.10.3 +rpds-py==0.12.0 # via # jsonschema # 
referencing -rsa==4.9 - # via python-jose -ruamel-yaml==0.17.33 +rsa==4.7.2 + # via + # awscli + # python-jose +ruamel-yaml==0.18.5 # via # datacube # eodatasets3 -ruamel-yaml-clib==0.2.7 +ruamel-yaml-clib==0.2.8 # via ruamel-yaml -s3transfer==0.6.2 - # via boto3 +s3transfer==0.7.0 + # via + # awscli + # boto3 sarif-om==1.0.4 # via cfn-lint -scikit-image==0.21.0 +scikit-image==0.22.0 # via # deafrica-tools # odc-algo -scikit-learn==1.3.1 +scikit-learn==1.3.2 # via # dask-glm # dask-ml @@ -851,9 +900,9 @@ seaborn==0.13.0 # via deafrica-tools send2trash==1.8.2 # via jupyter-server -setuptools-scm==8.0.3 +setuptools-scm==8.0.4 # via pytmd -shapely==2.0.1 +shapely==2.0.2 # via # -r requirements.in # datacube @@ -862,7 +911,7 @@ shapely==2.0.1 # geopandas # odc-geo # rasterstats -simplejson==3.19.1 +simplejson==3.19.2 # via rasterstats six==1.16.0 # via @@ -885,22 +934,22 @@ soupsieve==2.5 # via beautifulsoup4 sparse==0.14.0 # via dask-glm -sqlalchemy==1.4.49 +sqlalchemy==1.4.50 # via # -r requirements.in # datacube # geoalchemy2 sshpubkeys==3.3.1 # via moto -stack-data==0.6.2 +stack-data==0.6.3 # via ipython -structlog==23.1.0 +structlog==23.2.0 # via eodatasets3 sympy==1.12 # via cfn-lint -tblib==2.0.0 +tblib==3.0.0 # via distributed -terminado==0.17.1 +terminado==0.18.0 # via # jupyter-server # jupyter-server-terminals @@ -925,6 +974,9 @@ toolz==0.12.0 # distributed # odc-algo # odc-apps-dc-tools + # odc-dscache + # odc-stac + # odc-stats # partd tornado==6.3.3 # via @@ -940,7 +992,8 @@ tqdm==4.66.1 # via # -r requirements.in # deafrica-tools -traitlets==5.10.1 + # odc-stats +traitlets==5.13.0 # via # comm # ipykernel @@ -958,10 +1011,10 @@ traitlets==5.10.1 # traittypes traittypes==0.2.1 # via ipyleaflet -types-awscrt==0.19.1 +types-awscrt==0.19.9 # via botocore-stubs -types-pyyaml==6.0.12.12 - # via responses +types-python-dateutil==2.8.19.14 + # via arrow types-s3transfer==0.7.0 # via boto3-stubs typing-extensions==4.8.0 @@ -980,7 +1033,7 @@ tzdata==2023.3 # via pandas uri-template==1.3.0 # via jsonschema -urllib3==1.26.16 +urllib3==2.0.7 # via # botocore # distributed @@ -989,7 +1042,7 @@ urllib3==1.26.16 # responses urlpath==1.2.0 # via odc-apps-dc-tools -wcwidth==0.2.7 +wcwidth==0.2.9 # via prompt-toolkit webcolors==1.13 # via jsonschema @@ -997,29 +1050,31 @@ webencodings==0.5.1 # via # bleach # tinycss2 -websocket-client==1.6.3 +websocket-client==1.6.4 # via # docker # jupyter-server -werkzeug==2.3.7 +werkzeug==3.0.1 # via moto widgetsnbextension==4.0.9 # via ipywidgets -wrapt==1.15.0 +wrapt==1.16.0 # via # aiobotocore # aws-xray-sdk # deprecat -xarray==2023.9.0 +xarray==2023.10.1 # via # datacube # deafrica-tools # eodatasets3 # odc-algo + # odc-stac + # odc-stats # odc-ui xmltodict==0.13.0 # via moto -xyzservices==2023.7.0 +xyzservices==2023.10.1 # via # bokeh # ipyleaflet @@ -1028,9 +1083,9 @@ yarl==1.9.2 zict==3.0.0 # via distributed zipp==3.17.0 - # via - # importlib-metadata - # moto + # via importlib-metadata +zstandard==0.22.0 + # via odc-dscache # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 00000000..70639378 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,53 @@ +# DE Africa Waterbodies testing readme + +## Setting up Docker for testing + +- We use `docker compose` to manage the test infrastructure (packages and environment) . +- Use the command `make test-env` from the root directory to set up and launch the test environment. 
This sets up the test datacube and an environment to run `deafrica-waterbodies` in. +- You should have three containers running. You can check this by running `docker compose ps`. ![Expected Running Containers](../figures/ExpectedRunningContainers.png) +- Now you can run tests in Docker. +- If the Docker container needs rebuilding, run `docker compose build` or `make build`. +- Once you are done with testing, you can shut down the containers with `docker compose down`, `make down` or `make clean`. + +## Running tests in Docker + +- Once the containers are up, you can run the tests with the command `docker compose exec waterbodies pytest`. +- If you want to run the tests interactively and have access to the interactive debugger, + execute bash within the waterbodies docker container with `docker compose exec waterbodies bash` or `make shell`, and then run `pytest` from the code directory: + +```bash +root@fe004etc:/code# pytest tests +``` + +## Running tests in sandbox + +The tests assume that `deafrica-waterbodies` is installed. To install it, follow the instructions in the [main README](../README.md). You can install `deafrica-waterbodies` locally for testing using `pip`: + +```bash +jovyan@jupyter-:~/dev/deafrica-waterbodies$ pip install -e . +``` + +Remember the dot (.)! + +To run the tests, use `pytest` from the deafrica-waterbodies repository root, in the terminal: + +```bash +jovyan@jupyter-:~/dev/deafrica-waterbodies$ pytest tests +``` + +Tests are automatically triggered in GitHub for any push to any branch. This behaviour is controlled by `/.github/workflows/test.yml`. + +## Adding new test data + +- The Docker test datacube needs to have datasets in it to run tests on. +- To add a new test dataset, first make sure the product is indexed in the test datacube. 
This is done with a line like the following: + +```bash +docker compose exec -T index datacube -v product add https://raw.githubusercontent.com/digitalearthafrica/config/master/products/wofs_ls_summary_alltime.odc-product.yaml +``` + +- Add the individual dataset with `s3-to-dc` inside the heredoc (with the others): + +```bash +s3-to-dc "s3://deafrica-services/wofs_ls/1-0-0/189/038/2023/01/*/*.json" --stac --no-sign-request --skip-lineage 'wofs_ls' +``` diff --git a/tests/data/sm9rtw98n.geojson b/tests/data/sm9rtw98n.geojson new file mode 100644 index 00000000..da86b853 --- /dev/null +++ b/tests/data/sm9rtw98n.geojson @@ -0,0 +1,7 @@ +{ +"type": "FeatureCollection", +"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC::CRS84" } }, +"features": [ +{ "type": "Feature", "properties": { "area_m2": 670500.0004, "UID": "sm9rtw98n", "WB_ID": 702846, "perim_m": 8160.0, "timeseries": "https://deafrica-waterbodies-dev.s3.af-south-1.amazonaws.com/waterbodies/v0.0.2/surface_area_change/sm9r/sm9rtw98n.csv" }, "geometry": { "type": "Polygon", "coordinates": [ [ [ 13.251935888451037, 32.299845492052341 ], [ 13.251935888451037, 32.299568347080807 ], [ 13.251624963416701, 32.299568347080807 ], [ 13.251624963416701, 32.298459775506743 ], [ 13.251314038382363, 32.298459775506743 ], [ 13.251314038382363, 32.298736917153512 ], [ 13.251003113348025, 32.298736917153512 ], [ 13.251003113348025, 32.299014059631396 ], [ 13.250692188313687, 32.299014059631396 ], [ 13.250692188313687, 32.299291202940552 ], [ 13.250381263279355, 32.299291202940552 ], [ 13.250381263279355, 32.298459775506743 ], [ 13.250692188313687, 32.298459775506743 ], [ 13.250692188313687, 32.298182634691187 ], [ 13.251003113348025, 32.298182634691187 ], [ 13.251003113348025, 32.297905494706711 ], [ 13.249448488176343, 32.297905494706711 ], [ 13.249448488176343, 32.297628355553435 ], [ 13.249137563142007, 32.297628355553435 ], [ 13.249137563142007, 32.297074079740142 ], [ 13.248826638107673, 32.297074079740142 ], [ 13.248826638107673, 32.29679694308016 ], [ 13.248515713073337, 32.29679694308016 ], [ 13.248515713073337, 32.296519807251237 ], [ 13.247582937970325, 32.296519807251237 ], [ 13.247582937970325, 32.296242672253364 ], [ 13.247272012935992, 32.296242672253364 ], [ 13.247272012935992, 32.29596553808657 ], [ 13.246961087901656, 32.29596553808657 ], [ 13.246961087901656, 32.295688404750798 ], [ 13.247272012935992, 32.295688404750798 ], [ 13.247272012935992, 32.29485700972954 ], [ 13.247582937970325, 32.29485700972954 ], [ 13.247582937970325, 32.294302750536936 ], [ 13.247893863004661, 32.294302750536936 ], [ 13.247893863004661, 32.293194242123008 ], [ 13.248204788039001, 32.293194242123008 ], [ 13.248204788039001, 32.292362869537037 ], [ 13.248515713073337, 32.292362869537037 ], [ 13.248515713073337, 32.292085747003398 ], [ 13.249448488176343, 32.292085747003398 ], [ 13.249448488176343, 32.291808625300639 ], [ 13.249759413210683, 32.291808625300639 ], [ 13.249759413210683, 32.291531504428704 ], [ 13.250070338245019, 32.291531504428704 ], [ 13.250070338245019, 32.290977265177226 ], [ 13.250381263279355, 32.290977265177226 ], [ 13.250381263279355, 32.290423029248899 ], [ 13.251003113348025, 32.290423029248899 ], [ 13.251003113348025, 32.290145912530868 ], [ 13.251314038382363, 32.290145912530868 ], [ 13.251314038382363, 32.28903745396596 ], [ 13.251624963416701, 32.28903745396596 ], [ 13.251624963416701, 32.286820576708202 ], [ 13.251935888451037, 32.286820576708202 ], [ 13.251935888451037, 32.282664075117914 ], [ 
13.251624963416701, 32.282664075117914 ], [ 13.251624963416701, 32.282386981655314 ], [ 13.251314038382363, 32.282386981655314 ], [ 13.251314038382363, 32.282109889023104 ], [ 13.251003113348025, 32.282109889023104 ], [ 13.251003113348025, 32.281832797221213 ], [ 13.249759413210683, 32.281832797221213 ], [ 13.249759413210683, 32.281555706249613 ], [ 13.250070338245019, 32.281555706249613 ], [ 13.250070338245019, 32.281278616108303 ], [ 13.251314038382363, 32.281278616108303 ], [ 13.251314038382363, 32.281555706249613 ], [ 13.251935888451037, 32.281555706249613 ], [ 13.251935888451037, 32.281832797221213 ], [ 13.252557738519709, 32.281832797221213 ], [ 13.252557738519709, 32.282109889023104 ], [ 13.252868663554045, 32.282109889023104 ], [ 13.252868663554045, 32.282386981655314 ], [ 13.254112363691391, 32.282386981655314 ], [ 13.254112363691391, 32.282109889023104 ], [ 13.254423288725727, 32.282109889023104 ], [ 13.254423288725727, 32.281832797221213 ], [ 13.255045138794401, 32.281832797221213 ], [ 13.255045138794401, 32.282664075117914 ], [ 13.254734213760065, 32.282664075117914 ], [ 13.254734213760065, 32.282941169410897 ], [ 13.254423288725727, 32.282941169410897 ], [ 13.254423288725727, 32.283495360487926 ], [ 13.254734213760065, 32.283495360487926 ], [ 13.254734213760065, 32.283772457272022 ], [ 13.254423288725727, 32.283772457272022 ], [ 13.254423288725727, 32.2840495548866 ], [ 13.254112363691391, 32.2840495548866 ], [ 13.254112363691391, 32.284326653331547 ], [ 13.253490513622719, 32.284326653331547 ], [ 13.253490513622719, 32.284880852712924 ], [ 13.253179588588383, 32.284880852712924 ], [ 13.253179588588383, 32.28543505541608 ], [ 13.252868663554045, 32.28543505541608 ], [ 13.252868663554045, 32.285989261441323 ], [ 13.253179588588383, 32.285989261441323 ], [ 13.253179588588383, 32.286266365699746 ], [ 13.254112363691391, 32.286266365699746 ], [ 13.254112363691391, 32.286543470788644 ], [ 13.253490513622719, 32.286543470788644 ], [ 13.253490513622719, 32.287097683458228 ], [ 13.253179588588383, 32.287097683458228 ], [ 13.253179588588383, 32.288206118764535 ], [ 13.253490513622719, 32.288206118764535 ], [ 13.253490513622719, 32.288483229667676 ], [ 13.253801438657051, 32.288483229667676 ], [ 13.253801438657051, 32.289314567361131 ], [ 13.254112363691391, 32.289314567361131 ], [ 13.254112363691391, 32.28903745396596 ], [ 13.254423288725727, 32.28903745396596 ], [ 13.254423288725727, 32.289591681586955 ], [ 13.255045138794401, 32.289591681586955 ], [ 13.255045138794401, 32.28986879664356 ], [ 13.254734213760065, 32.28986879664356 ], [ 13.254734213760065, 32.290423029248899 ], [ 13.255045138794401, 32.290423029248899 ], [ 13.255045138794401, 32.290145912530868 ], [ 13.255356063828733, 32.290145912530868 ], [ 13.255356063828733, 32.28986879664356 ], [ 13.255666988863073, 32.28986879664356 ], [ 13.255666988863073, 32.28903745396596 ], [ 13.255977913897409, 32.28903745396596 ], [ 13.255977913897409, 32.289314567361131 ], [ 13.257221614034753, 32.289314567361131 ], [ 13.257221614034753, 32.289591681586955 ], [ 13.256599763966083, 32.289591681586955 ], [ 13.256599763966083, 32.290145912530868 ], [ 13.255977913897409, 32.290145912530868 ], [ 13.255977913897409, 32.290423029248899 ], [ 13.256288838931745, 32.290423029248899 ], [ 13.256288838931745, 32.290700146797683 ], [ 13.255666988863073, 32.290700146797683 ], [ 13.255666988863073, 32.290977265177226 ], [ 13.255045138794401, 32.290977265177226 ], [ 13.255045138794401, 32.291531504428704 ], [ 13.255356063828733, 32.291531504428704 ], [ 
13.255356063828733, 32.291808625300639 ], [ 13.255666988863073, 32.291808625300639 ], [ 13.255666988863073, 32.292085747003398 ], [ 13.255045138794401, 32.292085747003398 ], [ 13.255045138794401, 32.292362869537037 ], [ 13.254734213760065, 32.292362869537037 ], [ 13.254734213760065, 32.293194242123008 ], [ 13.254423288725727, 32.293194242123008 ], [ 13.254423288725727, 32.293471367980125 ], [ 13.254734213760065, 32.293471367980125 ], [ 13.254734213760065, 32.294302750536936 ], [ 13.254112363691391, 32.294302750536936 ], [ 13.254112363691391, 32.294579879717759 ], [ 13.253490513622719, 32.294579879717759 ], [ 13.253490513622719, 32.295134140572266 ], [ 13.253179588588383, 32.295134140572266 ], [ 13.253179588588383, 32.295411272246042 ], [ 13.252868663554045, 32.295411272246042 ], [ 13.252868663554045, 32.295688404750798 ], [ 13.252557738519709, 32.295688404750798 ], [ 13.252557738519709, 32.297074079740142 ], [ 13.252246813485369, 32.297074079740142 ], [ 13.252246813485369, 32.299845492052341 ], [ 13.251935888451037, 32.299845492052341 ] ] ] } } +] +} diff --git a/tests/test_generate_polygons_cli.py b/tests/test_generate_polygons_cli.py new file mode 100644 index 00000000..66061525 --- /dev/null +++ b/tests/test_generate_polygons_cli.py @@ -0,0 +1,69 @@ +import math +import os +from pathlib import Path + +import fsspec +import geopandas as gpd +import pytest +from click.testing import CliRunner + +from deafrica_waterbodies.cli.generate_polygons import generate_polygons + +# Test directory. +HERE = Path(__file__).parent.resolve() +TEST_WATERBODY = os.path.join(HERE, "data", "sm9rtw98n.geojson") +TEST_OUTPUT_DIRECTORY = HERE / "test_outputs" + + +@pytest.fixture +def runner(): + return CliRunner(echo_stdin=True) + + +def test_generate_polygons(runner, capsys: pytest.CaptureFixture): + aoi_vector_file = TEST_WATERBODY + tile_size_factor = 1 + num_workers = 8 + detection_threshold = 0.1 + extent_threshold = 0.05 + min_valid_observations = 60 + # raster_processing_plugin_name = "raster_processing_filtering" + output_directory = TEST_OUTPUT_DIRECTORY + min_polygon_size = 4500 + max_polygon_size = math.inf + length_threshold_km = 150 + timeseries_directory = TEST_OUTPUT_DIRECTORY + file_name_prefix = "waterbodies" + + args = [ + "--verbose", + f"--aoi-vector-file={aoi_vector_file}", + f"--tile-size-factor={tile_size_factor}", + f"--num-workers={num_workers}", + f"--detection-threshold={detection_threshold}", + f"--extent-threshold={extent_threshold}", + f"--min-valid-observations={min_valid_observations}", + f"--min-polygon-size={min_polygon_size}", + f"--max-polygon-size={max_polygon_size}", + f"--length-threshold-km={length_threshold_km}", + "--overwrite", + "--not-group-by-wofs-ls-regions", + f"--timeseries-directory={timeseries_directory}", + f"--file-name-prefix={file_name_prefix}", + f"--output-directory={output_directory}", + ] + + with capsys.disabled() as disabled: # noqa F841 + result = runner.invoke(generate_polygons, args=args, catch_exceptions=True) + + assert result.exit_code == 0 + + test_waterbodies = gpd.read_file( + os.path.join(output_directory, "historical_extent", "waterbodies.shp") + ) + + assert len(test_waterbodies) == 2 + + # File clean up. 
+ fs = fsspec.filesystem("file") + fs.rm(output_directory, recursive=True) diff --git a/tests/test_generate_timeseries_cli.py b/tests/test_generate_timeseries_cli.py new file mode 100644 index 00000000..bcb78e00 --- /dev/null +++ b/tests/test_generate_timeseries_cli.py @@ -0,0 +1,51 @@ +import os +from pathlib import Path + +import fsspec +import pandas as pd +import pytest +from click.testing import CliRunner + +from deafrica_waterbodies.cli.generate_timeseries import generate_timeseries + +# Test directory. +HERE = Path(__file__).parent.resolve() +TEST_WATERBODY = os.path.join(HERE, "data", "sm9rtw98n.geojson") +TEST_OUTPUT_DIRECTORY = HERE / "test_outputs" + + +@pytest.fixture +def runner(): + return CliRunner(echo_stdin=True) + + +def test_generate_timeseries(runner, capsys: pytest.CaptureFixture): + waterbodies_vector_file = TEST_WATERBODY + use_id = "UID" + output_directory = TEST_OUTPUT_DIRECTORY + time_span = "custom" + temporal_range = "2023-01--P1M" + + args = [ + "--verbose", + f"--waterbodies-vector-file={waterbodies_vector_file}", + f"--use-id={use_id}", + f"--output-directory={output_directory}", + f"--time-span={time_span}", + f"--temporal-range={temporal_range}", + "--not-missing-only", + ] + + with capsys.disabled() as disabled: # noqa F841 + result = runner.invoke(generate_timeseries, args=args, catch_exceptions=True) + + assert result.exit_code == 0 + + test_timeseries = pd.read_csv(os.path.join(output_directory, "sm9r/sm9rtw98n.csv")) + + assert len(test_timeseries) == 9 + assert test_timeseries.iloc[3]["pc_wet"] == 49.66442953020135 + + # File clean up. + fs = fsspec.filesystem("file") + fs.rm(output_directory, recursive=True)