Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Error running OMDownloader #19 #25

Merged
merged 2 commits into from
Apr 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/docker-build-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:

jobs:
docker:
runs-on: self-hosted
runs-on: ubuntu-latest

env:
DOCKER_TAG: latest
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/poetry-pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
publish:
runs-on: ubuntu-latest
container:
image: ghcr.io/osgeo/gdal:ubuntu-full-3.7.2
image: ghcr.io/osgeo/gdal:ubuntu-full-3.8.5

env:
POETRY_NO_INTERACTION: 1
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG GDAL_VERSION=3.8.4
ARG GDAL_VERSION=3.8.5
FROM ghcr.io/osgeo/gdal:ubuntu-full-${GDAL_VERSION}

LABEL maintainer="Youssef Harby <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion config.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
global_variables:
release: "2024-02-15-alpha.0"
release: "2024-04-16-beta.0"
s3_region: "us-west-2"
default_theme: "places"
default_type: "*"
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ services:
args:
- OS_ARCH=linux
- PLATFORM_ARCH=aarch64 # amd64, i386, rpi, aarch64
- GDAL_VERSION=3.7.2
- GDAL_VERSION=3.8.5
image: ghcr.io/youssef-harby/overturemapsdownloader:latest
command: ["jupyter", "notebook", "--port=8888", "--no-browser", "--ip=0.0.0.0", "--NotebookApp.token=''", "--allow-root"]
restart: unless-stopped
Expand Down
80 changes: 51 additions & 29 deletions overturemapsdownloader/dask_qrys.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,31 @@
import logging

import dask

# dask documents that "dataframe.query-planning" has to be set before
# dask.dataframe is imported (dask_geopandas imports it too), so the option is
# configured here, ahead of those imports, instead of after them.
dask.config.set({"dataframe.query-planning": False})

import dask.dataframe as dd  # noqa: E402
import dask_geopandas as dgpd  # noqa: E402
import geopandas as gpd  # noqa: E402
from dask.diagnostics import ProgressBar  # noqa: E402
from dask.distributed import Client, LocalCluster  # noqa: E402,F401
from shapely.geometry import Polygon, box  # noqa: E402

from overturemapsdownloader.utils_helper import read_geospatial_data  # noqa: E402

# Register a global progress bar so dask computations report their progress.
ProgressBar().register()

# Optional distributed scheduler (currently disabled):
# cluster = LocalCluster()
# client = Client(cluster)


def compute_dataframe(df):
    """Materialize a lazy DataFrame by calling its ``compute()`` method.

    Returns:
        The computed result, or ``None`` when ``compute()`` raises. The
        failure is reported through ``logging.error`` instead of being
        propagated to the caller.
    """
    try:
        computed = df.compute()
    except Exception as e:
        logging.error(f"Error computing DataFrame: {str(e)}")
        return None
    else:
        return computed


def get_df_from_parquet(
parquet_path,
engine="pyarrow",
# columns=["geometry"], # comment to get all columns by default
storage_options={"anon": True},
parquet_file_extensions=False,
):
Expand All @@ -22,7 +36,6 @@ def get_df_from_parquet(
logging.info(f"Reading Parquet file from {parquet_path}")
df = dd.read_parquet(
parquet_path,
# columns=columns, # comment to get all columns by default
engine=engine,
index="id",
dtype_backend=engine,
Expand All @@ -35,48 +48,57 @@ def get_df_from_parquet(
return None


def make_gdf_from_df(df, crs=4326):
    """
    Build a Dask GeoDataFrame from a Dask DataFrame.

    The ``geometry`` column is decoded partition by partition with
    ``GeoSeries.from_wkb``, tagged with ``crs``, and passed as the geometry
    of the resulting frame.
    """
    wkb_column = df["geometry"]
    decoded = wkb_column.map_partitions(
        gpd.GeoSeries.from_wkb, meta=gpd.GeoSeries(name="geometry")
    )
    geometry = decoded.set_crs(crs)
    return dgpd.from_dask_dataframe(df, geometry=geometry)
def make_gdf_from_df(df, crs="EPSG:4326"):
    """Convert a Dask DataFrame with a WKB ``geometry`` column to a Dask GeoDataFrame.

    Args:
        df: Dask DataFrame. It must contain a ``geometry`` column that
            ``geopandas.GeoSeries.from_wkb`` can decode. The caller's frame
            is not modified.
        crs: Coordinate reference system assigned to the result.

    Returns:
        The Dask GeoDataFrame, or ``None`` when the ``geometry`` column is
        missing or the conversion raises. In both cases the problem is
        reported through ``logging.error``.
    """
    try:
        # Guard clause instead of raising an exception only to catch it below.
        if "geometry" not in df.columns:
            logging.error(
                "Failed to convert DataFrame to GeoDataFrame: "
                "Geometry column missing in DataFrame"
            )
            return None

        # Decode the WKB payload partition by partition.
        geometry = df["geometry"].map_partitions(
            gpd.GeoSeries.from_wkb, meta=gpd.GeoSeries()
        )

        # assign() returns a new frame, so the caller's DataFrame keeps its
        # original (undecoded) geometry column.
        decoded_df = df.assign(geometry=geometry)

        gdf = dgpd.from_dask_dataframe(decoded_df, geometry="geometry")
        gdf.crs = crs

        logging.info("Conversion successful, GeoDataFrame created.")
        return gdf
    except Exception as e:
        logging.error(f"Failed to convert DataFrame to GeoDataFrame: {str(e)}")
        return None


def get_clipped_gdf(gdf, bbox_filter):
    """
    Clips the GeoDataFrame using a bounding box.

    Args:
        gdf: Dask GeoDataFrame. It is computed into memory before the
            filter is applied.
        bbox_filter: The clip area, given as one of:
            * a ``(minx, miny, maxx, maxy)`` tuple or list,
            * a GeoSeries, whose first element is used,
            * a single shapely geometry (e.g. a Polygon).

    Returns:
        A GeoDataFrame with the rows whose geometry lies within the clip
        geometry.
    """
    # Normalize every accepted input to ONE shapely geometry. Previously a
    # tuple was turned into a Polygon and then indexed with ``.iloc``, which
    # a Polygon does not have.
    if isinstance(bbox_filter, (tuple, list)):
        clip_geometry = box(*bbox_filter)
    elif isinstance(bbox_filter, gpd.GeoSeries):
        clip_geometry = bbox_filter.iloc[0]
    else:
        clip_geometry = bbox_filter

    # NOTE(review): this materializes the whole frame before filtering;
    # confirm the input is small enough to fit in memory.
    local_gdf = gdf.compute()

    return local_gdf[local_gdf.geometry.within(clip_geometry)]


if __name__ == "__main__":
    # Manual smoke test: read Overture "places" from S3, convert the result
    # to a GeoDataFrame, clip it to an example bounding box and print the head.
    logging.basicConfig(level=logging.INFO)

    # TODO: Handle columns with official schemas
    schema_yaml_path = "overturemapsdownloader/schemas/schema/places/place.yaml"

    # The bbox used to be read from examples/bbox.geojson and was then
    # overwritten right away; only the effective value is kept.
    bbox_filter = (31.429, 29.998, 31.531, 30.102)  # Example bbox coordinates

    df = get_df_from_parquet(
        parquet_path="s3://overturemaps-us-west-2/release/2023-07-26-alpha.0/theme=places/type=*/*",
        # columns=get_columns_from_om_schema_yaml(schema_yaml_path),
    )

    if df is not None:
        gdf = make_gdf_from_df(df)

        # make_gdf_from_df may return None after logging its own error;
        # do not attempt to clip in that case.
        if gdf is not None:
            # TODO: Add filter by country (also in config)
            clipped_gdf = get_clipped_gdf(gdf, bbox_filter)

            print(clipped_gdf.head())
    else:
        logging.error("Could not read the DataFrame from the Parquet file.")

    # TODO: Write to file; Parquet by default. Allow user to convert to other formats (e.g., via ogr2ogr).
2 changes: 1 addition & 1 deletion overturemapsdownloader/schemas
Submodule schemas updated 298 files
Loading
Loading