From 05dfcb4f1aa554d9480c2e2a93107e165d01ad5b Mon Sep 17 00:00:00 2001 From: Martijn Visser Date: Thu, 17 Oct 2024 11:06:36 +0200 Subject: [PATCH] Make AGV preprocessing reproducible (#165) Instead of local paths, this uses the CloudStorage class from the `ribasim_nl` package to load AGV data, such that this code can also run on other machines. I uncommented the first processing part since that was needed for the second part to run. This also adds an upload of the processed data to the cloud to `AmstelGooienVecht/verwerkt/preprocessed.gpkg`, so the result of this step is directly accessible as well. It should not change any results, but it's good to go over this with @rbruijnshkv tomorrow. --- .../preprocess_data/AmstelGooienVecht.ipynb | 186 +++++++++--------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/src/peilbeheerst_model/peilbeheerst_model/preprocess_data/AmstelGooienVecht.ipynb b/src/peilbeheerst_model/peilbeheerst_model/preprocess_data/AmstelGooienVecht.ipynb index ef0bc05..8c71596 100644 --- a/src/peilbeheerst_model/peilbeheerst_model/preprocess_data/AmstelGooienVecht.ipynb +++ b/src/peilbeheerst_model/peilbeheerst_model/preprocess_data/AmstelGooienVecht.ipynb @@ -7,12 +7,12 @@ "metadata": {}, "outputs": [], "source": [ - "# import packages and functions\n", - "import os\n", - "\n", "import geopandas as gpd\n", "import numpy as np\n", - "import pandas as pd" + "import pandas as pd\n", + "from general_functions import show_layers_and_columns, store_data\n", + "from ribasim_nl import CloudStorage\n", + "from shapely import wkt" ] }, { @@ -22,19 +22,19 @@ "metadata": {}, "outputs": [], "source": [ - "from general_functions import *\n", - "\n", "%load_ext autoreload\n", "%autoreload 2\n", "pd.set_option(\"display.max_columns\", None)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "id": "2", "metadata": {}, + "outputs": [], "source": [ - "# Amstel, Gooi en Vecht" + "cloud = CloudStorage()" ] }, { @@
-44,11 +44,8 @@ "metadata": {}, "outputs": [], "source": [ - "# define relative paths\n", "waterschap = \"AVG\"\n", - "path_AVG = \"..\\..\\Data_preprocessed\\Waterschappen\\AmstelGooienVecht\"\n", - "output_gpkg_path = \"../../Data_postprocessed/Waterschappen/AmstelGooienVecht\"\n", - "DM_path = \"..\\..\\Data_overig\\DM_Netwerk\\DM_Netwerk.shp\"" + "waterschap_long = \"AmstelGooienVecht\"" ] }, { @@ -58,58 +55,95 @@ "metadata": {}, "outputs": [], "source": [ - "# #AVG has delivered all data in CSV format. Load it in manually with some data mutations\n", - "# AVG = {}\n", - "# variables = ['stuw', 'gemaal', 'afsluitmiddel', 'duikersifonhevel', 'hydroobject']#, 'peilgebiedpraktijk', 'peilafwijkinggebied']\n", - "# for variable in variables:\n", - "# path_variable = os.path.join(path_AVG, variable + '.csv')\n", - "# df_var = pd.read_csv(path_variable, delimiter=';')\n", - "# geom_col = df_var.keys()[-1] #retrieve the column name\n", + "cloud.download_aangeleverd(waterschap_long)" + ] + }, + { + "cell_type": "markdown", + "id": "5", + "metadata": {}, + "source": [ + "# Amstel, Gooi en Vecht" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": {}, + "outputs": [], + "source": [ + "# define paths\n", + "aangeleverd_dir = cloud.joinpath(waterschap_long, \"aangeleverd\")\n", + "verwerkt_dir = cloud.joinpath(waterschap_long, \"verwerkt\")\n", "\n", - "# if not 'geometrie' in geom_col:\n", - "# raise ValueError('No \"geometry\" string found in the last column of the dataframe. 
Check for existence')\n", + "output_gpkg_path = verwerkt_dir / \"preprocessed\"\n", "\n", - "# df_var['geometry'] = df_var[geom_col].apply(lambda x: wkt.loads(x.split(';')[-1]))\n", - "# AVG[variable] = df_var\n", + "# pyogrio needs the exclamation mark to read the file from the zip\n", + "dump_path = (\n", + " aangeleverd_dir / \"aanlevering_6maart24/data dump 6 maart LHM AGV.zip!/data dump 6 maart LHM AGV/\"\n", + ").as_posix()\n", "\n", - "# #there is one last gpkg which contains the streefpeilen (and peilgebieden)\n", - "# AVG['peilgebied'] = gpd.read_file(os.path.join(path_AVG, 'vigerende_peilgebieden.gpkg'))" + "verwerkt_dir.mkdir(parents=True, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": null, - "id": "5", + "id": "7", "metadata": {}, "outputs": [], "source": [ - "# AVG['peilgebied']['streefpeil'] = np.nan\n", - "# AVG['peilgebied']['streefpeil'] = AVG['peilgebied']['streefpeil'].fillna(value=AVG['peilgebied']['GPGZMRPL'])\n", - "# AVG['peilgebied']['streefpeil'] = AVG['peilgebied']['streefpeil'].fillna(value=AVG['peilgebied']['IWS_GPGVASTP'])\n", - "# AVG['peilgebied']['streefpeil'] = AVG['peilgebied']['streefpeil'].fillna(value=AVG['peilgebied']['IWS_GPGONDP'])\n", + "# AVG has delivered all data in CSV format. Load it in manually with some data mutations\n", + "AVG = {}\n", + "variables = [\n", + " \"stuw\",\n", + " \"gemaal\",\n", + " \"afsluitmiddel\",\n", + " \"duikersifonhevel\",\n", + " \"hydroobject\",\n", + "] # , 'peilgebiedpraktijk', 'peilafwijkinggebied']\n", + "for variable in variables:\n", + " path_variable = aangeleverd_dir / \"Eerste_levering\" / (variable + \".csv\")\n", + " df_var = pd.read_csv(path_variable, delimiter=\";\")\n", + " geom_col = df_var.keys()[-1] # retrieve the column name\n", "\n", + " if \"geometrie\" not in geom_col:\n", + " raise ValueError('No \"geometry\" string found in the last column of the dataframe. 
Check for existence')\n", "\n", - "# print('Number of missing streefpeilen = ', len(AVG['peilgebied']['streefpeil'].loc[AVG['peilgebied']['streefpeil'].isna()]))\n", + " df_var[\"geometry\"] = df_var[geom_col].apply(lambda x: wkt.loads(x.split(\";\")[-1]))\n", + " AVG[variable] = df_var\n", "\n", - "# fig, ax = plt.subplots()\n", - "# AVG['peilgebied'].geometry.plot(ax=ax, color='cornflowerblue')\n", - "# AVG['peilgebied'].loc[AVG['peilgebied']['streefpeil'].isna()].geometry.plot(ax=ax, color='red')\n", - "# ax.legend()" + "# there is one last gpkg which contains the streefpeilen (and peilgebieden)\n", + "AVG[\"peilgebied\"] = gpd.read_file(aangeleverd_dir / \"Na_levering\" / \"vigerende_peilgebieden.gpkg\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "6", + "id": "8", "metadata": {}, "outputs": [], "source": [ - "AVG = {}" + "AVG[\"peilgebied\"][\"streefpeil\"] = np.nan\n", + "AVG[\"peilgebied\"][\"streefpeil\"] = AVG[\"peilgebied\"][\"streefpeil\"].fillna(value=AVG[\"peilgebied\"][\"GPGZMRPL\"])\n", + "AVG[\"peilgebied\"][\"streefpeil\"] = AVG[\"peilgebied\"][\"streefpeil\"].fillna(value=AVG[\"peilgebied\"][\"IWS_GPGVASTP\"])\n", + "AVG[\"peilgebied\"][\"streefpeil\"] = AVG[\"peilgebied\"][\"streefpeil\"].fillna(value=AVG[\"peilgebied\"][\"IWS_GPGONDP\"])\n", + "\n", + "print(\n", + " \"Number of missing streefpeilen = \",\n", + " len(AVG[\"peilgebied\"][\"streefpeil\"].loc[AVG[\"peilgebied\"][\"streefpeil\"].isna()]),\n", + ")\n", + "\n", + "# fig, ax = plt.subplots()\n", + "# AVG['peilgebied'].geometry.plot(ax=ax, color='cornflowerblue')\n", + "# AVG['peilgebied'].loc[AVG['peilgebied']['streefpeil'].isna()].geometry.plot(ax=ax, color='red')\n", + "# ax.legend()" ] }, { "cell_type": "markdown", - "id": "7", + "id": "9", "metadata": {}, "source": [ "# Nalevering" @@ -118,37 +152,26 @@ { "cell_type": "code", "execution_count": null, - "id": "8", + "id": "10", "metadata": {}, "outputs": [], "source": [ "# overwrite previous data\n", - 
"AVG[\"stuw\"] = gpd.read_file(\n", - " r\"D:\\Users\\Bruijns\\Documents\\PR4750_20\\Data_preprocessed\\Waterschappen\\AmstelGooienVecht\\data dump 6 maart LHM AGV\\Stuw.shp\"\n", - ")\n", + "AVG[\"stuw\"] = gpd.read_file(dump_path + \"/Stuw.shp\")\n", "AVG[\"stuw\"] = AVG[\"stuw\"].loc[AVG[\"stuw\"].LHM == \"LHM\"]\n", "\n", - "AVG[\"gemaal\"] = gpd.read_file(\n", - " r\"D:\\Users\\Bruijns\\Documents\\PR4750_20\\Data_preprocessed\\Waterschappen\\AmstelGooienVecht\\data dump 6 maart LHM AGV\\Gemaal.shp\"\n", - ")\n", + "AVG[\"gemaal\"] = gpd.read_file(dump_path + \"/Gemaal.shp\")\n", "AVG[\"gemaal\"] = AVG[\"gemaal\"].loc[AVG[\"gemaal\"].LHM == \"LHM\"]\n", "\n", - "AVG[\"duikersifonhevel\"] = gpd.read_file(\n", - " r\"D:\\Users\\Bruijns\\Documents\\PR4750_20\\Data_preprocessed\\Waterschappen\\AmstelGooienVecht\\data dump 6 maart LHM AGV\\Duikersifonhevel.shp\"\n", - ")\n", - "AVG[\"hydroobject\"] = gpd.read_file(\n", - " r\"D:\\Users\\Bruijns\\Documents\\PR4750_20\\Data_preprocessed\\Waterschappen\\AmstelGooienVecht\\data dump 6 maart LHM AGV\\LHM_hydrovakken.shp\"\n", - ")\n", - "\n", - "AVG[\"peilgebied\"] = gpd.read_file(\n", - " r\"D:\\Users\\Bruijns\\Documents\\PR4750_20\\Data_preprocessed\\Waterschappen\\AmstelGooienVecht\\data dump 6 maart LHM AGV\\LHM_gebieden.shp\"\n", - ")" + "AVG[\"duikersifonhevel\"] = gpd.read_file(dump_path + \"/DuikerSifonHevel.shp\")\n", + "AVG[\"hydroobject\"] = gpd.read_file(dump_path + \"/LHM_hydrovakken.shp\")\n", + "AVG[\"peilgebied\"] = gpd.read_file(dump_path + \"/LHM_gebieden.shp\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "9", + "id": "11", "metadata": {}, "outputs": [], "source": [ @@ -160,7 +183,7 @@ { "cell_type": "code", "execution_count": null, - "id": "10", + "id": "12", "metadata": {}, "outputs": [], "source": [ @@ -170,7 +193,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11", + "id": "13", "metadata": {}, "outputs": [], "source": [ @@ -192,9 +215,7 @@ 
"AVG[\"gemaal\"].loc[AVG[\"gemaal\"].functiegemaal.str.contains(\"anvoergemaal|pmaling|an-|p-|pvoer\"), \"func_aanvoer\"] = True\n", "AVG[\"gemaal\"].loc[AVG[\"gemaal\"].functiegemaal.str.contains(\"irculatie\"), \"func_circulatie\"] = True\n", "AVG[\"gemaal\"].loc[\n", - " (AVG[\"gemaal\"].func_afvoer is False)\n", - " & (AVG[\"gemaal\"].func_aanvoer is False)\n", - " & (AVG[\"gemaal\"].func_circulatie is False),\n", + " ~AVG[\"gemaal\"].func_afvoer & ~AVG[\"gemaal\"].func_aanvoer & ~AVG[\"gemaal\"].func_circulatie,\n", " \"func_afvoer\",\n", "] = True # set to afvoergemaal is there the function is unknown" ] @@ -202,7 +223,7 @@ { "cell_type": "code", "execution_count": null, - "id": "12", + "id": "14", "metadata": {}, "outputs": [], "source": [ @@ -219,7 +240,7 @@ "AVG[\"gemaal\"] = gpd.GeoDataFrame(AVG[\"gemaal\"]).to_crs(\"epsg:28992\")\n", "\n", "# afsluitmiddel\n", - "AVG[\"afsluitmiddel\"] = AVG[\"afsluitmiddel\"][[\"code\", \"geometry\"]]\n", + "AVG[\"afsluitmiddel\"] = AVG[\"afsluitmiddel\"][[\"code\", \"geometry\"]].copy()\n", "AVG[\"afsluitmiddel\"].loc[:, \"nen3610id\"] = \"dummy_nen3610id_afsluitmiddel_\" + AVG[\"afsluitmiddel\"].index.astype(str)\n", "AVG[\"afsluitmiddel\"][\"globalid\"] = \"dummy_globalid_afsluitmiddel_\" + AVG[\"afsluitmiddel\"].index.astype(str)\n", "AVG[\"afsluitmiddel\"] = gpd.GeoDataFrame(AVG[\"afsluitmiddel\"]).set_crs(\"epsg:28992\")\n", @@ -252,7 +273,6 @@ "AVG[\"peilgebied\"][\"nen3610id\"] = \"dummy_nen3610id_peilgebied_\" + AVG[\"peilgebied\"].index.astype(str)\n", "AVG[\"peilgebied\"][\"globalid\"] = \"dummy_globalid_peilgebied_\" + AVG[\"peilgebied\"].index.astype(str)\n", "\n", - "\n", "AVG[\"peilgebied\"] = AVG[\"peilgebied\"][[\"code\", \"nen3610id\", \"globalid\", \"geometry\"]]\n", "AVG[\"peilgebied\"] = gpd.GeoDataFrame(AVG[\"peilgebied\"]).to_crs(\"epsg:28992\")" ] @@ -260,7 +280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "13", + "id": "15", "metadata": {}, "outputs": [], "source": [ @@ -270,17 
+290,9 @@ "AVG[\"hydroobject\"] = gpd.GeoDataFrame(AVG[\"hydroobject\"]).set_crs(\"epsg:28992\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "14", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", - "id": "15", + "id": "16", "metadata": {}, "source": [ "# Control, store" @@ -289,7 +301,7 @@ { "cell_type": "code", "execution_count": null, - "id": "16", + "id": "17", "metadata": {}, "outputs": [], "source": [ @@ -299,32 +311,20 @@ { "cell_type": "code", "execution_count": null, - "id": "17", + "id": "18", "metadata": {}, "outputs": [], "source": [ - "# Check if the directory exists\n", - "if not os.path.exists(output_gpkg_path):\n", - " # If it doesn't exist, create it\n", - " os.makedirs(output_gpkg_path)\n", - "\n", - "store_data(waterschap=AVG, output_gpkg_path=output_gpkg_path + \"/AGV\")" + "store_data(waterschap=AVG, output_gpkg_path=str(output_gpkg_path))\n", + "cloud.upload_verwerkt(waterschap_long)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Routing", + "display_name": "default", "language": "python", - "name": "routing" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -336,7 +336,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.7" } }, "nbformat": 4,