Skip to content

Commit

Permalink
Allow user to select metadata to save
Browse files Browse the repository at this point in the history
  • Loading branch information
rwood-97 committed Nov 20, 2023
1 parent 740314d commit 369677e
Showing 1 changed file with 108 additions and 33 deletions.
141 changes: 108 additions & 33 deletions mapreader/download/sheet_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,10 @@ def extract_published_dates(
----------
date_col : str or list, optional
A key or list of keys which map to the metadata field containing the publication date.
Multilayer keys should be passed as a list. e.g.:
- "key1" will extract ``self.features[i]["key1"]``
- ["key1","key2"] will search for ``self.features[i]["key1"]["key2"]
- ["key1","key2"] will search for ``self.features[i]["key1"]["key2"]``
If None, ["properties"]["WFS_TITLE"] will be used as keys. Date will then be extracted by regex searching for "Published: XXX".
By default None.
Expand Down Expand Up @@ -449,7 +450,7 @@ def query_map_sheets_by_string(
A key or list of keys used to get the metadata field to search in.
Key(s) will be passed to each features dictionary.
i.e. ["key1","key2"] will search for ``self.features[i]["key1"]["key2"].
Multilayer keys should be passed as a list. e.g. ["key1","key2"] will search for ``self.features[i]["key1"]["key2"]``.
If ``None``, will search in all metadata fields. By default ``None``.
append : bool, optional
Expand Down Expand Up @@ -586,7 +587,8 @@ def _save_metadata(
self,
feature: dict,
out_filepath: str,
**kwargs,
metadata_to_save: dict | None = None,
**kwargs: dict | None,
) -> None:
"""
Creates list of selected metadata items and saves to a csv file.
Expand All @@ -598,43 +600,75 @@ def _save_metadata(
The feature from which to extract the metadata.
out_filepath : str
The path to save metadata csv.
kwargs: keyword arguments to pass to the ``extract_published_dates()`` method.
metadata_to_save : dict, optional
A dictionary containing column names (str) and metadata keys (str or list) to save to metadata csv.
Multilayer keys should be passed as a list, i.e. ["key1","key2"] will search for ``self.features[i]["key1"]["key2"]``.
e.g. ``{"county": ["properties", "COUNTY"], "id": "id"}``
**kwargs : dict, optional
Keyword arguments to pass to the ``extract_published_dates()`` method.
Returns
-------
None
The selected metadata is saved to the csv file at ``out_filepath``; nothing is returned.
"""
map_name = str("map_" + feature["properties"]["IMAGE"] + ".png")
map_url = str(feature["properties"]["IMAGEURL"])
Notes
-----
Default metadata items are: name, url, coordinates, crs, published_date, grid_bb.
Additional items can be added using ``metadata_to_save``.
"""
metadata_cols = [
"name",
"url",
"coordinates",
"crs",
"published_date",
"grid_bb",
]
metadata_dict = {col: None for col in metadata_cols}

# get default metadata
metadata_dict["name"] = str("map_" + feature["properties"]["IMAGE"] + ".png")
metadata_dict["url"] = str(feature["properties"]["IMAGEURL"])
if not self.published_dates:
self.extract_published_dates(**kwargs)

published_date = feature["properties"]["published_date"]

grid_bb = feature["grid_bb"]
date_col = kwargs.get("date_col", None)
self.extract_published_dates(date_col=date_col)
metadata_dict["published_date"] = feature["properties"]["published_date"]
metadata_dict["grid_bb"] = feature["grid_bb"]
polygon = get_polygon_from_grid_bb(
metadata_dict["grid_bb"]
) # use grid_bb to get coords of actually downloaded tiles
metadata_dict["coordinates"] = polygon.bounds
metadata_dict["crs"] = self.crs

if metadata_to_save:
for col, metadata_key in metadata_to_save.items():
if isinstance(metadata_key, str):
metadata_key = [metadata_key]
if not isinstance(metadata_key, list):
raise ValueError(
"[ERROR] Please pass ``metadata_to_save`` metadata key(s) as a string or list of strings."
)

# use grid_bb to get coords of actually downloaded tiles
polygon = get_polygon_from_grid_bb(grid_bb)
coords = polygon.bounds
try:
metadatum = reduce(lambda d, key: d[key], metadata_key, feature)
except KeyError as err:
raise KeyError(
f"[ERROR] {metadata_key} not found in features dictionary."
) from err

crs = self.crs
metadata_dict[col] = metadatum

metadata_to_save = [map_name, map_url, coords, crs, published_date, grid_bb]
new_metadata_df = pd.DataFrame(
metadata_to_save,
index=["name", "url", "coordinates", "crs", "published_date", "grid_bb"],
).T
new_metadata_df = pd.DataFrame.from_dict(metadata_dict, orient="index").T

if os.path.exists(out_filepath):
existing_metadata_df = pd.read_csv(out_filepath, sep=",", index_col=0)
metadata_df = pd.concat(
[existing_metadata_df, new_metadata_df], ignore_index=True
)
metadata_df = metadata_df.loc[
metadata_df.astype(str).drop_duplicates().index
metadata_df.astype(str).drop_duplicates(subset=metadata_cols).index
] # https://stackoverflow.com/questions/43855462/pandas-drop-duplicates-method-not-working-on-dataframe-containing-lists
else:
metadata_df = new_metadata_df
Expand All @@ -647,6 +681,7 @@ def _download_map_sheets(
path_save: str | None = "maps",
metadata_fname: str | None = "metadata.csv",
overwrite: bool | None = False,
**kwargs: dict | None,
):
"""Download map sheets from a list of features.
Expand All @@ -660,6 +695,8 @@ def _download_map_sheets(
Name to use for metadata file, by default "metadata.csv"
overwrite : bool, optional
Whether to overwrite existing maps, by default ``False``.
**kwargs : dict, optional
Keyword arguments to pass to the ``_save_metadata()`` method.
"""

for feature in tqdm(features):
Expand All @@ -669,13 +706,16 @@ def _download_map_sheets(
success = self._download_map(feature)
if success:
metadata_path = f"{path_save}/{metadata_fname}"
self._save_metadata(feature, metadata_path)
self._save_metadata(
feature=feature, out_filepath=metadata_path, **kwargs
)

def download_all_map_sheets(
self,
path_save: str | None = "maps",
metadata_fname: str | None = "metadata.csv",
overwrite: bool | None = False,
**kwargs: dict | None,
) -> None:
"""
Downloads all map sheets in metadata.
Expand All @@ -688,6 +728,8 @@ def download_all_map_sheets(
Name to use for metadata file, by default "metadata.csv"
overwrite : bool, optional
Whether to overwrite existing maps, by default ``False``.
**kwargs : dict, optional
Keyword arguments to pass to the ``_download_map_sheets()`` method.
"""
if not self.grid_bbs:
raise ValueError("[ERROR] Please first run ``get_grid_bb()``")
Expand All @@ -696,14 +738,17 @@ def download_all_map_sheets(
self._initialise_merger(path_save)

features = self.features
self._download_map_sheets(features, path_save, metadata_fname, overwrite)
self._download_map_sheets(
features, path_save, metadata_fname, overwrite, **kwargs
)

def download_map_sheets_by_wfs_ids(
self,
wfs_ids: list | int,
path_save: str | None = "maps",
metadata_fname: str | None = "metadata.csv",
overwrite: bool | None = False,
**kwargs: dict | None,
) -> None:
"""
Downloads map sheets by WFS ID numbers.
Expand All @@ -718,6 +763,8 @@ def download_map_sheets_by_wfs_ids(
Name to use for metadata file, by default "metadata.csv"
overwrite : bool, optional
Whether to overwrite existing maps, by default ``False``.
**kwargs : dict, optional
Keyword arguments to pass to the ``_download_map_sheets()`` method.
"""

if not self.wfs_id_nos:
Expand Down Expand Up @@ -746,7 +793,9 @@ def download_map_sheets_by_wfs_ids(
if wfs_id_no in requested_maps:
features.append(feature)

self._download_map_sheets(features, path_save, metadata_fname, overwrite)
self._download_map_sheets(
features, path_save, metadata_fname, overwrite, **kwargs
)

def download_map_sheets_by_polygon(
self,
Expand All @@ -755,6 +804,7 @@ def download_map_sheets_by_polygon(
metadata_fname: str | None = "metadata.csv",
mode: str | None = "within",
overwrite: bool | None = False,
**kwargs: dict | None,
) -> None:
"""
Downloads any map sheets which are found within or intersecting with a defined polygon.
Expand All @@ -774,6 +824,8 @@ def download_map_sheets_by_polygon(
By default "within".
overwrite : bool, optional
Whether to overwrite existing maps, by default ``False``.
**kwargs : dict, optional
Keyword arguments to pass to the ``_download_map_sheets()`` method.
Notes
-----
Expand Down Expand Up @@ -816,14 +868,17 @@ def download_map_sheets_by_polygon(
if map_polygon.intersects(polygon):
features.append(feature)

self._download_map_sheets(features, path_save, metadata_fname, overwrite)
self._download_map_sheets(
features, path_save, metadata_fname, overwrite, **kwargs
)

def download_map_sheets_by_coordinates(
self,
coords: tuple,
path_save: str | None = "maps",
metadata_fname: str | None = "metadata.csv",
overwrite: bool | None = False,
**kwargs: dict | None,
) -> None:
"""
Downloads any map sheets which contain a defined set of coordinates.
Expand All @@ -839,6 +894,8 @@ def download_map_sheets_by_coordinates(
Name to use for metadata file, by default "metadata.csv"
overwrite : bool, optional
Whether to overwrite existing maps, by default ``False``.
**kwargs : dict, optional
Keyword arguments to pass to the ``_download_map_sheets()`` method.
"""

if not isinstance(coords, tuple):
Expand All @@ -864,14 +921,17 @@ def download_map_sheets_by_coordinates(
if map_polygon.contains(coords):
features.append(feature)

self._download_map_sheets(features, path_save, metadata_fname, overwrite)
self._download_map_sheets(
features, path_save, metadata_fname, overwrite, **kwargs
)

def download_map_sheets_by_line(
self,
line: LineString,
path_save: str | None = "maps",
metadata_fname: str | None = "metadata.csv",
overwrite: bool | None = False,
**kwargs: dict | None,
) -> None:
"""
Downloads any map sheets which intersect with a line.
Expand All @@ -886,6 +946,8 @@ def download_map_sheets_by_line(
Name to use for metadata file, by default "metadata.csv"
overwrite : bool, optional
Whether to overwrite existing maps, by default ``False``
**kwargs : dict, optional
Keyword arguments to pass to the ``_download_map_sheets()`` method.
Notes
-----
Expand Down Expand Up @@ -917,7 +979,9 @@ def download_map_sheets_by_line(
if map_polygon.intersects(line):
features.append(feature)

self._download_map_sheets(features, path_save, metadata_fname, overwrite)
self._download_map_sheets(
features, path_save, metadata_fname, overwrite, **kwargs
)

def download_map_sheets_by_string(
self,
Expand All @@ -926,6 +990,7 @@ def download_map_sheets_by_string(
path_save: str | None = "maps",
metadata_fname: str | None = "metadata.csv",
overwrite: bool | None = False,
**kwargs: dict | None,
) -> None:
"""
Download map sheets by searching for a string in a chosen metadata field.
Expand All @@ -938,8 +1003,8 @@ def download_map_sheets_by_string(
keys : str or list, optional
A key or list of keys used to get the metadata field to search in.
Key(s) will be passed to each features dictionary. \
i.e. ["key1","key2"] will search for ``self.features[i]["key1"]["key2"]
Key(s) will be passed to each features dictionary.
Multilayer keys should be passed as a list. e.g. ["key1","key2"] will search for ``self.features[i]["key1"]["key2"]``.
If ``None``, will search in all metadata fields. By default ``None``.
path_save : str, optional
Expand All @@ -948,6 +1013,8 @@ def download_map_sheets_by_string(
Name to use for metadata file, by default "metadata.csv"
overwrite : bool, optional
Whether to overwrite existing maps, by default ``False``.
**kwargs : dict, optional
Keyword arguments to pass to the ``_download_map_sheets()`` method.
Notes
-----
Expand Down Expand Up @@ -979,13 +1046,16 @@ def download_map_sheets_by_string(
if match:
features.append(feature)

self._download_map_sheets(features, path_save, metadata_fname, overwrite)
self._download_map_sheets(
features, path_save, metadata_fname, overwrite, **kwargs
)

def download_map_sheets_by_queries(
self,
path_save: str | None = "maps",
metadata_fname: str | None = "metadata.csv",
overwrite: bool | None = False,
**kwargs: dict | None,
) -> None:
"""
Downloads map sheets saved as query results.
Expand All @@ -998,6 +1068,8 @@ def download_map_sheets_by_queries(
Name to use for metadata file, by default "metadata.csv"
overwrite : bool, optional
Whether to overwrite existing maps, by default ``False``.
**kwargs : dict, optional
Keyword arguments to pass to the ``_download_map_sheets()`` method.
"""
if not self.grid_bbs:
raise ValueError("[ERROR] Please first run ``get_grid_bb()``")
Expand All @@ -1009,15 +1081,18 @@ def download_map_sheets_by_queries(
raise ValueError("[ERROR] No query results found/saved.")

features = self.found_queries
self._download_map_sheets(features, path_save, metadata_fname, overwrite)
self._download_map_sheets(
features, path_save, metadata_fname, overwrite, **kwargs
)

def hist_published_dates(self, **kwargs) -> None:
"""
Plots a histogram of the publication dates of maps in metadata.
Parameters
----------
kwargs : A dictionary containing keyword arguments to pass to plotting function.
**kwargs : dict, optional
A dictionary containing keyword arguments to pass to plotting function.
See matplotlib.pyplot.hist() for acceptable values.
e.g. ``**dict(fc='c', ec='k')``
Expand Down

0 comments on commit 369677e

Please sign in to comment.