implements a better multiple dfs return and cache results #74

Merged 2 commits on Oct 13, 2023
36 changes: 24 additions & 12 deletions gliderpy/fetchers.py
@@ -3,6 +3,8 @@
 
 """
 
+import functools
+from copy import copy
 from typing import Optional
 
 import httpx
@@ -21,6 +23,23 @@
 _server = "https://gliders.ioos.us/erddap"
 
 
+@functools.lru_cache(maxsize=128)
+def _to_pandas_multiple(glider_grab):
+    """Thin wrapper to cache the results when multiple datasets are requested."""
+    df_all = {}
+    glider_grab_copy = copy(glider_grab)
+    for dataset_id in glider_grab_copy.datasets["Dataset ID"]:
+        glider_grab_copy.fetcher.dataset_id = dataset_id
+        df = glider_grab_copy.fetcher.to_pandas(
+            index_col="time (UTC)",
+            parse_dates=True,
+        )
+        dataset_url = glider_grab_copy.fetcher.get_download_url().split("?")[0]
+        df = standardise_df(df, dataset_url)
+        df_all.update({dataset_id: df})
+    return df_all
+
+
 def standardise_df(df, dataset_url):
     """
     Standardise variable names in a dataset and add column for url
@@ -57,25 +76,18 @@ def to_pandas(self):
         """
         Fetches data from the server and reads into a pandas dataframe
 
-        :return: pandas dataframe with datetime UTC as index
+        :return: pandas dataframe with datetime UTC as index; when multiple dataset_ids match, the dataframes are returned in a dictionary keyed by dataset_id
         """
         if self.fetcher.dataset_id:
             df = self.fetcher.to_pandas(
                 index_col="time (UTC)",
                 parse_dates=True,
             )
         elif not self.fetcher.dataset_id and self.datasets is not None:
-            df_all = []
-            for dataset_id in self.datasets["Dataset ID"]:
-                self.fetcher.dataset_id = dataset_id
-                df = self.fetcher.to_pandas(
-                    index_col="time (UTC)",
-                    parse_dates=True,
-                )
-                dataset_url = self.fetcher.get_download_url().split("?")[0]
-                df = standardise_df(df, dataset_url)
-                df_all.append(df)
-            return pd.concat(df_all)
+            df_all = _to_pandas_multiple(self)
+            # We need to reset to avoid fetching a single dataset_id when making multiple requests.
+            self.fetcher.dataset_id = None
+            return df_all
         else:
             raise ValueError(
                 f"Must provide a {self.fetcher.dataset_id} or `query` terms to download data.",
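A note on the caching pattern above: `functools.lru_cache` keys its cache on the function arguments, so passing the whole `glider_grab` object requires that object to be hashable (plain Python instances are, by identity) and means it should not be mutated while it serves as a cache key, which is why the wrapper mutates only a `copy`. Here is a minimal standalone sketch of the same pattern; the `Grabber` class and `fetch_all` function are hypothetical stand-ins, not gliderpy code:

```python
import functools
from copy import copy


class Grabber:
    """Stand-in for the real fetcher object; hashable by identity, like any plain class."""

    def __init__(self, dataset_ids):
        self.dataset_ids = dataset_ids
        self.dataset_id = None


@functools.lru_cache(maxsize=128)
def fetch_all(grabber):
    """Cache one result dict per Grabber instance; mutate only a shallow copy."""
    grabber_copy = copy(grabber)
    results = {}
    for dataset_id in grabber_copy.dataset_ids:
        grabber_copy.dataset_id = dataset_id  # the copy changes, the cache key does not
        results[dataset_id] = f"data for {dataset_id}"  # placeholder for the real fetch
    return results


g = Grabber(("ds-a", "ds-b"))
first = fetch_all(g)   # performs the "fetch"
second = fetch_all(g)  # cache hit: the very same dict comes back
assert first is second
```

Two consequences are worth noting: repeated multi-dataset calls on the same fetcher return the same cached dictionary, so callers should treat it as read-only; and because `copy` is shallow, the nested `fetcher` object is still shared with the original, which is why `to_pandas` explicitly resets `self.fetcher.dataset_id = None` after the call so the next request takes the multi-dataset branch again.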
13 changes: 11 additions & 2 deletions notebooks/00-quick_intro.ipynb
@@ -114,8 +114,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = glider_grab.to_pandas()\n",
-    "df.head()"
+    "datasets = glider_grab.to_pandas()\n",
+    "datasets.keys()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "datasets[\"ru23-20121025T1944\"].head()"
+   ]
+  },
   {
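The notebook change reflects the new return type: a multi-dataset query now yields a dictionary keyed by ERDDAP dataset ID instead of one concatenated dataframe. A usage sketch, assuming `glider_grab` was configured earlier in the notebook with a query matching several gliders, as in the quick-intro example:

```python
import pandas as pd

# `glider_grab` is assumed to be set up earlier in the notebook with a
# multi-glider query; to_pandas() then returns a dict of dataframes.
datasets = glider_grab.to_pandas()

for dataset_id, df in datasets.items():
    print(dataset_id, df.shape)  # one dataframe per deployment

# Select a single deployment by its dataset ID, as the notebook does:
df = datasets["ru23-20121025T1944"]
df.head()

# Callers that still want the old single-frame behaviour can recombine:
combined = pd.concat(datasets.values())
```

The last line reproduces what the removed `pd.concat(df_all)` used to return, since each dataframe already carries its source URL column from `standardise_df`.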