Commit

Merge pull request #91 from ocefpaf/ruff_all
Enable all lints in ruff
ocefpaf authored Mar 6, 2024
2 parents 399041c + c47db32 commit 38498f2
Showing 10 changed files with 212 additions and 134 deletions.
28 changes: 20 additions & 8 deletions .pre-commit-config.yaml
@@ -13,12 +13,6 @@ repos:
- id: file-contents-sorter
files: requirements-dev.txt

- repo: https://github.com/psf/black
rev: 24.1.1
hooks:
- id: black
language_version: python3

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.8.0
hooks:
@@ -33,7 +27,7 @@ repos:
- id: blackdoc

- repo: https://github.com/econchick/interrogate
rev: 1.5.0
rev: 237be78f9c6135fc1a620d211cdfdc5d3885082b
hooks:
- id: interrogate
exclude: ^(docs|tests)
@@ -56,9 +50,27 @@
- id: add-trailing-comma

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.2.0
rev: v0.3.0
hooks:
- id: ruff
args: ["--fix", "--show-fixes"]
- id: ruff-format

- repo: https://github.com/nbQA-dev/nbQA
rev: 1.7.1
hooks:
- id: nbqa-check-ast
- id: nbqa-black
- id: nbqa-ruff
args: [
--fix,
--config=ruff.toml,
]

- repo: https://github.com/bdice/nb-strip-paths
rev: v0.1.0
hooks:
- id: nb-strip-paths

- repo: https://github.com/tox-dev/pyproject-fmt
rev: 1.7.0
3 changes: 2 additions & 1 deletion MANIFEST.in
@@ -11,8 +11,9 @@ prune *.egg-info

global-exclude *.nc

exclude *.yml
exclude *.enc
exclude *.yml
exclude .gitignore
exclude .isort.cfg
exclude gliderpy/_version.py
exclude ruff.toml
10 changes: 5 additions & 5 deletions docs/source/conf.py
@@ -49,9 +49,9 @@
# The master toctree document.
master_doc = "index"

from datetime import date
import datetime

year = date.today().year
year = datetime.datetime.now(tz=datetime.timezone.utc).date().year

# General information about the project.
project = "gliderpy"
@@ -62,11 +62,11 @@
# |version| and |release|, also used in various other places throughout the
# built documents.
#
from gliderpy import __version__ as VERSION # noqa
from gliderpy import __version__

version = VERSION
version = __version__
# The full version, including alpha/beta/rc tags.
release = VERSION
release = __version__

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
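
A note on the conf.py change above: the naive datetime.date.today() call was replaced with a timezone-aware lookup, presumably to satisfy ruff's flake8-datetimez (DTZ) rules now that all lints are enabled. A minimal standalone sketch of the two forms (outside Sphinx, variable names illustrative):

    import datetime

    # Naive call flagged by ruff's DTZ rules: no timezone is attached.
    naive_year = datetime.date.today().year

    # Timezone-aware equivalent used in the new conf.py: pin the clock to UTC,
    # then take the calendar date and its year.
    aware_year = datetime.datetime.now(tz=datetime.timezone.utc).date().year

    # The two usually agree; they can differ close to midnight UTC.
    print(naive_year, aware_year)
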
168 changes: 100 additions & 68 deletions gliderpy/fetchers.py
@@ -1,11 +1,9 @@
"""
Helper methods to fetch glider data from multiple ERDDAP serves
"""
"""Helper methods to fetch glider data from multiple ERDDAP serves."""

import datetime
import functools
from copy import copy
from typing import Optional
from numbers import Number

import httpx
import pandas as pd
@@ -17,96 +15,110 @@
server_vars,
)

OptionalStr = Optional[str]
OptionalBool = bool | None
OptionalDict = dict | None
OptionalList = list[str] | tuple[str] | None
OptionalStr = str | None
OptionalNum = Number | None
# Should we add more or datetime.datetime catches all?
OptionalDateTime = datetime.datetime | str

# Defaults to the IOOS glider DAC.
_server = "https://gliders.ioos.us/erddap"


@functools.lru_cache(maxsize=128)
def _to_pandas_multiple(glider_grab):
"""Thin wrapper to cache the results when multiple datasets are requested."""
def _to_pandas_multiple(glider_grab: "GliderDataFetcher") -> pd.DataFrame:
"""Thin wrapper to cache results when multiple datasets are requested."""
df_all = {}
glider_grab_copy = copy(glider_grab)
for dataset_id in glider_grab_copy.datasets["Dataset ID"]:
glider_grab_copy.fetcher.dataset_id = dataset_id
df = glider_grab_copy.fetcher.to_pandas()
glider_df = glider_grab_copy.fetcher.to_pandas()
dataset_url = glider_grab_copy.fetcher.get_download_url().split("?")[0]
df = standardise_df(df, dataset_url)
df_all.update({dataset_id: df})
glider_df = standardise_df(glider_df, dataset_url)
df_all.update({dataset_id: glider_df})
return df_all


def standardise_df(df, dataset_url):
"""
Standardise variable names in a dataset and add column for url
"""
df.columns = df.columns.str.lower()
df = df.set_index("time (utc)")
df = df.rename(columns=server_parameter_rename)
df.index = pd.to_datetime(df.index)
# We need to sort b/c of the non-sequential submission of files due to the nature of glider data transmission.
df = df.sort_index()
df["dataset_url"] = dataset_url
return df
def standardise_df(glider_df: pd.DataFrame, dataset_url: str) -> pd.DataFrame:
"""Standardise variable names in a dataset and add column for URL."""
glider_df.columns = glider_df.columns.str.lower()
glider_df = glider_df.set_index("time (utc)")
glider_df = glider_df.rename(columns=server_parameter_rename)
glider_df.index = pd.to_datetime(glider_df.index)
# We need to sort b/c of the non-sequential submission of files due to
# the nature of glider data transmission.
glider_df = glider_df.sort_index()
glider_df["dataset_url"] = dataset_url
return glider_df


class GliderDataFetcher:
"""
"""Instantiate the glider fetcher.
Args:
----
server: A glider ERDDAP server URL.
Attributes:
----------
dataset_id: A dataset unique id.
constraints: Download constraints, defaults same as query.
"""

def __init__(self, server=_server):
def __init__(
self: "GliderDataFetcher",
server: OptionalStr = _server,
) -> None:
"""Instantiate main class attributes."""
self.server = server
self.fetcher = ERDDAP(
server=server,
protocol="tabledap",
)
self.fetcher.variables = server_vars[server]
self.fetcher.dataset_id: OptionalStr = None
self.datasets: Optional = None
self.datasets: OptionalBool = None

def to_pandas(self):
"""
Fetches data from the server and reads into a pandas dataframe
def to_pandas(self: "GliderDataFetcher") -> pd.DataFrame:
"""Return data from the server as a pandas dataframe.
:return: pandas dataframe with datetime UTC as index, multiple dataset_ids dataframes are stored in a dictionary
:return: pandas a dataframe with datetime UTC as index,
multiple dataset_ids dataframes are stored in a dictionary
"""
if self.fetcher.dataset_id:
df = self.fetcher.to_pandas()
glider_df = self.fetcher.to_pandas()
elif not self.fetcher.dataset_id and self.datasets is not None:
df_all = _to_pandas_multiple(self)
# We need to reset to avoid fetching a single dataset_id when making multiple requests.
glider_df = _to_pandas_multiple(self)
# We need to reset to avoid fetching a single dataset_id when
# making multiple requests.
self.fetcher.dataset_id = None
return df_all
return glider_df
else:
raise ValueError(
f"Must provide a {self.fetcher.dataset_id} or `query` terms to download data.",
msg = (
f"Must provide a {self.fetcher.dataset_id} or "
"`query` terms to download data."
)
raise ValueError(msg)

# Standardize variable names for the single dataset_id.
dataset_url = self.fetcher.get_download_url().split("?")[0]
df = standardise_df(df, dataset_url)
return df

def query(
self,
min_lat=None,
max_lat=None,
min_lon=None,
max_lon=None,
min_time=None,
max_time=None,
delayed=False,
):
"""
Takes user supplied geographical and time constraints and adds them to the query
return standardise_df(glider_df, dataset_url)

def query( # noqa: PLR0913
self: "GliderDataFetcher",
*,
min_lat: OptionalNum = None,
max_lat: OptionalNum = None,
min_lon: OptionalNum = None,
max_lon: OptionalNum = None,
min_time: OptionalDateTime = None,
max_time: OptionalDateTime = None,
delayed: OptionalBool = False,
) -> pd.DataFrame:
"""Add user supplied geographical and time constraints to the query.
:param min_lat: southernmost lat
:param max_lat: northermost lat
@@ -116,7 +128,8 @@ def query(
:param max_time: end time, can be datetime object or string
:return: search query with argument constraints applied
"""
# FIXME: The time constrain could be better implemented by just dropping it instead.
# NB: The time constrain could be better implemented by just
# dropping it instead.
min_time = min_time if min_time else "1970-01-01"
max_time = max_time if max_time else "2038-01-19"
min_lat = min_lat if min_lat else -90.0
@@ -147,45 +160,64 @@ def query(
try:
data = urlopen(url)
except httpx.HTTPError as err:
raise Exception(
f"Error, no datasets found in supplied range. Try relaxing your constraints: {self.fetcher.constraints}",
) from err
return None
df = pd.read_csv(data)[["Title", "Institution", "Dataset ID"]]
msg = (
"Error, no datasets found in supplied range. "
f"Try relaxing the constraints: {self.fetcher.constraints}"
)
err.message = f"{err.message}\n{msg}"
raise

cols = ["Title", "Institution", "Dataset ID"]
datasets = pd.read_csv(data)[cols]
if not delayed:
df = df.loc[~df["Dataset ID"].str.endswith("delayed")]
datasets = datasets.loc[
~datasets["Dataset ID"].str.endswith("delayed")
]
info_urls = [
self.fetcher.get_info_url(dataset_id=dataset_id, response="html")
for dataset_id in df["Dataset ID"]
self.fetcher.get_info_url(
dataset_id=dataset_id,
response="html",
)
for dataset_id in datasets["Dataset ID"]
]
df["info_url"] = info_urls
self.datasets = df
datasets["info_url"] = info_urls
self.datasets = datasets
return self.datasets


class DatasetList:
"""Build a glider dataset ids list.
Attributes:
Attributes
----------
e: an ERDDAP server instance
TODO: search_terms: A list of terms to search the server for. Multiple terms will be combined as "AND."
TODO -> search_terms: A list of terms to search the server for.
Multiple terms will be combined as "AND."
"""

def __init__(self, server=_server):
def __init__(self: "DatasetList", server: OptionalStr = _server) -> None:
"""Instantiate main class attributes.
Attributes
----------
server: the server URL.
protocol: ERDDAP's protocol (tabledap/griddap)
"""
self.e = ERDDAP(
server=server,
protocol="tabledap",
)

def get_ids(self):
def get_ids(self: "DatasetList") -> list:
"""Return the allDatasets list for the glider server."""
if self.e.server == "https://gliders.ioos.us/erddap":
self.e.dataset_id = "allDatasets"
dataset_ids = self.e.to_pandas()["datasetID"].to_list()
dataset_ids.remove("allDatasets")
self.dataset_ids = dataset_ids
return self.dataset_ids
else:
raise ValueError(f"The {self.e.server} does not supported this operation.")
msg = f"The {self.e.server} does not supported this operation."
raise ValueError(msg)
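
For context on how the refactored fetcher is used after this change, here is a hedged usage sketch based only on the signatures visible in the diff above; the coordinates and dates are illustrative, and the calls require network access to the default IOOS ERDDAP server:

    from gliderpy.fetchers import DatasetList, GliderDataFetcher

    glider_grab = GliderDataFetcher()  # defaults to https://gliders.ioos.us/erddap

    # query() arguments are now keyword-only (note the bare `*` in the new
    # signature), so any positional calls must be updated.
    datasets = glider_grab.query(
        min_lat=10,
        max_lat=40,
        min_lon=-90,
        max_lon=-50,
        min_time="2016-07-10",
        max_time="2017-02-10",
    )
    print(datasets[["Title", "Dataset ID"]])

    # With no single dataset_id set, to_pandas() returns a dict of dataframes
    # keyed by dataset_id, cached via the lru_cache'd _to_pandas_multiple helper.
    dfs = glider_grab.to_pandas()

    # List every dataset id available on the default server.
    ids = DatasetList().get_ids()
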
(diffs for the remaining changed files are not shown here)
