Clean up dependencies #9

Merged: 2 commits, May 7, 2024
6 changes: 3 additions & 3 deletions noxfile.py
@@ -144,7 +144,7 @@ def mypy(session: Session) -> None:
"""Type-check using mypy."""
args = session.posargs or ["src", "tests"]
session.install(".")
session.install("mypy", "pytest")
session.install("mypy", "pytest", "click")
session.run("mypy", *args)
if not session.posargs:
session.run("mypy", f"--python-executable={sys.executable}", "noxfile.py")
@@ -154,7 +154,7 @@ def mypy(session: Session) -> None:
 def tests(session: Session) -> None:
     """Run the test suite."""
     session.install(".")
-    session.install("coverage[toml]", "pytest", "pygments")
+    session.install("coverage[toml]", "pytest", "pygments", "click")
     try:
         session.run(
             "coverage",
@@ -188,7 +188,7 @@ def coverage(session: Session) -> None:
 def typeguard(session: Session) -> None:
     """Runtime type checking using Typeguard."""
     session.install(".")
-    session.install("pytest", "typeguard", "pygments")
+    session.install("pytest", "typeguard", "pygments", "click")
     session.run("pytest", f"--typeguard-packages={package}", *session.posargs)


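Each nox session runs in an isolated virtualenv, so tools the checks import must be installed into the session explicitly; with click moved out of the runtime dependencies (see pyproject.toml below), the sessions now install it themselves. A minimal sketch of the pattern, with an illustrative session name and package list rather than the project's full noxfile:

    import nox


    @nox.session(python=["3.10", "3.11", "3.12"])
    def tests(session: nox.Session) -> None:
        """Run the test suite in an isolated virtualenv."""
        session.install(".")  # the package itself, with its runtime dependencies
        session.install("pytest", "click")  # dev-only tools, no longer runtime deps
        session.run("pytest", *session.posargs)
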
820 changes: 381 additions & 439 deletions poetry.lock

Large diffs are not rendered by default.

27 changes: 15 additions & 12 deletions pyproject.toml
@@ -14,19 +14,16 @@ classifiers = ["Development Status :: 4 - Beta"]
 Changelog = "https://github.com/statisticsnorway/ssb-timeseries/releases"
 
 [tool.poetry.dependencies]
-python = ">=3.10,<3.13"
+python = ">=3.10,<4.0"
 dapla-toolbelt = ">=1.3.2"
-pandas = "^2.1.1"
-pytest = "^7.4.3"
-ssb-klass-python = "^0.0.7"
-pyarrow = "^14.0.0"
-google-cloud-logging = "^3.8.0"
-pytz = "^2023.3.post1"
-polars = "^0.19.18"
-duckdb = "^0.10.0"
-bigtree = "^0.17.0"
-click = "^8.1.7"
-typing-extensions = "^4.11.0"
+pandas = ">=2.1.1"
+ssb-klass-python = ">=0.0.7"
+pyarrow = ">=14.0.0"
+pytz = ">=2023.3.post1"
+bigtree = ">=0.17.0"
+typing-extensions = ">=4.11.0"
+python-dateutil = ">=2.9.0.post0"
+numpy = ">=1.26.4"
 
 [tool.poetry.group.dev.dependencies]
 pygments = ">=2.10.0"
@@ -46,6 +43,8 @@ sphinx-click = ">=3.0.2"
 typeguard = ">=2.13.3"
 xdoctest = { extras = ["colors"], version = ">=0.15.10" }
 myst-parser = { version = ">=0.16.1" }
+deptry = ">=0.16.1"
+click = ">=8.1.7"
 
 [tool.pytest.ini_options]
 pythonpath = ["src"]
@@ -73,6 +72,7 @@ warn_unreachable = true
 pretty = true
 show_column_numbers = true
 show_error_context = true
+disallow_any_generics = false
 
 [tool.ruff]
 force-exclude = true # Apply excludes to pre-commit
@@ -141,6 +141,9 @@ classmethod-decorators = ["classmethod", "validator", "root_validator", "pydantic.validator"]
"S101", # asserts are encouraged in pytest
]

[tool.deptry.per_rule_ignores]
DEP001 = ["ssb_timeseries", "nox", "nox_poetry"] # packages available by default

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
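
For context on the constraint changes above: Poetry's caret operator caps a dependency below its next major version, while the new bare floors accept any newer release. A small illustration using the packaging library (not one of this project's dependencies):

    from packaging.specifiers import SpecifierSet

    caret = SpecifierSet(">=2.1.1,<3.0.0")  # what pandas = "^2.1.1" expands to
    floor = SpecifierSet(">=2.1.1")         # what pandas = ">=2.1.1" allows

    print("2.2.0" in caret, "2.2.0" in floor)  # True True
    print("3.0.2" in caret, "3.0.2" in floor)  # False True: floors admit new majors
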
38 changes: 25 additions & 13 deletions src/ssb_timeseries/dataset.py
@@ -6,7 +6,7 @@
 from typing import no_type_check
 
 import numpy as np
-import pandas as pd  # type: ignore[import-untyped]
+import pandas as pd
 from typing_extensions import Self
 
 from ssb_timeseries import io
@@ -367,11 +367,14 @@ def __getitem__(
"""Access Dataset.data.columns via Dataset[ list[column_names] | pattern | tags].

Arguments:
criteria: (str | dict) Either a string pattern or a dict of tags.
criteria: Either a string pattern or a dict of tags.
kwargs: If criteria is empty, this is passed to filter().

Returns:
Self | None

Raises:
TypeError: If filter() returns another type than Dataset.
"""
# pattern: str = "", regex: str = "", tags: dict = {}):
# Dataset[...] should return a Dataset object (?) with only the requested items (columns).
@@ -384,14 +387,18 @@ def __getitem__(
         # Or, is there a trick using dataframe views?
         # --->
         if criteria and isinstance(criteria, str):
-            return self.filter(pattern=criteria)
+            result = self.filter(pattern=criteria)
         elif criteria and isinstance(criteria, dict):
-            return self.filter(tags=criteria)
+            result = self.filter(tags=criteria)
         elif kwargs:
             ts_logger.debug(f"DATASET.__getitem__(:\n\t{kwargs} ")
-            return self.filter(**kwargs)
+            result = self.filter(**kwargs)
         else:
             return None
+        if isinstance(result, Dataset):
+            return result  # type: ignore[return-value]
+        else:
+            raise TypeError("Dataset.filter() did not return a Dataset type.")
 
     def plot(self, *args: Any, **kwargs: Any) -> Any:
         """Plot dataset data.
@@ -400,7 +407,7 @@ def plot(self, *args: Any, **kwargs: Any) -> Any:
"""
xlabels = self.datetime_columns()[0]
ts_logger.debug(f"Dataset.plot({args!r}, {kwargs!r}) x-labels {xlabels}")
return self.data.plot(
return self.data.plot( # type: ignore[call-overload]
xlabels,
*args,
legend=len(self.data.columns) < 9,
@@ -452,28 +459,33 @@ def groupby(
         period_index = pd.PeriodIndex(self.data[datetime_columns[0]], freq=freq)
         ts_logger.debug(f"DATASET {self.name}: period index\n{period_index}.")
 
+        # Fix for case when **kwargs contains numeric_only
+        if "numeric_only" in kwargs:
+            kwargs.pop("numeric_only")
+        numeric_only_value = True
+
         match func:
             case "mean":
-                out = self.data.groupby(period_index).mean(
-                    *args, numeric_only=True, **kwargs
+                out = self.data.groupby(period_index).mean(  # type: ignore[misc]
+                    *args, numeric_only=numeric_only_value, **kwargs
                 )
             case "sum":
-                out = self.data.groupby(period_index).sum(
-                    *args, numeric_only=True, **kwargs
+                out = self.data.groupby(period_index).sum(  # type: ignore[misc]
+                    *args, numeric_only=numeric_only_value, **kwargs
                 )
             case "auto":
                 # TODO: QA on exact logic / use "real" metadata
                 # in particular, how to check metadata and blend df1 and df2 values as appropriate
                 # (this implementation is just to show how it can be done)
                 # QUESTION: do we need a default for "other" series / what should it be?
-                df1 = self.data.groupby(period_index).mean(
-                    *args, numeric_only=True, **kwargs
+                df1 = self.data.groupby(period_index).mean(  # type: ignore[misc]
+                    *args, numeric_only=numeric_only_value, **kwargs
                 )
                 ts_logger.debug(f"groupby\n{df1}.")
 
                 df2 = (
                     self.data.groupby(period_index)
-                    .sum(*args, numeric_only=True, **kwargs)
+                    .sum(*args, numeric_only=numeric_only_value, **kwargs)  # type: ignore[misc]
                     .filter(regex="mendgde|volum|vekt")
                 )
                 ts_logger.warning(f"groupby\n{df2}.")
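
The numeric_only handling above prevents a caller from passing numeric_only through **kwargs, which would collide with the keyword the method sets itself. A standalone sketch of the same guard on toy data (column and function names are made up):

    import pandas as pd


    def mean_by_quarter(df: pd.DataFrame, date_col: str, **kwargs) -> pd.DataFrame:
        # Mirror the fix: drop any caller-supplied numeric_only, then force True,
        # so mean() never receives the keyword twice.
        kwargs.pop("numeric_only", None)
        period_index = pd.PeriodIndex(df[date_col], freq="Q")
        return df.groupby(period_index).mean(numeric_only=True, **kwargs)


    df = pd.DataFrame(
        {
            "valid_at": pd.date_range("2024-01-01", periods=6, freq="MS"),
            "volum": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
        }
    )
    print(mean_by_quarter(df, "valid_at", numeric_only=False))  # no TypeError
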
27 changes: 19 additions & 8 deletions src/ssb_timeseries/meta.py
@@ -3,6 +3,8 @@
 Ideally, this functionality should live elsewhere, in ssb-python-klass and other meta data libraries. Likely subject to refactoring later.
 """
 
+import io
+
 import bigtree
 import pandas as pd
 from klass import get_classification
@@ -72,7 +74,7 @@ def __init__(
         self.definition = {"name": root_name}
         if isinstance(id_or_path, int):
             # TO DO: handle versions of KLASS
-            klass = get_classification(id_or_path).get_codes().data
+            klass = get_classification(str(id_or_path)).get_codes().data
             self.entities = add_root_node(
                 klass, {"code": "0", "parentCode": None, "name": root_name}
             )
@@ -116,9 +118,11 @@ def __eq__(self, other: Self) -> bool:
         o_entities = other.entities[fields_to_compare].reset_index(drop=True)
 
         ts_logger.debug(
-            f"comparing:\n{s_entities.to_string()}\n...and:\n{s_entities.to_string()}"
+            f"comparing:\n{s_entities.to_string()}\n...and:\n{o_entities.to_string()}"
         )
-        ts_logger.debug(f".info:\n{s_entities.info()}\n...and:\n{s_entities.info()}")
+        ts_logger.debug(
+            f".info:\n{_df_info_as_string(s_entities)}\n...and:\n{_df_info_as_string(o_entities)}"
+        )
         entities_equal = all(s_entities == o_entities)
 
         return trees_equal and entities_equal
@@ -146,15 +150,15 @@ def save(self, path: PathStr) -> None:
         self.entities.to_json(path_or_buf=path)
 
 
-def add_root_node(df: pd.DataFrame, root_node: dict) -> pd.DataFrame:
+def add_root_node(df: pd.DataFrame, root_node: dict[str, str | None]) -> pd.DataFrame:
     """Prepend root node row to taxonomy dataframe."""
-    new_row = dict((c, None) for c in df.columns)
-    for k in root_node.keys():
+    new_row = {c: None for c in df.columns}
+    for k in root_node:
         new_row[k] = root_node[k]
     df.rename(columns={"name": "fullName"})
     df["parentCode"] = df["parentCode"].fillna(value=root_node["code"])
-    df.loc[-1] = root_node
-    df.index = df.index + 1
+    root_df = pd.DataFrame(root_node, index=[0])
+    df = pd.concat([root_df, df], ignore_index=True)
     df.sort_index(inplace=True)
     return df
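
The new implementation prepends the root row by building a one-row frame and concatenating, instead of mutating via df.loc[-1] and shifting the index. A minimal standalone illustration on a toy taxonomy (column names follow the KLASS shape used above):

    import pandas as pd

    df = pd.DataFrame({"code": ["1", "2"], "parentCode": [None, "1"], "name": ["a", "b"]})
    root_node = {"code": "0", "parentCode": None, "name": "root"}

    df["parentCode"] = df["parentCode"].fillna(value=root_node["code"])  # orphans -> root
    root_df = pd.DataFrame(root_node, index=[0])
    df = pd.concat([root_df, df], ignore_index=True)  # root row ends up first
    print(df)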

Expand Down Expand Up @@ -275,3 +279,10 @@ def to_str(self, attributes: list(str) = None, separator: str = "_") -> list[str
     def __repr__(self) -> str:
         """Return initialization for a copy of the series tag object: SeriesTags(name={self.name}, versioning={self.versioning}, temporality={self.temporality}, tags={self.tags})."""
         return f"SeriesTags(name={self.name}, versioning={self.versioning}, temporality={self.temporality}, tags={self.tags})"
+
+
+def _df_info_as_string(df: pd.DataFrame) -> str:
+    """Returns the content of df.info() as a string."""
+    with io.StringIO() as buffer:
+        df.info(buf=buffer)
+        return buffer.getvalue()
2 changes: 1 addition & 1 deletion src/ssb_timeseries/sample_data.py
@@ -114,7 +114,7 @@ def create_df(
"T": "minutes",
"S": "seconds",
}
valid_to = valid_from + pd.DateOffset(**{freq_lookup[freq]: interval})
valid_to = valid_from + pd.DateOffset(**{freq_lookup[freq]: interval}) # type: ignore

# BUGFIX: If *lists receives strings, permutations will be over chars by chars
# Kombiner listene til en enkelt liste av lister
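
On the type: ignore above: mypy cannot check keyword names that are built dynamically through ** unpacking, even though the call is fine at runtime. A short illustration of the same freq_lookup pattern (the interval value here is made up):

    import pandas as pd

    freq_lookup = {"D": "days", "H": "hours", "T": "minutes", "S": "seconds"}
    valid_from = pd.Timestamp("2024-01-01")
    valid_to = valid_from + pd.DateOffset(**{freq_lookup["T"]: 5})
    print(valid_to)  # 2024-01-01 00:05:00
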
6 changes: 3 additions & 3 deletions tests/test_meta_tagging.py
@@ -148,7 +148,7 @@ def test_find_data_using_single_metadata_attribute(
     expected_matches = ["a_p_z", "a_q_z", "a_r_z"]
 
     ts_logger.debug(
-        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"
+        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"  # type: ignore
     )
     assert isinstance(x_attr_A_equals_a, Dataset)
     assert sorted(x_attr_A_equals_a.numeric_columns()) == sorted(expected_matches)
@@ -196,7 +196,7 @@ def test_find_data_using_multiple_metadata_attributes(
     expected_matches = ["a_q_z"]
 
     ts_logger.debug(
-        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"
+        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"  # type: ignore
     )
     assert isinstance(x_attr_A_equals_a, Dataset)
     assert sorted(x_attr_A_equals_a.numeric_columns()) == sorted(expected_matches)
@@ -245,7 +245,7 @@ def test_find_data_using_metadata_criteria_with_single_attribute_and_multiple_values(
     expected_matches = ["a_p_z", "a_q_z", "a_r_z", "b_p_z", "b_q_z", "b_r_z"]
 
     ts_logger.debug(
-        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"
+        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"  # type: ignore
     )
     assert isinstance(x_attr_A_equals_a, Dataset)
     assert sorted(x_attr_A_equals_a.numeric_columns()) == sorted(expected_matches)