Fix some mypy errors
arneso-ssb committed May 7, 2024
1 parent 8661f1c commit 3284df2
Showing 6 changed files with 50 additions and 26 deletions.
2 changes: 1 addition & 1 deletion noxfile.py
@@ -142,7 +142,7 @@ def precommit(session: Session) -> None:
 @session(python=python_versions)
 def mypy(session: Session) -> None:
     """Type-check using mypy."""
-    args = session.posargs or ["src", "tests"]
+    args = session.posargs or ["src", "tests", "click"]
     session.install(".")
     session.install("mypy", "pytest")
     session.run("mypy", *args)
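Note on the session defaults: `session.posargs or [...]` means positional arguments passed on the command line replace the default target list entirely. A minimal sketch of that fallback logic (names are illustrative only):

    def pick_args(posargs: list[str]) -> list[str]:
        # An empty posargs list is falsy, so the defaults are used;
        # any explicit arguments override the defaults wholesale.
        return posargs or ["src", "tests", "click"]

    assert pick_args([]) == ["src", "tests", "click"]
    assert pick_args(["src"]) == ["src"]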
1 change: 1 addition & 0 deletions pyproject.toml
@@ -72,6 +72,7 @@ warn_unreachable = true
 pretty = true
 show_column_numbers = true
 show_error_context = true
+disallow_any_generics = false

 [tool.ruff]
 force-exclude = true # Apply excludes to pre-commit
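`disallow_any_generics = false` relaxes mypy's check that every generic type carries explicit type parameters. A minimal illustration of what the setting toggles:

    def tail(values: list) -> list:  # bare `list`, no type parameters
        # disallow_any_generics = true  -> error: Missing type parameters
        #                                  for generic type "list"  [type-arg]
        # disallow_any_generics = false -> accepted; the parameters are
        #                                  treated as implicit Any
        return values[1:]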
38 changes: 25 additions & 13 deletions src/ssb_timeseries/dataset.py
@@ -6,7 +6,7 @@
 from typing import no_type_check

 import numpy as np
-import pandas as pd  # type: ignore[import-untyped]
+import pandas as pd
 from typing_extensions import Self

 from ssb_timeseries import io
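Dropping the ignore comment on the pandas import only type-checks cleanly if mypy can find type information for pandas, e.g. from the pandas-stubs package (an assumption; the dependency change is not shown in this diff). Without stubs, mypy reports:

    import pandas as pd  # without stubs or a py.typed marker, mypy emits:
    # error: Skipping analyzing "pandas": module is installed, but missing
    # library stubs or py.typed marker  [import-untyped]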
@@ -367,11 +367,14 @@ def __getitem__(
         """Access Dataset.data.columns via Dataset[ list[column_names] | pattern | tags].

         Arguments:
-            criteria: (str | dict) Either a string pattern or a dict of tags.
+            criteria: Either a string pattern or a dict of tags.
             kwargs: If criteria is empty, this is passed to filter().

         Returns:
             Self | None

+        Raises:
+            TypeError: If filter() returns another type than Dataset.
+
         """
         # pattern: str = "", regex: str = "", tags: dict = {}):
         # Dataset[...] should return a Dataset object (?) with only the requested items (columns).
@@ -384,14 +387,18 @@ def __getitem__(
         # Or, is there a trick using dataframe views?
         # --->
         if criteria and isinstance(criteria, str):
-            return self.filter(pattern=criteria)
+            result = self.filter(pattern=criteria)
         elif criteria and isinstance(criteria, dict):
-            return self.filter(tags=criteria)
+            result = self.filter(tags=criteria)
         elif kwargs:
             ts_logger.debug(f"DATASET.__getitem__(:\n\t{kwargs} ")
-            return self.filter(**kwargs)
+            result = self.filter(**kwargs)
         else:
             return None
+        if isinstance(result, Dataset):
+            return result  # type: ignore[return-value]
+        else:
+            raise TypeError("Dataset.filter() did not return a Dataset type.")

     def plot(self, *args: Any, **kwargs: Any) -> Any:
         """Plot dataset data.
@@ -400,7 +407,7 @@ def plot(self, *args: Any, **kwargs: Any) -> Any:
         """
         xlabels = self.datetime_columns()[0]
         ts_logger.debug(f"Dataset.plot({args!r}, {kwargs!r}) x-labels {xlabels}")
-        return self.data.plot(
+        return self.data.plot(  # type: ignore[call-overload]
             xlabels,
             *args,
             legend=len(self.data.columns) < 9,
@@ -452,28 +459,33 @@ def groupby(
         period_index = pd.PeriodIndex(self.data[datetime_columns[0]], freq=freq)
         ts_logger.debug(f"DATASET {self.name}: period index\n{period_index}.")

+        # Fix for case when **kwargs contains numeric_only
+        if "numeric_only" in kwargs:
+            kwargs.pop("numeric_only")
+        numeric_only_value = True
+
         match func:
             case "mean":
-                out = self.data.groupby(period_index).mean(
-                    *args, numeric_only=True, **kwargs
+                out = self.data.groupby(period_index).mean(  # type: ignore[misc]
+                    *args, numeric_only=numeric_only_value, **kwargs
                 )
             case "sum":
-                out = self.data.groupby(period_index).sum(
-                    *args, numeric_only=True, **kwargs
+                out = self.data.groupby(period_index).sum(  # type: ignore[misc]
+                    *args, numeric_only=numeric_only_value, **kwargs
                 )
             case "auto":
                 # TODO: QA on exact logic / use "real" metadata
                 # in particular, how to check metadata and blend df1 and df2 values as appropriate
                 # (this implementation is just to show how it can be done)
                 # QUESTION: do we need a default for "other" series / what should it be?
-                df1 = self.data.groupby(period_index).mean(
-                    *args, numeric_only=True, **kwargs
+                df1 = self.data.groupby(period_index).mean(  # type: ignore[misc]
+                    *args, numeric_only=numeric_only_value, **kwargs
                 )
                 ts_logger.debug(f"groupby\n{df1}.")

                 df2 = (
                     self.data.groupby(period_index)
-                    .sum(*args, numeric_only=True, **kwargs)
+                    .sum(*args, numeric_only=numeric_only_value, **kwargs)  # type: ignore[misc]
                     .filter(regex="mendgde|volum|vekt")
                 )
                 ts_logger.warning(f"groupby\n{df2}.")
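On the `numeric_only` handling above: popping a caller-supplied `numeric_only` before passing the explicit keyword prevents a "got multiple values for keyword argument" TypeError at runtime. A self-contained sketch of the pattern, with hypothetical names:

    def aggregate(**kwargs: bool) -> dict[str, bool]:
        # Strip any caller-supplied numeric_only so the explicit keyword
        # below cannot collide with the one unpacked from **kwargs.
        kwargs.pop("numeric_only", None)
        numeric_only_value = True
        return dict(numeric_only=numeric_only_value, **kwargs)

    assert aggregate(numeric_only=False) == {"numeric_only": True}
    assert aggregate(dropna=True) == {"numeric_only": True, "dropna": True}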
27 changes: 19 additions & 8 deletions src/ssb_timeseries/meta.py
@@ -3,6 +3,8 @@
 Ideally, this functionality should live elsewhere, in ssb-python-klass and other meta data libraries. Likely subject to refactoring later.
 """

+import io
+
 import bigtree
 import pandas as pd
 from klass import get_classification
@@ -72,7 +74,7 @@ def __init__(
         self.definition = {"name": root_name}
         if isinstance(id_or_path, int):
             # TO DO: handle versions of KLASS
-            klass = get_classification(id_or_path).get_codes().data
+            klass = get_classification(str(id_or_path)).get_codes().data
             self.entities = add_root_node(
                 klass, {"code": "0", "parentCode": None, "name": root_name}
             )
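The `str(...)` cast aligns the call with klass's `get_classification`, which expects the classification id as a string, while `id_or_path` is an int on this branch. A hedged usage sketch (the id 703 is purely illustrative):

    from klass import get_classification

    classification_id = 703  # hypothetical KLASS id held as an int
    codes = get_classification(str(classification_id)).get_codes().data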
@@ -116,9 +118,11 @@ def __eq__(self, other: Self) -> bool:
         o_entities = other.entities[fields_to_compare].reset_index(drop=True)

         ts_logger.debug(
-            f"comparing:\n{s_entities.to_string()}\n...and:\n{s_entities.to_string()}"
+            f"comparing:\n{s_entities.to_string()}\n...and:\n{o_entities.to_string()}"
         )
-        ts_logger.debug(f".info:\n{s_entities.info()}\n...and:\n{s_entities.info()}")
+        ts_logger.debug(
+            f".info:\n{_df_info_as_string(s_entities)}\n...and:\n{_df_info_as_string(o_entities)}"
+        )
         entities_equal = all(s_entities == o_entities)

         return trees_equal and entities_equal
@@ -146,15 +150,15 @@ def save(self, path: PathStr) -> None:
         self.entities.to_json(path_or_buf=path)


-def add_root_node(df: pd.DataFrame, root_node: dict) -> pd.DataFrame:
+def add_root_node(df: pd.DataFrame, root_node: dict[str, str | None]) -> pd.DataFrame:
     """Prepend root node row to taxonomy dataframe."""
-    new_row = dict((c, None) for c in df.columns)
-    for k in root_node.keys():
+    new_row = {c: None for c in df.columns}
+    for k in root_node:
         new_row[k] = root_node[k]
     df.rename(columns={"name": "fullName"})
     df["parentCode"] = df["parentCode"].fillna(value=root_node["code"])
-    df.loc[-1] = root_node
-    df.index = df.index + 1
+    root_df = pd.DataFrame(root_node, index=[0])
+    df = pd.concat([root_df, df], ignore_index=True)
     df.sort_index(inplace=True)
     return df
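The `pd.concat` rewrite avoids `df.loc[-1] = root_node`, which assigns a dict to a label and trips mypy; it is also the idiomatic pandas way to prepend a row. A minimal standalone sketch:

    import pandas as pd

    df = pd.DataFrame({"code": ["1", "2"], "parentCode": ["0", "1"]})
    root_node = {"code": "0", "parentCode": None}

    root_df = pd.DataFrame(root_node, index=[0])
    df = pd.concat([root_df, df], ignore_index=True)
    assert df.loc[0, "code"] == "0"  # the root row now sits first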

@@ -275,3 +279,10 @@ def to_str(self, attributes: list(str) = None, separator: str = "_") -> list[str
     def __repr__(self) -> str:
         """Return initialization for a copy of the series tag object: SeriesTags(name={self.name}, versioning={self.versioning}, temporality={self.temporality}, tags={self.tags})."""
         return f"SeriesTags(name={self.name}, versioning={self.versioning}, temporality={self.temporality}, tags={self.tags})"
+
+
+def _df_info_as_string(df: pd.DataFrame) -> str:
+    """Returns the content of df.info() as a string."""
+    with io.StringIO() as buffer:
+        df.info(buf=buffer)
+        return buffer.getvalue()
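This helper exists because `DataFrame.info()` prints to stdout and returns None, so interpolating it into an f-string logs the text "None"; writing into a StringIO buffer captures the summary as text. Usage sketch:

    import io
    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3]})
    with io.StringIO() as buffer:
        df.info(buf=buffer)
        summary = buffer.getvalue()  # dtype/column summary as a string
    print(summary)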
2 changes: 1 addition & 1 deletion src/ssb_timeseries/sample_data.py
@@ -114,7 +114,7 @@ def create_df(
         "T": "minutes",
         "S": "seconds",
     }
-    valid_to = valid_from + pd.DateOffset(**{freq_lookup[freq]: interval})
+    valid_to = valid_from + pd.DateOffset(**{freq_lookup[freq]: interval})  # type: ignore

     # BUGFIX: If *lists receives strings, permutations will be char by char
     # Combine the lists into a single list of lists
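The ignore is needed because unpacking a dict built with a dynamic key erases the keyword name, so mypy cannot check it against `pd.DateOffset`'s signature. A minimal reproduction:

    import pandas as pd

    freq_lookup = {"D": "days", "H": "hours"}
    freq, interval = "D", 7

    # mypy cannot verify that the unpacked key is a valid keyword here:
    offset = pd.DateOffset(**{freq_lookup[freq]: interval})  # type: ignore
    assert pd.Timestamp("2024-01-01") + offset == pd.Timestamp("2024-01-08")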
6 changes: 3 additions & 3 deletions tests/test_meta_tagging.py
@@ -148,7 +148,7 @@ def test_find_data_using_single_metadata_attribute(
     expected_matches = ["a_p_z", "a_q_z", "a_r_z"]

     ts_logger.debug(
-        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"
+        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"  # type: ignore
     )
     assert isinstance(x_attr_A_equals_a, Dataset)
     assert sorted(x_attr_A_equals_a.numeric_columns()) == sorted(expected_matches)
@@ -196,7 +196,7 @@ def test_find_data_using_multiple_metadata_attributes(
     expected_matches = ["a_q_z"]

     ts_logger.debug(
-        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"
+        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"  # type: ignore
     )
     assert isinstance(x_attr_A_equals_a, Dataset)
     assert sorted(x_attr_A_equals_a.numeric_columns()) == sorted(expected_matches)
@@ -245,7 +245,7 @@ def test_find_data_using_metadata_criteria_with_single_attribute_and_multiple_va
     expected_matches = ["a_p_z", "a_q_z", "a_r_z", "b_p_z", "b_q_z", "b_r_z"]

     ts_logger.debug(
-        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"
+        f"x_attr_A_equals_a: \n\t{x_attr_A_equals_a.series()}\n vs expected:\n\t{expected_matches}"  # type: ignore
     )
     assert isinstance(x_attr_A_equals_a, Dataset)
     assert sorted(x_attr_A_equals_a.numeric_columns()) == sorted(expected_matches)
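An alternative to the blanket ignores above: since every test asserts `isinstance(x_attr_A_equals_a, Dataset)` anyway, moving the assert before the debug call would let mypy narrow an optional result and drop the ignore. A generic, self-contained sketch of that narrowing:

    from typing import Optional

    def find(pattern: str) -> Optional[str]:  # stand-in for a filter() call
        return pattern or None

    match = find("a")
    assert isinstance(match, str)  # mypy narrows Optional[str] to str here
    print(match.upper())           # no "type: ignore" needed after narrowing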
