Merge pull request #47 from datasig-ac-uk/develop

Bump to 0.2.0
datasig-ac-uk · Aug 16, 2023 · f88c81e · f88c81e
2 parents 73ff51e + eb56b34
commit f88c81e
Show file tree

Hide file tree

Showing 12 changed files with 1,847 additions and 461 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -85,7 +85,7 @@ jobs:
       - name: Check products
         run: pipx run twine check dist/*
 
-      - uses: pypa/[email protected].7
+      - uses: pypa/[email protected].10
         if: github.event_name == 'release' && github.event.action == 'published'
         with:
           # Remember to generate this and set it in "GitHub Secrets"

diff --git a/src/nlpsig/__init__.py b/src/nlpsig/__init__.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
 
 import warnings
 

diff --git a/src/nlpsig/classification_utils.py b/src/nlpsig/classification_utils.py
@@ -4,6 +4,7 @@
 import random
 from typing import Iterable
 
+import numpy as np
 import torch
 from sklearn.model_selection import (
     GroupKFold,
@@ -21,11 +22,11 @@ class DataSplits:
 
     Parameters
     ----------
-    x_data : torch.Tensor
+    x_data : np.array | torch.Tensor
         Features for prediction.
-    y_data : torch.Tensor
+    y_data : np.array | torch.Tensor
         Variable to predict.
-    groups : torch.Tensor | None, optional
+    groups : np.array | torch.Tensor | None, optional
         Groups to split by, default None.
         If groups are passed, then GroupShuffleSplit is used to create a dataset
         split where groups are fit entirely within a datasplit, i.e.
@@ -53,9 +54,9 @@ class DataSplits:
 
     def __init__(
         self,
-        x_data: torch.Tensor,
-        y_data: torch.Tensor,
-        groups: torch.Tensor | None = None,
+        x_data: np.array | torch.Tensor,
+        y_data: np.array | torch.Tensor,
+        groups: np.array | torch.Tensor | None = None,
         train_size: float = 0.8,
         valid_size: float | None = 0.33,
         indices: tuple[Iterable[int], Iterable[int], Iterable[int]] | None = None,
@@ -65,14 +66,20 @@ def __init__(
         if x_data.shape[0] != y_data.shape[0]:
             msg = (
                 "x_data and y_data do not have compatible shapes "
-                "(need to have same number of samples)"
+                "(need to have same number of samples)."
+            )
+            raise ValueError(msg)
+        if groups is not None and x_data.shape[0] != len(groups):
+            msg = (
+                "x_data and groups do not have compatible shapes "
+                "(need to have same number of samples)."
             )
             raise ValueError(msg)
         if (train_size < 0) or (train_size > 1):
-            msg = "train_size must be between 0 and 1"
+            msg = "train_size must be between 0 and 1."
             raise ValueError(msg)
         if valid_size is not None and ((valid_size < 0) or (valid_size > 1)):
-            msg = "valid_size must be between 0 and 1"
+            msg = "valid_size must be between 0 and 1."
             raise ValueError(msg)
 
         self.x_data = x_data
@@ -86,7 +93,7 @@ def __init__(
             self.random_state = None
 
             # indices are provided, so use these to split the dataset
-            msg = "if indices are provided, it must be a tuple of length 3"
+            msg = "if indices are provided, it must be a tuple of length 3."
             if not isinstance(indices, tuple):
                 raise TypeError(msg)
             if len(indices) != 3:
@@ -100,7 +107,7 @@ def __init__(
                     problem_set = "train" if i == 0 else "valid" if i == 1 else "test"
                     msg = (
                         f"in the {problem_set} indices, "
-                        "some of the indices will be out of range"
+                        "some of the indices will be out of range."
                     )
                     raise IndexError(msg)
 
@@ -109,16 +116,8 @@ def __init__(
             test_index = indices[2]
         else:
             if self.groups is not None:
-                # see https://github.com/scikit-learn/scikit-learn/issues/9193
-                print("[INFO] Splitting data by provided groups")
-                self.shuffle = False
-
-                if x_data.shape[0] != len(self.groups):
-                    msg = (
-                        "x_data and groups do not have compatible shapes "
-                        "(need to have same number of samples)"
-                    )
-                    raise ValueError(msg)
+                # see https:/github.com/scikit-learn/scikit-learn/issues/9193
+                self.shuffle = True
 
                 # first split data into train set, test/valid set by group
                 gss = GroupShuffleSplit(
@@ -180,6 +179,16 @@ def __init__(
 
     def get_splits(
         self, as_DataLoader: bool = False, data_loader_args: dict | None = None
+    ) -> (
+        tuple[DataLoader, DataLoader, DataLoader]
+        | tuple[
+            np.array | torch.Tensor,
+            np.array | torch.Tensor,
+            np.array | torch.Tensor,
+            np.array | torch.Tensor,
+            np.array | torch.Tensor,
+            np.array | torch.Tensor,
+        ]
     ):
         """
         Returns train, validation and test set.
@@ -196,13 +205,13 @@ def get_splits(
 
         Returns
         -------
-        tuple[DataLoader] | tuple[torch.Tensor]
+        tuple[DataLoader] | tuple[np.array | torch.Tensor]
             If `as_DataLoader` is True, return tuple of `torch.utils.data.dataloader.DataLoader` objects:
                 - First element is training dataset
                 - Second element is validation dataset
                 - Third element is testing dataset
 
-            If `as_DataLoader` is False, returns tuple of `torch.Tensors`:
+            If `as_DataLoader` is False, returns tuple of either `numpy.array`s or `torch.Tensor`s:
                 - First element is features for training dataset
                 - Second element is labels for training dataset
                 - Third element is features for validation dataset
@@ -233,11 +242,27 @@ def get_splits(
         if as_DataLoader:
             # return datasets as DataLoader objects if requested
             if x_valid is not None:
+                # make sure that x_valid and y_valid are torch tensors
+                if isinstance(x_valid, np.ndarray):
+                    x_valid = torch.from_numpy(x_valid)
+                if isinstance(y_valid, np.ndarray):
+                    y_valid = torch.from_numpy(y_valid)
+
                 valid = TensorDataset(x_valid, y_valid)
                 valid_loader = DataLoader(dataset=valid, **data_loader_args)
             else:
                 valid_loader = None
 
+            # make sure that x_train, y_train, x_test and y_test are torch tensors
+            if isinstance(x_train, np.ndarray):
+                x_train = torch.from_numpy(x_train)
+            if isinstance(y_train, np.ndarray):
+                y_train = torch.from_numpy(y_train)
+            if isinstance(x_test, np.ndarray):
+                x_test = torch.from_numpy(x_test)
+            if isinstance(y_test, np.ndarray):
+                y_test = torch.from_numpy(y_test)
+
             train = TensorDataset(x_train, y_train)
             test = TensorDataset(x_test, y_test)
             train_loader = DataLoader(dataset=train, **data_loader_args)
@@ -261,11 +286,11 @@ class Folds:
 
     Parameters
     ----------
-    x_data : torch.Tensor
+    x_data : np.array | torch.Tensor
         Features for prediction.
-    y_data : torch.Tensor
+    y_data : np.array | torch.Tensor
         Variable to predict.
-    groups : torch.Tensor | None, optional
+    groups : np.array | torch.Tensor | None, optional
         Groups to split by, default None. If None is passed, then does standard KFold,
         otherwise implements GroupShuffleSplit (if shuffle is True),
         or GroupKFold (if shuffle is False).
@@ -303,9 +328,9 @@ class Folds:
 
     def __init__(
         self,
-        x_data: torch.Tensor,
-        y_data: torch.Tensor,
-        groups: torch.Tensor | None = None,
+        x_data: np.array | torch.Tensor,
+        y_data: np.array | torch.Tensor,
+        groups: np.array | torch.Tensor | None = None,
         n_splits: int = 5,
         valid_size: float | None = 0.33,
         indices: tuple[tuple[Iterable[int], Iterable[int] | None, Iterable[int]]]
@@ -314,22 +339,22 @@ def __init__(
         random_state: int = 42,
     ):
         if n_splits < 2:
-            msg = "n_splits should be at least 2"
+            msg = "n_splits should be at least 2."
             raise ValueError(msg)
         if x_data.shape[0] != y_data.shape[0]:
             msg = (
                 "x_data and y_data do not have compatible shapes "
-                "(need to have same number of samples)"
+                "(need to have same number of samples)."
             )
             raise ValueError(msg)
         if groups is not None and x_data.shape[0] != len(groups):
             msg = (
                 "x_data and groups do not have compatible shapes "
-                "(need to have same number of samples)"
+                "(need to have same number of samples)."
             )
             raise ValueError(msg)
         if valid_size is not None and ((valid_size < 0) or (valid_size > 1)):
-            msg = "valid_size must be between 0 and 1"
+            msg = "valid_size must be between 0 and 1."
             raise ValueError(msg)
 
         self.x_data = x_data
@@ -344,15 +369,15 @@ def __init__(
 
             # indices are provided, so use these to split the dataset
             # check that indices are a tuple of length k
-            msg = "if indices are provided, it must be a tuple of length n_splits"
+            msg = "if indices are provided, it must be a tuple of length n_splits."
             if not isinstance(indices, tuple):
                 raise TypeError(msg)
             if len(indices) != self.n_splits:
                 raise ValueError(msg)
 
             for k in range(self.n_splits):
                 # check that indices[k] is a tuple of length 3
-                msg = "each item in indices must be a tuple of length 3"
+                msg = f"each item in indices must be a tuple of length 3: fold {k} is not."
                 if not isinstance(indices[k], tuple):
                     raise TypeError(msg)
                 if len(indices[k]) != 3:
@@ -368,7 +393,7 @@ def __init__(
                         )
                         msg = (
                             f"in the {problem_set} indices, for fold {k}, "
-                            "some of the indices will be out of range"
+                            "some of the indices will be out of range."
                         )
                         raise IndexError(msg)
 
@@ -418,6 +443,8 @@ def __init__(
                 # store indices
                 self.fold_indices[k] = (train_index, valid_index, test_index)
 
+            self.fold_indices = tuple(self.fold_indices)
+
     def get_splits(
         self,
         fold_index: int,
@@ -426,12 +453,12 @@ def get_splits(
     ) -> (
         tuple[DataLoader, DataLoader, DataLoader]
         | tuple[
-            torch.Tensor,
-            torch.Tensor,
-            torch.Tensor,
-            torch.Tensor,
-            torch.Tensor,
-            torch.Tensor,
+            np.array | torch.Tensor,
+            np.array | torch.Tensor,
+            np.array | torch.Tensor,
+            np.array | torch.Tensor,
+            np.array | torch.Tensor,
+            np.array | torch.Tensor,
         ]
     ):
         """
@@ -451,13 +478,13 @@ def get_splits(
 
         Returns
         -------
-        tuple[DataLoader] | tuple[torch.Tensor]
+        tuple[DataLoader] | tuple[np.array | torch.Tensor]
             If `as_DataLoader` is True, return tuple of `torch.utils.data.dataloader.DataLoader` objects:
                 - First element is training dataset
                 - Second element is validation dataset
                 - Third element is testing dataset
 
-            If `as_DataLoader` is False, returns tuple of `torch.Tensors`:
+            If `as_DataLoader` is False, returns tuple of either `numpy.array`s or `torch.Tensor`s:
                 - First element is features for training dataset
                 - Second element is labels for training dataset
                 - Third element is features for validation dataset
@@ -503,11 +530,27 @@ def get_splits(
         if as_DataLoader:
             # return datasets as DataLoader objects if requested
             if valid_index is not None:
+                # make sure that x_valid and y_valid are torch tensors
+                if isinstance(x_valid, np.ndarray):
+                    x_valid = torch.from_numpy(x_valid)
+                if isinstance(y_valid, np.ndarray):
+                    y_valid = torch.from_numpy(y_valid)
+
                 valid = TensorDataset(x_valid, y_valid)
                 valid_loader = DataLoader(dataset=valid, **data_loader_args)
             else:
                 valid_loader = None
 
+            # make sure that x_train, y_train, x_test and y_test are torch tensors
+            if isinstance(x_train, np.ndarray):
+                x_train = torch.from_numpy(x_train)
+            if isinstance(y_train, np.ndarray):
+                y_train = torch.from_numpy(y_train)
+            if isinstance(x_test, np.ndarray):
+                x_test = torch.from_numpy(x_test)
+            if isinstance(y_test, np.ndarray):
+                y_test = torch.from_numpy(y_test)
+
             train = TensorDataset(x_train, y_train)
             test = TensorDataset(x_test, y_test)
             train_loader = DataLoader(dataset=train, **data_loader_args)