Skip to content

Commit

Permalink
Merge pull request #47 from datasig-ac-uk/develop
Browse files Browse the repository at this point in the history
Bump to 0.2.0
  • Loading branch information
rchan26 authored Aug 16, 2023
2 parents 73ff51e + eb56b34 commit f88c81e
Show file tree
Hide file tree
Showing 12 changed files with 1,847 additions and 461 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ jobs:
- name: Check products
run: pipx run twine check dist/*

- uses: pypa/[email protected].7
- uses: pypa/[email protected].10
if: github.event_name == 'release' && github.event.action == 'published'
with:
# Remember to generate this and set it in "GitHub Secrets"
Expand Down
2 changes: 1 addition & 1 deletion src/nlpsig/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

__version__ = "0.1.0"
__version__ = "0.2.0"

import warnings

Expand Down
131 changes: 87 additions & 44 deletions src/nlpsig/classification_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import random
from typing import Iterable

import numpy as np
import torch
from sklearn.model_selection import (
GroupKFold,
Expand All @@ -21,11 +22,11 @@ class DataSplits:
Parameters
----------
x_data : torch.Tensor
x_data : np.array | torch.Tensor
Features for prediction.
y_data : torch.Tensor
y_data : np.array | torch.Tensor
Variable to predict.
groups : torch.Tensor | None, optional
groups : np.array | torch.Tensor | None, optional
Groups to split by, default None.
If groups are passed, then GroupShuffleSplit is used to create a dataset
split where groups are fit entirely within a datasplit, i.e.
Expand Down Expand Up @@ -53,9 +54,9 @@ class DataSplits:

def __init__(
self,
x_data: torch.Tensor,
y_data: torch.Tensor,
groups: torch.Tensor | None = None,
x_data: np.array | torch.Tensor,
y_data: np.array | torch.Tensor,
groups: np.array | torch.Tensor | None = None,
train_size: float = 0.8,
valid_size: float | None = 0.33,
indices: tuple[Iterable[int], Iterable[int], Iterable[int]] | None = None,
Expand All @@ -65,14 +66,20 @@ def __init__(
if x_data.shape[0] != y_data.shape[0]:
msg = (
"x_data and y_data do not have compatible shapes "
"(need to have same number of samples)"
"(need to have same number of samples)."
)
raise ValueError(msg)
if groups is not None and x_data.shape[0] != len(groups):
msg = (
"x_data and groups do not have compatible shapes "
"(need to have same number of samples)."
)
raise ValueError(msg)
if (train_size < 0) or (train_size > 1):
msg = "train_size must be between 0 and 1"
msg = "train_size must be between 0 and 1."
raise ValueError(msg)
if valid_size is not None and ((valid_size < 0) or (valid_size > 1)):
msg = "valid_size must be between 0 and 1"
msg = "valid_size must be between 0 and 1."
raise ValueError(msg)

self.x_data = x_data
Expand All @@ -86,7 +93,7 @@ def __init__(
self.random_state = None

# indices are provided, so use these to split the dataset
msg = "if indices are provided, it must be a tuple of length 3"
msg = "if indices are provided, it must be a tuple of length 3."
if not isinstance(indices, tuple):
raise TypeError(msg)
if len(indices) != 3:
Expand All @@ -100,7 +107,7 @@ def __init__(
problem_set = "train" if i == 0 else "valid" if i == 1 else "test"
msg = (
f"in the {problem_set} indices, "
"some of the indices will be out of range"
"some of the indices will be out of range."
)
raise IndexError(msg)

Expand All @@ -109,16 +116,8 @@ def __init__(
test_index = indices[2]
else:
if self.groups is not None:
# see https://github.com/scikit-learn/scikit-learn/issues/9193
print("[INFO] Splitting data by provided groups")
self.shuffle = False

if x_data.shape[0] != len(self.groups):
msg = (
"x_data and groups do not have compatible shapes "
"(need to have same number of samples)"
)
raise ValueError(msg)
# see https:/github.com/scikit-learn/scikit-learn/issues/9193
self.shuffle = True

# first split data into train set, test/valid set by group
gss = GroupShuffleSplit(
Expand Down Expand Up @@ -180,6 +179,16 @@ def __init__(

def get_splits(
self, as_DataLoader: bool = False, data_loader_args: dict | None = None
) -> (
tuple[DataLoader, DataLoader, DataLoader]
| tuple[
np.array | torch.Tensor,
np.array | torch.Tensor,
np.array | torch.Tensor,
np.array | torch.Tensor,
np.array | torch.Tensor,
np.array | torch.Tensor,
]
):
"""
Returns train, validation and test set.
Expand All @@ -196,13 +205,13 @@ def get_splits(
Returns
-------
tuple[DataLoader] | tuple[torch.Tensor]
tuple[DataLoader] | tuple[np.array | torch.Tensor]
If `as_DataLoader` is True, return tuple of `torch.utils.data.dataloader.DataLoader` objects:
- First element is training dataset
- Second element is validation dataset
- Third element is testing dataset
If `as_DataLoader` is False, returns tuple of `torch.Tensors`:
If `as_DataLoader` is False, returns tuple of either `numpy.array`s or `torch.Tensor`s:
- First element is features for training dataset
- Second element is labels for training dataset
- Third element is features for validation dataset
Expand Down Expand Up @@ -233,11 +242,27 @@ def get_splits(
if as_DataLoader:
# return datasets as DataLoader objects if requested
if x_valid is not None:
# make sure that x_valid and y_valid are torch tensors
if isinstance(x_valid, np.ndarray):
x_valid = torch.from_numpy(x_valid)
if isinstance(y_valid, np.ndarray):
y_valid = torch.from_numpy(y_valid)

valid = TensorDataset(x_valid, y_valid)
valid_loader = DataLoader(dataset=valid, **data_loader_args)
else:
valid_loader = None

# make sure that x_train, y_train, x_test and y_test are torch tensors
if isinstance(x_train, np.ndarray):
x_train = torch.from_numpy(x_train)
if isinstance(y_train, np.ndarray):
y_train = torch.from_numpy(y_train)
if isinstance(x_test, np.ndarray):
x_test = torch.from_numpy(x_test)
if isinstance(y_test, np.ndarray):
y_test = torch.from_numpy(y_test)

train = TensorDataset(x_train, y_train)
test = TensorDataset(x_test, y_test)
train_loader = DataLoader(dataset=train, **data_loader_args)
Expand All @@ -261,11 +286,11 @@ class Folds:
Parameters
----------
x_data : torch.Tensor
x_data : np.array | torch.Tensor
Features for prediction.
y_data : torch.Tensor
y_data : np.array | torch.Tensor
Variable to predict.
groups : torch.Tensor | None, optional
groups : np.array | torch.Tensor | None, optional
Groups to split by, default None. If None is passed, then does standard KFold,
otherwise implements GroupShuffleSplit (if shuffle is True),
or GroupKFold (if shuffle is False).
Expand Down Expand Up @@ -303,9 +328,9 @@ class Folds:

def __init__(
self,
x_data: torch.Tensor,
y_data: torch.Tensor,
groups: torch.Tensor | None = None,
x_data: np.array | torch.Tensor,
y_data: np.array | torch.Tensor,
groups: np.array | torch.Tensor | None = None,
n_splits: int = 5,
valid_size: float | None = 0.33,
indices: tuple[tuple[Iterable[int], Iterable[int] | None, Iterable[int]]]
Expand All @@ -314,22 +339,22 @@ def __init__(
random_state: int = 42,
):
if n_splits < 2:
msg = "n_splits should be at least 2"
msg = "n_splits should be at least 2."
raise ValueError(msg)
if x_data.shape[0] != y_data.shape[0]:
msg = (
"x_data and y_data do not have compatible shapes "
"(need to have same number of samples)"
"(need to have same number of samples)."
)
raise ValueError(msg)
if groups is not None and x_data.shape[0] != len(groups):
msg = (
"x_data and groups do not have compatible shapes "
"(need to have same number of samples)"
"(need to have same number of samples)."
)
raise ValueError(msg)
if valid_size is not None and ((valid_size < 0) or (valid_size > 1)):
msg = "valid_size must be between 0 and 1"
msg = "valid_size must be between 0 and 1."
raise ValueError(msg)

self.x_data = x_data
Expand All @@ -344,15 +369,15 @@ def __init__(

# indices are provided, so use these to split the dataset
# check that indices are a tuple of length k
msg = "if indices are provided, it must be a tuple of length n_splits"
msg = "if indices are provided, it must be a tuple of length n_splits."
if not isinstance(indices, tuple):
raise TypeError(msg)
if len(indices) != self.n_splits:
raise ValueError(msg)

for k in range(self.n_splits):
# check that indices[k] is a tuple of length 3
msg = "each item in indices must be a tuple of length 3"
msg = f"each item in indices must be a tuple of length 3: fold {k} is not."
if not isinstance(indices[k], tuple):
raise TypeError(msg)
if len(indices[k]) != 3:
Expand All @@ -368,7 +393,7 @@ def __init__(
)
msg = (
f"in the {problem_set} indices, for fold {k}, "
"some of the indices will be out of range"
"some of the indices will be out of range."
)
raise IndexError(msg)

Expand Down Expand Up @@ -418,6 +443,8 @@ def __init__(
# store indices
self.fold_indices[k] = (train_index, valid_index, test_index)

self.fold_indices = tuple(self.fold_indices)

def get_splits(
self,
fold_index: int,
Expand All @@ -426,12 +453,12 @@ def get_splits(
) -> (
tuple[DataLoader, DataLoader, DataLoader]
| tuple[
torch.Tensor,
torch.Tensor,
torch.Tensor,
torch.Tensor,
torch.Tensor,
torch.Tensor,
np.array | torch.Tensor,
np.array | torch.Tensor,
np.array | torch.Tensor,
np.array | torch.Tensor,
np.array | torch.Tensor,
np.array | torch.Tensor,
]
):
"""
Expand All @@ -451,13 +478,13 @@ def get_splits(
Returns
-------
tuple[DataLoader] | tuple[torch.Tensor]
tuple[DataLoader] | tuple[np.array | torch.Tensor]
If `as_DataLoader` is True, return tuple of `torch.utils.data.dataloader.DataLoader` objects:
- First element is training dataset
- Second element is validation dataset
- Third element is testing dataset
If `as_DataLoader` is False, returns tuple of `torch.Tensors`:
If `as_DataLoader` is False, returns tuple of either `numpy.array`s or `torch.Tensor`s:
- First element is features for training dataset
- Second element is labels for training dataset
- Third element is features for validation dataset
Expand Down Expand Up @@ -503,11 +530,27 @@ def get_splits(
if as_DataLoader:
# return datasets as DataLoader objects if requested
if valid_index is not None:
# make sure that x_valid and y_valid are torch tensors
if isinstance(x_valid, np.ndarray):
x_valid = torch.from_numpy(x_valid)
if isinstance(y_valid, np.ndarray):
y_valid = torch.from_numpy(y_valid)

valid = TensorDataset(x_valid, y_valid)
valid_loader = DataLoader(dataset=valid, **data_loader_args)
else:
valid_loader = None

# make sure that x_train, y_train, x_test and y_test are torch tensors
if isinstance(x_train, np.ndarray):
x_train = torch.from_numpy(x_train)
if isinstance(y_train, np.ndarray):
y_train = torch.from_numpy(y_train)
if isinstance(x_test, np.ndarray):
x_test = torch.from_numpy(x_test)
if isinstance(y_test, np.ndarray):
y_test = torch.from_numpy(y_test)

train = TensorDataset(x_train, y_train)
test = TensorDataset(x_test, y_test)
train_loader = DataLoader(dataset=train, **data_loader_args)
Expand Down
Loading

0 comments on commit f88c81e

Please sign in to comment.