diff --git a/aeon/classification/dictionary_based/_tde_fast.py b/aeon/classification/dictionary_based/_tde_fast.py
new file mode 100644
index 0000000000..3f1da743c3
--- /dev/null
+++ b/aeon/classification/dictionary_based/_tde_fast.py
@@ -0,0 +1,918 @@
+# -*- coding: utf-8 -*-
+"""TDE classifiers.
+
+Dictionary based TDE classifiers based on the SFA transform. Contains a single
+IndividualTDE and FastTemporalDictionaryEnsemble.
+"""
+
+__author__ = ["MatthewMiddlehurst", "patrickzib"]
+__all__ = ["FastTemporalDictionaryEnsemble", "IndividualTDE", "histogram_intersection"]
+
+import math
+import time
+import warnings
+
+import numpy as np
+from joblib import Parallel, delayed
+from numba import njit, types
+from scipy.sparse import csr_matrix, hstack
+from sklearn import preprocessing
+from sklearn.kernel_ridge import KernelRidge
+from sklearn.utils import check_random_state
+
+from aeon.classification.base import BaseClassifier
+from aeon.transformations.collection.dictionary_based import SFAFast
+from aeon.utils.validation.panel import check_X_y
+
+
+class FastTemporalDictionaryEnsemble(BaseClassifier):
+    """Temporal Dictionary Ensemble (TDE).
+
+    Implementation of the dictionary based Temporal Dictionary Ensemble as described
+    in [1]_.
+
+    Overview: Input "n" series of length "m" with "d" dimensions.
+    TDE searches "k" parameter values selected using a Gaussian process
+    regressor, evaluating each with LOOCV. It then retains "s"
+    ensemble members.
+    There are six primary parameters for individual classifiers:
+        - alpha: alphabet size
+        - w: window length
+        - l: word length
+        - p: normalise/no normalise
+        - h: levels
+        - b: MCB/IGB
+    For any combination, an individual TDE classifier slides a window of
+    length w along the series. The w length window is shortened to
+    an l length word by taking a Fourier transform and keeping the
+    first l/2 complex coefficients. These l coefficients are then discretised
+    into alpha possible values, forming a word of length l using breakpoints
+    found using b. A histogram of words for each series is formed and stored,
+    using a spatial pyramid of h levels. For multivariate series, accuracy
+    from a reduced histogram is used to select dimensions.
+
+    fit involves finding n histograms.
+    predict uses 1 nearest neighbour with a histogram intersection
+    distance function.
+
+    Parameters
+    ----------
+    n_parameter_samples : int, default=250
+        Number of parameter combinations to consider for the final ensemble.
+    max_ensemble_size : int, default=50
+        Maximum number of estimators in the ensemble.
+    max_win_len_prop : float, default=1
+        Maximum window length as a proportion of series length, must be between 0
+        and 1.
+    min_window : int, default=10
+        Minimum window length.
+    randomly_selected_params : int, default=50
+        Number of parameters randomly selected before the Gaussian process parameter
+        selection is used.
+    bigrams : boolean or None, default=None
+        Whether to use bigrams, defaults to true for univariate data and false for
+        multivariate data.
+    dim_threshold : float, default=0.85
+        Dimension accuracy threshold for multivariate data, must be between 0 and 1.
+    max_dims : int, default=20
+        Max number of dimensions per classifier for multivariate data.
+    time_limit_in_minutes : int, default=0
+        Time contract to limit build time in minutes, overriding n_parameter_samples.
+        Default of 0 means n_parameter_samples is used.
+    contract_max_n_parameter_samples : int, default=np.inf
+        Max number of parameter combinations to consider when time_limit_in_minutes
+        is set.
+    typed_dict : bool, default=True
+        Use a numba typed Dict to store word counts. May increase memory usage, but
+        will be faster for larger datasets. As the Dict cannot be pickled currently,
+        there will be some overhead converting it to a python dict with multiple
+        threads and pickling.
+    save_train_predictions : bool, default=False
+        Save the ensemble member train predictions in fit for use in _get_train_probs
+        leave-one-out cross-validation.
+    n_jobs : int, default=1
+        The number of jobs to run in parallel for both `fit` and `predict`.
+        ``-1`` means using all processors.
+    random_state : int or None, default=None
+        Seed for random number generation.
+
+    Attributes
+    ----------
+    n_classes_ : int
+        The number of classes.
+    classes_ : list
+        The class labels.
+    n_instances_ : int
+        The number of train cases.
+    n_dims_ : int
+        The number of dimensions per case.
+    series_length_ : int
+        The length of each series.
+    n_estimators_ : int
+        The final number of classifiers used (<= max_ensemble_size).
+    estimators_ : list of shape (n_estimators) of IndividualTDE
+        The collections of estimators trained in fit.
+    weights_ : list of shape (n_estimators) of float
+        Weight of each estimator in the ensemble.
+
+    See Also
+    --------
+    IndividualTDE, ContractableBOSS
+
+    Notes
+    -----
+    For the Java version, see
+    `TSML `_.
+
+    References
+    ----------
+    .. [1] Matthew Middlehurst, James Large, Gavin Cawley and Anthony Bagnall
+       "The Temporal Dictionary Ensemble (TDE) Classifier for Time Series
+       Classification", in proceedings of the European Conference on Machine Learning
+       and Principles and Practice of Knowledge Discovery in Databases, 2020.
+
+    Examples
+    --------
+    >>> from aeon.classification.dictionary_based import FastTemporalDictionaryEnsemble
+    >>> from aeon.datasets import load_unit_test
+    >>> X_train, y_train = load_unit_test(split="train", return_X_y=True)
+    >>> X_test, y_test = load_unit_test(split="test", return_X_y=True)
+    >>> clf = FastTemporalDictionaryEnsemble(
+    ...     n_parameter_samples=10,
+    ...     max_ensemble_size=3,
+    ...     randomly_selected_params=5,
+    ... )
+    >>> clf.fit(X_train, y_train)
+    FastTemporalDictionaryEnsemble(...)
+    >>> y_pred = clf.predict(X_test)
+    """
+
+    _tags = {
+        "capability:multivariate": False,
+        "capability:train_estimate": True,
+        "capability:contractable": True,
+        "capability:multithreading": True,
+        "classifier_type": "dictionary",
+    }
+
+    def __init__(
+        self,
+        n_parameter_samples=250,
+        max_ensemble_size=50,
+        max_win_len_prop=1,
+        min_window=10,
+        randomly_selected_params=50,
+        bigrams=None,
+        dim_threshold=0.85,
+        max_dims=20,
+        time_limit_in_minutes=0.0,
+        contract_max_n_parameter_samples=np.inf,
+        typed_dict=True,
+        save_train_predictions=False,
+        n_jobs=1,
+        random_state=None,
+    ):
+        self.n_parameter_samples = n_parameter_samples
+        self.max_ensemble_size = max_ensemble_size
+        self.max_win_len_prop = max_win_len_prop
+        self.min_window = min_window
+        self.randomly_selected_params = randomly_selected_params
+        self.bigrams = bigrams
+
+        # multivariate
+        self.dim_threshold = dim_threshold
+        self.max_dims = max_dims
+
+        self.time_limit_in_minutes = time_limit_in_minutes
+        self.contract_max_n_parameter_samples = contract_max_n_parameter_samples
+        self.typed_dict = typed_dict
+        self.save_train_predictions = save_train_predictions
+        self.random_state = random_state
+        self.n_jobs = n_jobs
+
+        self.n_instances_ = 0
+        self.n_dims_ = 0
+        self.series_length_ = 0
+        self.n_estimators_ = 0
+        self.estimators_ = []
+        self.weights_ = []
+
+        self._word_lengths = [16, 14, 12, 10, 8]
+        self._norm_options = [True, False]
+        self._levels = [1, 2, 3]
+        self._igb_options = [True, False]
+        self._alphabet_size = 4
+        self._weight_sum = 0
+        self._prev_parameters_x = []
+        self._prev_parameters_y = []
+        self._min_window = min_window
+
+        super(FastTemporalDictionaryEnsemble, self).__init__()
+
+    def _fit(self, X, y):
+        """Fit an ensemble on cases (X,y), where y is the target variable.
+
+        Build an ensemble of base TDE classifiers from the training set (X, y),
+        through an optimised selection over the parameter space, to make a
+        fixed-size ensemble of the best.
+
+        Parameters
+        ----------
+        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
+            The training data.
+        y : array-like, shape = [n_instances]
+            The class labels.
+
+        Returns
+        -------
+        self :
+            Reference to self.
+
+        Notes
+        -----
+        Changes state by creating a fitted model that updates attributes
+        ending in "_" and sets is_fitted flag to True.
+        """
+        if self.n_parameter_samples <= self.randomly_selected_params:
+            warnings.warn(
+                "FastTemporalDictionaryEnsemble warning: n_parameter_samples <= "
+                "randomly_selected_params, ensemble member parameters will be fully "
+                "randomly selected.",
+                stacklevel=1,
+            )
+
+        self.n_instances_, self.n_dims_, self.series_length_ = X.shape
+
+        self.estimators_ = []
+        self.weights_ = []
+        self._prev_parameters_x = []
+        self._prev_parameters_y = []
+
+        # Window length parameter space dependent on series length
+        max_window_searches = self.series_length_ / 4
+        max_window = int(self.series_length_ * self.max_win_len_prop)
+
+        if self.min_window >= max_window:
+            self._min_window = max_window
+            warnings.warn(
+                f"FastTemporalDictionaryEnsemble warning: min_window = "
+                f"{self.min_window} is larger than max_window = {max_window}."
+ f" min_window has been set to {max_window}.", + stacklevel=1, + ) + + win_inc = int((max_window - self._min_window) / max_window_searches) + if win_inc < 1: + win_inc = 1 + + possible_parameters = self._unique_parameters(max_window, win_inc) + num_classifiers = 0 + subsample_size = int(self.n_instances_ * 0.7) + lowest_acc = 1 + lowest_acc_idx = 0 + + time_limit = self.time_limit_in_minutes * 60 + start_time = time.time() + train_time = 0 + if time_limit > 0: + n_parameter_samples = 0 + contract_max_n_parameter_samples = self.contract_max_n_parameter_samples + else: + n_parameter_samples = self.n_parameter_samples + contract_max_n_parameter_samples = np.inf + + rng = check_random_state(self.random_state) + + if self.bigrams is None: + if self.n_dims_ > 1: + use_bigrams = False + else: + use_bigrams = True + else: + use_bigrams = self.bigrams + + # use time limit or n_parameter_samples if limit is 0 + while ( + ( + train_time < time_limit + and num_classifiers < contract_max_n_parameter_samples + ) + or num_classifiers < n_parameter_samples + ) and len(possible_parameters) > 0: + if num_classifiers < self.randomly_selected_params: + parameters = possible_parameters.pop( + rng.randint(0, len(possible_parameters)) + ) + else: + scaler = preprocessing.StandardScaler() + scaler.fit(self._prev_parameters_x) + gp = KernelRidge(kernel="poly", degree=1) + gp.fit( + scaler.transform(self._prev_parameters_x), self._prev_parameters_y + ) + preds = gp.predict(scaler.transform(possible_parameters)) + parameters = possible_parameters.pop( + rng.choice(np.flatnonzero(preds == preds.max())) + ) + + subsample = rng.choice( + self.n_instances_, size=subsample_size, replace=False + ) + X_subsample = X[subsample] + y_subsample = y[subsample] + + tde = IndividualTDE( + *parameters, + alphabet_size=self._alphabet_size, + bigrams=use_bigrams, + dim_threshold=self.dim_threshold, + max_dims=self.max_dims, + typed_dict=self.typed_dict, + n_jobs=self._threads_to_use, + random_state=self.random_state, + ) + tde.fit(X_subsample, y_subsample) + tde._subsample = subsample + + tde._accuracy = self._individual_train_acc( + tde, + y_subsample, + subsample_size, + 0 if num_classifiers < self.max_ensemble_size else lowest_acc, + ) + if tde._accuracy > 0: + weight = math.pow(tde._accuracy, 4) + else: + weight = 0.000000001 + + if num_classifiers < self.max_ensemble_size: + if tde._accuracy < lowest_acc: + lowest_acc = tde._accuracy + lowest_acc_idx = num_classifiers + self.weights_.append(weight) + self.estimators_.append(tde) + elif tde._accuracy > lowest_acc: + self.weights_[lowest_acc_idx] = weight + self.estimators_[lowest_acc_idx] = tde + lowest_acc, lowest_acc_idx = self._worst_ensemble_acc() + + self._prev_parameters_x.append(parameters) + self._prev_parameters_y.append(tde._accuracy) + + num_classifiers += 1 + train_time = time.time() - start_time + + self.n_estimators_ = len(self.estimators_) + self._weight_sum = np.sum(self.weights_) + + return self + + def _predict(self, X) -> np.ndarray: + """Predict class values of n instances in X. + + Parameters + ---------- + X : 3D np.array of shape = [n_instances, n_dimensions, series_length] + The data to make predictions for. + + Returns + ------- + y : array-like, shape = [n_instances] + Predicted class labels. 
+ """ + rng = check_random_state(self.random_state) + return np.array( + [ + self.classes_[int(rng.choice(np.flatnonzero(prob == prob.max())))] + for prob in self._predict_proba(X) + ] + ) + + def _predict_proba(self, X) -> np.ndarray: + """Predict class probabilities for n instances in X. + + Parameters + ---------- + X : 3D np.array of shape = [n_instances, n_dimensions, series_length] + The data to make predict probabilities for. + + Returns + ------- + y : array-like, shape = [n_instances, n_classes_] + Predicted probabilities using the ordering in classes_. + """ + _, _, series_length = X.shape + if series_length != self.series_length_: + raise TypeError( + "ERROR number of attributes in the train does not match " + "that in the test data" + ) + + sums = np.zeros((X.shape[0], self.n_classes_)) + + for n, clf in enumerate(self.estimators_): + preds = clf.predict(X) + for i in range(0, X.shape[0]): + sums[i, self._class_dictionary[preds[i]]] += self.weights_[n] + + return sums / (np.ones(self.n_classes_) * self._weight_sum) + + def _worst_ensemble_acc(self): + min_acc = 1.0 + min_acc_idx = 0 + + for c, classifier in enumerate(self.estimators_): + if classifier._accuracy < min_acc: + min_acc = classifier._accuracy + min_acc_idx = c + + return min_acc, min_acc_idx + + def _unique_parameters(self, max_window, win_inc): + possible_parameters = [ + [win_size, word_len, normalise, levels, igb] + for normalise in self._norm_options + for win_size in range(self._min_window, max_window + 1, win_inc) + for word_len in self._word_lengths + for levels in self._levels + for igb in self._igb_options + ] + + return possible_parameters + + def _get_train_probs(self, X, y, train_estimate_method="loocv") -> np.ndarray: + self.check_is_fitted() + X, y = check_X_y(X, y, coerce_to_numpy=True) + + n_instances, n_dims, series_length = X.shape + + if ( + n_instances != self.n_instances_ + or n_dims != self.n_dims_ + or series_length != self.series_length_ + ): + raise ValueError( + "n_instances, n_dims, series_length mismatch. X should be " + "the same as the training data used in fit for generating train " + "probabilities." + ) + + results = np.zeros((n_instances, self.n_classes_)) + divisors = np.zeros(n_instances) + + if train_estimate_method.lower() == "loocv": + for i, clf in enumerate(self.estimators_): + subsample = clf._subsample + preds = ( + clf._train_predictions + if self.save_train_predictions + else Parallel(n_jobs=self._threads_to_use, prefer="threads")( + delayed(clf._train_predict)( + i, + ) + for i in range(len(subsample)) + ) + ) + + for n, pred in enumerate(preds): + results[subsample[n]][ + self._class_dictionary[pred] + ] += self.weights_[i] + divisors[subsample[n]] += self.weights_[i] + elif train_estimate_method.lower() == "oob": + indices = range(n_instances) + for i, clf in enumerate(self.estimators_): + oob = [n for n in indices if n not in clf._subsample] + + if len(oob) == 0: + continue + + preds = clf.predict(X[oob]) + + for n, pred in enumerate(preds): + results[oob[n]][self._class_dictionary[pred]] += self.weights_[i] + divisors[oob[n]] += self.weights_[i] + else: + raise ValueError( + "Invalid train_estimate_method. 
Available options: loocv, oob"
+            )
+
+        for i in range(n_instances):
+            results[i] = (
+                np.ones(self.n_classes_) * (1 / self.n_classes_)
+                if divisors[i] == 0
+                else results[i] / (np.ones(self.n_classes_) * divisors[i])
+            )
+
+        return results
+
+    def _individual_train_acc(self, tde, y, train_size, lowest_acc):
+        correct = 0
+        required_correct = int(lowest_acc * train_size)
+
+        if self._threads_to_use > 1:
+            c = Parallel(n_jobs=self._threads_to_use, prefer="threads")(
+                delayed(tde._train_predict)(
+                    i,
+                )
+                for i in range(train_size)
+            )
+
+            for i in range(train_size):
+                if correct + train_size - i < required_correct:
+                    return -1
+                elif c[i] == y[i]:
+                    correct += 1
+
+                if self.save_train_predictions:
+                    tde._train_predictions.append(c[i])
+
+        else:
+            for i in range(train_size):
+                if correct + train_size - i < required_correct:
+                    return -1
+
+                c = tde._train_predict(i)
+
+                if c == y[i]:
+                    correct += 1
+
+                if self.save_train_predictions:
+                    tde._train_predictions.append(c)
+
+        return correct / train_size
+
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return testing parameter settings for the estimator.
+
+        Parameters
+        ----------
+        parameter_set : str, default="default"
+            Name of the set of test parameters to return, for use in tests. If no
+            special parameters are defined for a value, will return `"default"` set.
+            For classifiers, a "default" set of parameters should be provided for
+            general testing, and a "results_comparison" set for comparing against
+            previously recorded results if the general set does not produce suitable
+            probabilities to compare against.
+
+        Returns
+        -------
+        params : dict or list of dict, default={}
+            Parameters to create testing instances of the class.
+            Each dict holds parameters to construct an "interesting" test instance,
+            i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a valid test
+            instance. `create_test_instance` uses the first (or only) dictionary in
+            `params`.
+        """
+        if parameter_set == "results_comparison":
+            return {
+                "n_parameter_samples": 10,
+                "max_ensemble_size": 5,
+                "randomly_selected_params": 5,
+            }
+        else:
+            return {
+                "n_parameter_samples": 5,
+                "max_ensemble_size": 2,
+                "randomly_selected_params": 3,
+                "save_train_predictions": True,
+            }
+
+
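# A minimal, hypothetical sketch of the surrogate-guided parameter selection
# performed in _fit above: after `randomly_selected_params` random draws, a
# degree-1 polynomial KernelRidge is fitted to (parameters, accuracy) pairs and
# the most promising untried combination is chosen next. Names such as
# `pick_next_parameters`, `tried_x` and `candidates` are illustrative only and
# not part of the PR.
import numpy as np
from sklearn import preprocessing
from sklearn.kernel_ridge import KernelRidge


def pick_next_parameters(tried_x, tried_y, candidates, rng):
    scaler = preprocessing.StandardScaler().fit(tried_x)
    surrogate = KernelRidge(kernel="poly", degree=1)
    surrogate.fit(scaler.transform(tried_x), tried_y)
    preds = surrogate.predict(scaler.transform(candidates))
    # break ties between equally promising candidates at random
    return candidates[rng.choice(np.flatnonzero(preds == preds.max()))]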
+class IndividualTDE(BaseClassifier):
+    """Single TDE classifier, an extension of the Bag of SFA Symbols (BOSS) model.
+
+    Base classifier for the TDE classifier. Implementation of the single TDE base
+    model from Middlehurst et al. (2020) [1]_.
+
+    Overview: input "n" series of length "m" and IndividualTDE performs an SFA
+    transform to form a sparse dictionary of discretised words. The resulting
+    dictionary is used with the histogram intersection distance function in a
+    1-nearest neighbour classifier.
+
+    fit involves finding "n" histograms.
+
+    predict uses 1 nearest neighbour with the histogram intersection distance
+    function.
+
+    Parameters
+    ----------
+    window_size : int, default=10
+        Size of the window to use in the SFA transform.
+    word_length : int, default=8
+        Length of word to use in the SFA transform.
+    norm : bool, default=False
+        Whether to normalize SFA words by dropping the first Fourier coefficient.
+    levels : int, default=1
+        The number of spatial pyramid levels for the SFA transform.
+    igb : bool, default=False
+        Whether to use Information Gain Binning (IGB) or
+        Multiple Coefficient Binning (MCB) for the SFA transform.
+    alphabet_size : int, default=4
+        Number of possible letters (values) for each word.
+    bigrams : bool, default=True
+        Whether to record word bigrams in the SFA transform.
+    dim_threshold : float, default=0.85
+        Accuracy threshold as a proportion of the highest accuracy dimension for
+        words extracted from each dimension. Only applicable for multivariate data.
+    max_dims : int, default=20
+        Maximum number of dimensions words are extracted from. Only applicable for
+        multivariate data.
+    typed_dict : bool, default=True
+        Use a numba typed Dict to store word counts. May increase memory usage, but
+        will be faster for larger datasets.
+    n_jobs : int, default=1
+        The number of jobs to run in parallel for both `fit` and `predict`.
+        ``-1`` means using all processors.
+    random_state : int or None, default=None
+        Seed for random number generation.
+
+    Attributes
+    ----------
+    n_classes_ : int
+        The number of classes.
+    classes_ : list
+        The class labels.
+    n_instances_ : int
+        The number of train cases.
+    n_dims_ : int
+        The number of dimensions per case.
+    series_length_ : int
+        The length of each series.
+
+    See Also
+    --------
+    FastTemporalDictionaryEnsemble, SFAFast
+
+    Notes
+    -----
+    For the Java version, see
+    `TSML `_.
+
+    References
+    ----------
+    .. [1] Matthew Middlehurst, James Large, Gavin Cawley and Anthony Bagnall
+       "The Temporal Dictionary Ensemble (TDE) Classifier for Time Series
+       Classification", in proceedings of the European Conference on Machine Learning
+       and Principles and Practice of Knowledge Discovery in Databases, 2020.
+
+    Examples
+    --------
+    >>> from aeon.classification.dictionary_based import IndividualTDE
+    >>> from aeon.datasets import load_unit_test
+    >>> X_train, y_train = load_unit_test(split="train", return_X_y=True)
+    >>> X_test, y_test = load_unit_test(split="test", return_X_y=True)
+    >>> clf = IndividualTDE()
+    >>> clf.fit(X_train, y_train)
+    IndividualTDE(...)
+    >>> y_pred = clf.predict(X_test)
+    """
+
+    _tags = {
+        "capability:multivariate": False,
+        "capability:multithreading": True,
+    }
+
+    def __init__(
+        self,
+        window_size=10,
+        word_length=8,
+        norm=False,
+        levels=1,
+        igb=False,
+        alphabet_size=4,
+        bigrams=True,
+        dim_threshold=0.85,
+        max_dims=20,
+        typed_dict=True,
+        n_jobs=1,
+        random_state=None,
+    ):
+        self.window_size = window_size
+        self.word_length = word_length
+        self.norm = norm
+        self.levels = levels
+        self.igb = igb
+        self.alphabet_size = alphabet_size
+        self.bigrams = bigrams
+
+        # multivariate
+        self.dim_threshold = dim_threshold
+        self.max_dims = max_dims
+
+        self.typed_dict = typed_dict
+        self.n_jobs = n_jobs
+        self.random_state = random_state
+
+        self.n_instances_ = 0
+        self.n_dims_ = 0
+        self.series_length_ = 0
+
+        self._transformers = []
+        self._transformed_data = []
+        self._class_vals = []
+        self._dims = []
+        self._highest_dim_bit = 0
+        self._accuracy = 0
+        self._subsample = []
+        self._train_predictions = []
+
+        super(IndividualTDE, self).__init__()
+
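+    # _fit below maps the TDE parameters onto a single SFAFast transform:
+    # `igb` switches between information-gain and equi-depth (MCB) binning,
+    # and when `levels` > 1 the raw words are kept (save_words=True) so the
+    # spatial pyramid histograms can be rebuilt by _add_to_pyramid.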
+ """ + self.n_instances_, self.n_dims_, self.series_length_ = X.shape + self._class_vals = y + + self._transformers.append( + SFAFast( + word_length=self.word_length, + alphabet_size=self.alphabet_size, + window_size=self.window_size, + norm=self.norm, + # levels=self.levels, + binning_method="information-gain" if self.igb else "equi-depth", + bigrams=self.bigrams, + remove_repeat_words=True if self.levels == 1 else False, + lower_bounding=False, + save_words=True if self.levels > 1 else False, # only needed for levels + n_jobs=self._threads_to_use, + ) + # SFA( + # word_length=self.word_length, + # alphabet_size=self.alphabet_size, + # window_size=self.window_size, + # norm=self.norm, + # levels=self.levels, + # binning_method="information-gain" if self.igb else "equi-depth", + # bigrams=self.bigrams, + # remove_repeat_words=True, + # lower_bounding=False, + # save_words=False, + # use_fallback_dft=True, + # typed_dict=self.typed_dict, + # n_jobs=self._threads_to_use, + # ) + ) + bag = self._transformers[0].fit_transform(X, y) + + # adding pyramids from raw words + if self.levels > 1: + bag = self._add_to_pyramid() + + self._transformed_data = bag + + def _add_to_pyramid(self): + SFA = self._transformers[0] + words = SFA.words + + sfa_words = [] + for lev in range(1, self.levels): + # number of quadrants equal to pow(2, level-1) + level = pow(2, lev - 1) + for i in range(0, level): + quadrant_words = words[ + :, + int(i * words.shape[-1] / level) : int( + (i + 1) * words.shape[-1] / level + ), + ] + + if quadrant_words.shape[-1] > 0: + sfa_words.append( + SFA.transform_to_bag( + quadrant_words, SFA.word_length_actual, None + ) + ) + + if type(sfa_words[0]) is np.ndarray: + all_words = np.concatenate(sfa_words, axis=1) + else: + all_words = hstack(sfa_words) + + return all_words + + def _predict(self, X): + """Predict class values of all instances in X. + + Parameters + ---------- + X : 3D np.array of shape = [n_instances, n_dimensions, series_length] + The data to make predictions for. + + Returns + ------- + y : array-like, shape = [n_instances] + Predicted class labels. + """ + test_bags = self._transformers[0].transform(X) + + # adding pyramids from raw words + if self.levels > 1: + test_bags = self._add_to_pyramid() + + classes = Parallel(n_jobs=self._threads_to_use, prefer="threads")( + delayed(self._test_nn)( + test_bag, + ) + for test_bag in test_bags + ) + + return np.array(classes) + + def _test_nn(self, test_bag): + rng = check_random_state(self.random_state) + + best_sim = -1 + nn = None + + for n, bag in enumerate(self._transformed_data): + sim = histogram_intersection(test_bag, bag) + + if sim > best_sim or (sim == best_sim and rng.random() < 0.5): + best_sim = sim + nn = self._class_vals[n] + + return nn + + def _train_predict(self, train_num, bags=None): + if bags is None: + bags = self._transformed_data + + test_bag = bags[train_num] + best_sim = -1 + nn_label = None + + for n, bag in enumerate(bags): + if n == train_num: + continue + + sim = histogram_intersection(test_bag, bag) + + if sim > best_sim: + best_sim = sim + nn_label = self._class_vals[n] + + return nn_label + + +def histogram_intersection(first, second): + """Find the distance between two histograms using the histogram intersection. + + This distance function is designed for sparse matrix, represented as a + dictionary or numba Dict, but can accept arrays. + + Parameters + ---------- + first : dict, numba.Dict or array + First dictionary used in distance measurement. 
+
+
+def histogram_intersection(first, second):
+    """Find the similarity between two histograms using the histogram intersection.
+
+    This similarity function is designed for sparse histograms stored as a scipy
+    csr_matrix, but can also accept dense arrays. Numba typed Dicts are handled
+    by the _histogram_intersection_dict helper.
+
+    Parameters
+    ----------
+    first : csr_matrix or array
+        First histogram used in the similarity measurement.
+    second : csr_matrix or array
+        Second histogram that will be used to measure similarity with `first`.
+
+    Returns
+    -------
+    sim : float
+        The histogram intersection similarity between the first and second
+        histograms.
+    """
+    if isinstance(first, csr_matrix):
+        return hist_intersection(first, second)
+    else:  # dense numpy arrays
+        return np.sum(np.minimum(first, second))
+
+
+@njit(fastmath=True, cache=True)
+def _histogram_intersection_dict(first, second):
+    sim = 0
+    for word, val_a in first.items():
+        val_b = second.get(word, types.uint32(0))
+        sim += min(val_a, val_b)
+    return sim
+
+
+def hist_intersection(X, Y):
+    """Compute the histogram intersection for two scipy csr_matrix."""
+    # min(X_j, Y_j) can only be non-zero where Y_j is non-zero, so restrict
+    # the computation to the non-zero columns of Y.
+    mask = Y.nonzero()[-1]
+    return np.sum(X[:, mask].minimum(Y[:, mask]))
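# A small worked example of the histogram intersection defined above: only the
# overlapping mass of two histograms counts, so identical histograms score
# their total weight and disjoint ones score zero.
import numpy as np
from scipy.sparse import csr_matrix

a = np.array([[3, 0, 2, 1]])
b = np.array([[1, 4, 2, 0]])
# min(3,1) + min(0,4) + min(2,2) + min(1,0) = 3
print(histogram_intersection(a[0], b[0]))  # dense path -> 3
print(hist_intersection(csr_matrix(a), csr_matrix(b)))  # sparse path -> 3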
diff --git a/aeon/transformations/collection/dictionary_based/_sfa.py b/aeon/transformations/collection/dictionary_based/_sfa.py
index 9bf638e3d5..2358b11626 100644
--- a/aeon/transformations/collection/dictionary_based/_sfa.py
+++ b/aeon/transformations/collection/dictionary_based/_sfa.py
@@ -3,7 +3,7 @@
 Configurable SFA transform for discretising time series into words.
 """
 
-__maintainer__ = []
+__maintainer__ = ["patrickzib", "MatthewMiddlehurst"]
 __all__ = ["SFA"]
 
 import math
@@ -21,13 +21,14 @@
 from aeon.transformations.collection import BaseCollectionTransformer
 
-# The binning methods to use: equi-depth, equi-width, information gain or kmeans
+# The binning methods to use
 binning_methods = {
     "equi-depth",
     "equi-width",
     "information-gain",
     "information-gain-mae",
     "kmeans",
+    "quantile",
 }
 
@@ -40,47 +41,42 @@ class SFA(BaseCollectionTransformer):
     shorten the series with DFT
     discretise the shortened series into bins set by MFC
     form a word from these discrete values
-    by default SFA produces a single word per series (window_size=0)
-    if a window is used, it forms a histogram of counts of words.
+    SFA returns a dictionary of word counts for each series.
+
+    This is a slower but more flexible version of the SFA transform, which can
+    store words longer than 64 bits, at the cost of efficiency.
 
     Parameters
     ----------
-    word_length: int, default = 8
-        length of word to shorten window to (using PAA)
-
-    alphabet_size: int, default = 4
-        number of values to discretise each value to
-
-    window_size: int, default = 12
-        size of window for sliding. Input series
-        length for whole series transform
-
-    norm: boolean, default = False
-        mean normalise words by dropping first fourier coefficient
-
-    binning_method: {"equi-depth", "equi-width", "information-gain",
-        "information-gain-mae", "kmeans"}, default="equi-depth"
-        the binning method used to derive the breakpoints.
-
-    anova: boolean, default = False
-        If True, the Fourier coefficient selection is done via a one-way
-        ANOVA test. If False, the first Fourier coefficients are selected.
-        Only applicable if labels are given
-
-    bigrams: boolean, default = False
-        whether to create bigrams of SFA words
-
-    skip_grams: boolean, default = False
-        whether to create skip-grams of SFA words
-
-    remove_repeat_words: boolean, default = False
-        whether to use numerosity reduction (default False)
+    word_length : int, default=8
+        Length of word to shorten window to (using PAA).
+    alphabet_size : int, default=4
+        Number of values to discretise each value to.
+    window_size : int, default=12
+        Size of window for sliding. Input series length for whole series transform.
+    norm : boolean, default=False
+        Mean normalise words by dropping first fourier coefficient.
+    binning_method : str, default="equi-depth"
+        The binning method used to derive the breakpoints. One of {"equi-depth",
+        "equi-width", "information-gain", "information-gain-mae", "kmeans",
+        "quantile"}.
+    anova : boolean, default=False
+        If True, the Fourier coefficient selection is done via a one-way ANOVA test.
+        If False, the first Fourier coefficients are selected. Only applicable if
+        labels are given.
+    bigrams : boolean, default=False
+        Whether to create bigrams of SFA words.
+    skip_grams : boolean, default=False
+        Whether to create skip-grams of SFA words.
+    levels : int, default=1
+        Number of spatial pyramid levels.
+    remove_repeat_words : boolean, default=False
+        Whether to use numerosity reduction.
     lower_bounding_distances : boolean, default = None
        If set to True, the FFT is normed to allow for ED lower bounding.
-    levels: int, default = 1
-        Number of spatial pyramid levels
+
     save_words: boolean, default = False
        whether to save the words generated for each series (default False)
@@ -123,8 +119,8 @@ def __init__(
         anova=False,
         bigrams=False,
         skip_grams=False,
-        remove_repeat_words=False,
         levels=1,
+        remove_repeat_words=False,
         lower_bounding=True,
         lower_bounding_distances=None,
         save_words=False,
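# A hedged usage sketch for the two transforms this PR touches, exercising the
# "quantile" binning option listed above. The toy shapes follow aeon's
# (n_cases, n_channels, n_timepoints) convention; the output types (word
# counts per series for SFA, a word-count matrix for SFAFast) are as described
# in the docstrings above.
import numpy as np
from aeon.transformations.collection.dictionary_based import SFA, SFAFast

rng = np.random.default_rng(0)
X = rng.normal(size=(10, 1, 50))
y = rng.integers(0, 2, size=10)

slow = SFA(word_length=8, alphabet_size=4, window_size=12, binning_method="quantile")
bags = slow.fit_transform(X, y)  # word counts per series

fast = SFAFast(word_length=8, alphabet_size=4, window_size=12, binning_method="quantile")
counts = fast.fit_transform(X, y)  # word-count matrix, one column per word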
diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py
index 91d0e326e9..cf1e85f48e 100644
--- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py
+++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py
@@ -1,25 +1,17 @@
 """Symbolic Fourier Approximation (SFA) Transformer.
 
-Configurable SFA transform for discretising time series into words.
-
+Efficient but rigid SFA transform for discretising time series into words.
 """
 
-__maintainer__ = []
+__maintainer__ = ["patrickzib", "MatthewMiddlehurst"]
 __all__ = ["SFAFast"]
 
 import math
 import sys
-from warnings import simplefilter
 
 import numpy as np
 import pandas as pd
-from numba import (
-    NumbaPendingDeprecationWarning,
-    NumbaTypeSafetyWarning,
-    njit,
-    objmode,
-    prange,
-)
+from numba import njit, objmode, prange
 from numba.core import types
 from numba.typed import Dict
 from scipy.sparse import csr_matrix
@@ -30,7 +22,7 @@
 from aeon.transformations.collection import BaseCollectionTransformer
 
-# The binning methods to use: equi-depth, equi-width, information gain or kmeans
+# The binning methods to use
 binning_methods = {
     "equi-depth",
     "equi-width",
@@ -40,9 +32,6 @@
     "quantile",
 }
 
-simplefilter(action="ignore", category=NumbaPendingDeprecationWarning)
-simplefilter(action="ignore", category=NumbaTypeSafetyWarning)
-
 
 class SFAFast(BaseCollectionTransformer):
     """Symbolic Fourier Approximation (SFA) Transformer.
@@ -53,37 +42,44 @@ class SFAFast(BaseCollectionTransformer):
     shorten the series with DFT
     discretise the shortened series into bins set by MFC
     form a word from these discrete values
-    by default SFA produces a single word per series (window_size=0)
-    if a window is used, it forms a histogram of counts of words.
+    SFA returns an array of word counts for each series consisting of a column for
+    each word found in fit.
+
+    This is a faster but more rigid version of the SFA transform, which can only use
+    up to 64 bit words and does not store the actual words found in its transformed
+    array.
 
     Parameters
     ----------
-    word_length : int, default = 8
+    word_length : int, default=8
         Length of word to shorten window to (using PAA).
-    alphabet_size : int, default = 4
+    alphabet_size : int, default=4
         Number of values to discretise each value to.
-    window_size : int, default = 12
+    window_size : int, default=12
         Size of window for sliding. Input series length for whole series transform.
-    norm : boolean, default = False
+    norm : boolean, default=False
         Mean normalise words by dropping first fourier coefficient.
     binning_method : str, default="equi-depth"
         The binning method used to derive the breakpoints. One of {"equi-depth",
-        "equi-width", "information-gain", "information-gain-mae", "kmeans"},
-    anova : boolean, default = False
+        "equi-width", "information-gain", "information-gain-mae", "kmeans",
+        "quantile"}.
+    anova : boolean, default=False
         If True, the Fourier coefficient selection is done via a one-way ANOVA test.
         If False, the first Fourier coefficients are selected. Only applicable if
         labels are given.
-    variance : boolean, default = False
+    variance : boolean, default=False
         If True, the Fourier coefficient selection is done via the largest variance.
         If False, the first Fourier coefficients are selected. Only applicable if
         labels are given.
+    bigrams : boolean, default=False
+        Whether to create bigrams of SFA words.
+    skip_grams : boolean, default=False
+        Whether to create skip-grams of SFA words.
     dilation : int, default = 0
         When set to dilation > 1, adds dilation to the sliding window operation.
-    save_words : boolean, default = False
-        whether to save the words generated for each series (default False)
-    bigrams : boolean, default = False
-        Whether to create bigrams of SFA words.
-    feature_selection : {"chi2", "chi2_top_k", "none", "random"}, default: none
+    remove_repeat_words : boolean, default=False
+        Whether to use numerosity reduction.
+    feature_selection : {"chi2", "chi2_top_k", "random", None}, default=None
         Sets the feature selections strategy to be used. Large amounts of memory
         may be needed depending on the setting of bigrams (true is more) or
         alpha (larger is more).
@@ -92,19 +88,20 @@
         dropping values based on p-value.
         'random' reduces the number to at most 'max_feature_count',
         by randomly selecting features.
-        'none' does not apply any feature selection and yields large bag of words,
-    p_threshold : int, default=0.05 (disabled by default)
+        None does not apply any feature selection and yields a large bag-of-words.
+    p_threshold : float, default=0.05
         If feature_selection=chi2 is chosen, feature selection is applied based on
         the chi-squared test. This is the p-value threshold to use for chi-squared
         test on bag-of-words (lower means more strict). 1 indicates that the test
         should not be performed.
-    max_feature_count : int, default=256
+    max_feature_count : int, default=256
         If feature_selection=random is chosen, this parameter defines the number of
         randomly chosen unique words used.
-    skip_grams : boolean, default = False
-        Whether to create skip-grams of SFA words.
-    remove_repeat_words : boolean, default = False
-        Whether to use numerosity reduction.
+
+    save_words : boolean, default=False
+        Whether to save the words generated for each series.
     lower_bounding_distances : boolean, default = None
         If set to True, the FFT is normed to allow for ED lower bounding.
     return_sparse : boolean, default=True
@@ -351,9 +348,9 @@ def _transform(self, X, y=None):
             self.lower_bounding or self.lower_bounding_distances,
         )
 
-        # only save at fit
-        # if self.save_words:
-        #     self.words = words
+        # TODO only save at fit?
+ if self.save_words: + self.words = words # transform: applies the feature selection strategy empty_dict = Dict.empty(