Merge pull request #11 from datasig-ac-uk/transform-euclidean
Transform euclidean
rchan26 authored Nov 24, 2023
2 parents 15ef7f7 + 56fff8f commit ee766b8
Showing 9 changed files with 4,077 additions and 80 deletions.
5 changes: 0 additions & 5 deletions .pre-commit-config.yaml
@@ -50,11 +50,6 @@ repos:
       - id: ruff
         args: ["--fix", "--show-fixes"]
 
-  - repo: https://github.com/codespell-project/codespell
-    rev: "v2.2.6"
-    hooks:
-      - id: codespell
-
   - repo: https://github.com/shellcheck-py/shellcheck-py
     rev: "v0.9.0.6"
     hooks:
235 changes: 235 additions & 0 deletions paper-examples/Data.py
@@ -0,0 +1,235 @@
from __future__ import annotations

import pickle
import random
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn.preprocessing  # plain "import sklearn" does not expose the preprocessing submodule
from scipy.io import arff
from sklearn.model_selection import train_test_split

DATA_DIR = "data/"


def get_corpus_and_outlier_paths(df, desired_class):
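    # Split the rows of a UCR dataframe into "normal" (corpus) and outlier paths.
    # Each path pairs a time index 0..T-1 with the row's T observations (the final
    # "target" column is dropped), giving a 2-D stream of shape (T, 2).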
corpus_paths = []
outlier_paths = []

for i in range(df.shape[0]):
if df.iloc[i]["target"] == desired_class:
corpus_paths.append(
np.column_stack([list(range(df.shape[1] - 1)), df.iloc[i][:-1]])
)
else:
outlier_paths.append(
np.column_stack([list(range(df.shape[1] - 1)), df.iloc[i][:-1]])
)

return corpus_paths, outlier_paths


def normalise(streams):
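    # Min-max scale each stream independently so every channel lies in [0, 1].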
return [
sklearn.preprocessing.MinMaxScaler().fit_transform(stream) for stream in streams
]


class Data:
"""
    Hold time-series data sets and provide loading, subsampling and normalisation.
"""

def __init__(self, if_sample=True, n_samples=(800, 10, 10), random_seed=1):
        # unlabelled corpus of streams, stored as a numpy array of numpy arrays
        self.corpus = None
        self.test_inlier = None  # test-set inlier streams
        self.test_outlier = None  # test-set outlier streams
self.if_sample = if_sample
self.n_samples = n_samples
self.random_seed = random_seed

def sample(self):
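        # Subsample the corpus and test sets to the sizes given in n_samples.
        # random.choices draws with replacement, so duplicates are possible.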
random.seed(self.random_seed)
self.corpus = random.choices(list(self.corpus), k=self.n_samples[0])
self.test_inlier = random.choices(list(self.test_inlier), k=self.n_samples[1])
self.test_outlier = random.choices(list(self.test_outlier), k=self.n_samples[2])

def load_pen_digit(self, digit: int = 1):
"""
        Load the pen digit data set, using one digit as the training set.
        :param digit: 0-9, the digit treated as "normal" and used as the training corpus
:return: None
"""
train_df = pd.read_pickle(DATA_DIR + "pen_digit_train.pkl")
test_df = pd.read_pickle(DATA_DIR + "pen_digit_test.pkl")
self.corpus = train_df[train_df["Digit"] == digit]["Stream"].to_numpy()
self.test_inlier = test_df[test_df["Digit"] == digit]["Stream"].to_numpy()
self.test_outlier = test_df[test_df["Digit"] != digit]["Stream"].to_numpy()

if self.if_sample:
self.sample()
self.corpus, self.test_inlier, self.test_outlier = map(
normalise, (self.corpus, self.test_inlier, self.test_outlier)
)

def load_language_data(self):
"""
Load language data set with English and German words.
:return: None
"""
paths = np.load(DATA_DIR + "paths_en_de.npy")
labels = np.load(DATA_DIR + "labels_en_de.npy")
(paths_train, paths_test, labels_train, labels_test) = train_test_split(
paths, labels, random_state=1, test_size=0.2
)
paths_train = paths_train[labels_train == 0]

self.corpus = paths_train
self.test_inlier = paths_test[labels_test == 0]
self.test_outlier = paths_test[labels_test == 1]
if self.if_sample:
self.sample()
self.corpus, self.test_inlier, self.test_outlier = map(
normalise, (self.corpus, self.test_inlier, self.test_outlier)
)

def load_ship_movements(self, thres_distance=32000, n_samples=5000):
"""
:param thres_distance: Must be one of [4000, 8000, 16000, 32000]
:param n_samples: samples taken for each of the train, test_in and test_out
:return:
"""

        # reshape each vessel's records into streams of (LAT, LON, TimeDiff) points
def process_data(data_frame):
data_frame["TimeDiff"] = data_frame["BaseDateTime"].apply(
lambda x: np.append(0, np.diff(x))
)
data_frame = data_frame[["LAT", "LON", "TimeDiff"]]
            res = []
            for i in range(len(data_frame)):
                row = data_frame.iloc[i].to_numpy()
                res.append(np.array(list(zip(row[0], row[1], row[2]))))
            return res

        # draw n_samples substreams, weighted by SUBSTREAM_WEIGHT, with replacement
def sample_data(ais_by_vessel_split, random_state):
return ais_by_vessel_split.sample(
n=n_samples,
weights="SUBSTREAM_WEIGHT",
replace=True,
random_state=random_state,
)

with Path(DATA_DIR + "inlier_mmsis_train.pkl").open("rb") as f:
inlier_mmsis_train = pickle.load(f)
with Path(DATA_DIR + "inlier_mmsis_test.pkl").open("rb") as f:
inlier_mmsis_test = pickle.load(f)
with Path(DATA_DIR + "outlier_mmsis.pkl").open("rb") as f:
outlier_mmsis = pickle.load(f)

if thres_distance not in [4000, 8000, 16000, 32000]:
msg = "thres_distance needs to be in [4000, 8000, 16000, 32000]"
raise ValueError(msg)
ais_by_vessel_split_local = pd.read_pickle(
DATA_DIR + "substreams_" + str(thres_distance) + ".pkl"
)

self.corpus = process_data(
sample_data(
ais_by_vessel_split_local.loc[inlier_mmsis_train],
random_state=self.random_seed,
)
)
self.test_inlier = process_data(
sample_data(
ais_by_vessel_split_local.loc[inlier_mmsis_test],
random_state=self.random_seed,
)
)
self.test_outlier = process_data(
sample_data(
ais_by_vessel_split_local.loc[outlier_mmsis],
random_state=self.random_seed,
)
)
if self.if_sample:
self.sample()
self.corpus, self.test_inlier, self.test_outlier = map(
normalise, (self.corpus, self.test_inlier, self.test_outlier)
)

def load_ucr_dataset(
self, data_set_name="Adiac", anomaly_level=0.001, random_state=1
):
"""
:param data_set_name: Must be one of ['Adiac', 'ArrowHead', 'Beef', 'BeetleFly', 'BirdChicken', 'CBF',
'ChlorineConcentration', 'Coffee', 'ECG200', 'ECGFiveDays', 'FaceFour',
'GunPoint', 'Ham', 'Herring', 'Lightning2', 'Lightning7', 'Meat',
'MedicalImages', 'MoteStrain', 'Plane', 'Strawberry', 'Symbols',
'ToeSegmentation1', 'ToeSegmentation2', 'Trace', 'TwoLeadECG', 'Wafer',
'Wine']
:param anomaly_level: Must be one of [0.001, 0.05]
        :param random_state: seed for the train/test splits
        :return: None
"""
        LEVEL_TO_COLUMN = {0.001: "Atra", 0.05: "A5tra"}
comparisons = pd.read_csv(
DATA_DIR + "results_beggel_et_al_2019_tables_2_and_4.csv"
)
comparisons = comparisons.set_index("Dataset")
if data_set_name not in comparisons.index:
msg = "data_set_name must be in "
raise ValueError(msg, comparisons.index)
DATASET_PATH = DATA_DIR + "Univariate_arff"
datatrain = arff.loadarff(
Path(DATASET_PATH) / data_set_name / (data_set_name + "_TRAIN.arff")
)
datatest = arff.loadarff(
Path(DATASET_PATH) / data_set_name / (data_set_name + "_TEST.arff")
)
alldata = pd.concat(
[pd.DataFrame(datatrain[0]), pd.DataFrame(datatest[0])], ignore_index=True
)
alldata["target"] = pd.to_numeric(alldata["target"])
corpus_paths, outlier_paths = get_corpus_and_outlier_paths(
alldata, comparisons.loc[data_set_name].normal
)
corpus_train, corpus_test = train_test_split(
corpus_paths,
test_size=comparisons.loc[data_set_name].Ntes.astype("int"),
random_state=random_state,
)
        outliers_injection = comparisons.loc[data_set_name][
            LEVEL_TO_COLUMN[anomaly_level]
        ].astype("int")
if outliers_injection != 0:
outlier_paths, outlier_paths_to_train = train_test_split(
outlier_paths, test_size=outliers_injection, random_state=random_state
)
            # inject anomalies into the training corpus
            corpus_train = corpus_train + outlier_paths_to_train

self.corpus = corpus_train
self.test_inlier = corpus_test
self.test_outlier = outlier_paths
if self.if_sample:
self.sample()
self.corpus, self.test_inlier, self.test_outlier = map(
normalise, (self.corpus, self.test_inlier, self.test_outlier)
)
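
For reference, a minimal sketch of how the new Data class might be driven (illustrative only, not part of the commit; it assumes the pickled data files referenced above are available under data/ and that the script runs from paper-examples/):

from Data import Data

# Load the pen-digit data set with digit 1 as the "normal" class.
data = Data(if_sample=True, n_samples=(800, 10, 10), random_seed=1)
data.load_pen_digit(digit=1)

# Streams are subsampled to the sizes in n_samples and min-max normalised.
print(len(data.corpus), len(data.test_inlier), len(data.test_outlier))  # 800 10 10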