-
Notifications
You must be signed in to change notification settings - Fork 3
/
SMOTE.py
74 lines (62 loc) · 3.59 KB
/
SMOTE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Written with Python ver. 3.6.8; NumPy ver. 1.19.1; Pandas ver. 1.1.1
import math as m
import random as rd
import numpy as np
# Details of argument:
# Format of arg : dataset : [[attr1, attr2, attr3, ... , class], [attr1, attr2, attr3, ... , class], .....]
# All rows of dataset MUST belong to the same class, although users can modify the code to act as a general synthesizer
# size is a percentage given as a plain number, e.g. 50, 60, 100, 200, 500 (no "%" sign)
# size = 100 means each data sample is synthesized once, so the number of synthetic samples equals len(dataset)
# size = n*100 means each data sample is synthesized n times, for n >= 1 (non-multiples of 100 are rounded down)
# size < 100 means only a randomly chosen portion of the dataset is used, each selected sample being synthesized once
# default size : 100%
# k is the number of nearest neighbors for each data sample, default value, k = 5
# NOTE: No manual segregation of class label and attributes needed, algorithm pre-processes it automatically
def augment(dataset, size=100, k=5):
    """Oversample a single-class dataset with SMOTE.

    Each row of ``dataset`` is ``[attr1, attr2, ..., class]``: numeric
    attributes followed by the class label as the LAST element.  The label
    of the first row is attached to every synthetic row, so all rows are
    expected to belong to the same class.

    Args:
        dataset: List of rows as described above.  It is not modified.
        size: Amount of synthesis as a percentage.  ``size >= 100`` creates
            ``int(size / 100)`` synthetic rows per input row.  ``size < 100``
            randomly selects ``int(size / 100 * len(dataset))`` rows and
            creates one synthetic row for each of them; neighbours are then
            searched among the selected rows only.
        k: Number of nearest neighbours considered per row.  It is capped at
            the number of neighbours actually available for that row.

    Returns:
        A new list holding the original rows followed by the synthetic rows.
        Rows that have no neighbour (a single-row dataset, or rows whose
        attributes equal every other row's) produce no synthetic rows.

    Raises:
        ValueError: If ``size`` is negative or ``k`` is smaller than 1.
    """
    if size < 0:
        raise ValueError("size must be a non-negative percentage")
    if k < 1:
        raise ValueError("k must be at least 1")
    if len(dataset) == 0:
        return list(dataset)

    # Pre-processing: split the attributes from the class label.  The label
    # is the last element of a row, whatever the number of attributes.
    c_label = dataset[0][-1]
    data = [row[:-1] for row in dataset]

    if size < 100:
        # Synthesize from a random portion of the data only.  Popping the
        # chosen rows guarantees that no row is selected twice.
        n_selected = int((size / 100) * len(data))
        chosen = []
        for _ in range(n_selected):
            chosen.append(data.pop(rd.randrange(0, len(data))))
        data = chosen
        mul = 1  # one round of synthesis for the selected rows
    else:
        mul = int(size / 100)  # integer multiple of the percentage chosen

    synthetic = []
    for sample in data:
        # Euclidean distance from "sample" to every other row.  Rows with
        # identical attributes (including the sample itself) are skipped.
        neighbours = []
        for ind, other in enumerate(data):
            if other == sample:
                continue
            dist = m.sqrt(float(np.sum(np.square(np.subtract(sample, other)))))
            neighbours.append((dist, ind))

        # Stable sort: equally distant rows keep their dataset order.  The
        # slice caps k at the number of neighbours that really exist.
        neighbours.sort(key=lambda pair: pair[0])
        nearest = [ind for _, ind in neighbours[:k]]
        if not nearest:
            continue  # nothing to interpolate towards

        for _ in range(mul):
            # Pick one of the nearest neighbours at random and place the new
            # point somewhere on the segment between it and the sample.
            pick = nearest[rd.randrange(0, len(nearest))]
            diff = np.subtract(data[pick], sample)
            var = rd.uniform(0, 1)
            synthetic.append(list(np.add(sample, var * diff)))

    # Re-attach the class label and append the synthetic rows to the input.
    sdata = [arr + [c_label] for arr in synthetic]
    return dataset + sdata