-
Notifications
You must be signed in to change notification settings - Fork 0
/
self_features.py
225 lines (213 loc) · 7.8 KB
/
self_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import init
import pandas as pd
import numpy as np
import re
import random
from dateutil.parser import parse as parse_date
# Encoder object exposed by the project's ``init`` module; deep_embedding()
# calls its ``encode`` method.
model = init.model

# Multiplier for each magnitude suffix recognised in numeric strings.
# Key order is preserved because extract_numeric() stops at the first key found.
unit_dict = {
    "万": 10_000,
    "亿": 100_000_000,
    "萬": 10_000,
    "億": 100_000_000,
    "K+": 1_000,
    "M+": 1_000_000,
    "B+": 1_000_000_000,
}
def load_table(filepath):
    """
    Read the CSV at ``filepath`` with pandas and return the resulting DataFrame.
    """
    return pd.read_csv(filepath)
def strict_numeric(data_list, verbose=False):
    """
    Check whether at least 95% of the items convert cleanly with ``float()``.

    data_list: sized collection of values to test.
    verbose: when True, print each convertible item followed by its float value.

    Returns True when the share of convertible items is at least 0.95.
    An empty ``data_list`` returns False: there is no evidence that it is numeric.
    """
    total = len(data_list)
    if total == 0:
        return False
    cnt = 0
    for x in data_list:
        try:
            y = float(x)
        except (TypeError, ValueError, OverflowError):
            # Not a number (or too large for a float); only conversion errors
            # are swallowed so that e.g. KeyboardInterrupt still propagates.
            continue
        if verbose:
            print(x)
            print(y)
        cnt += 1
    return cnt >= 0.95 * total
def mainly_numeric(data_list):
    """
    Check whether at least 90% of the items consist mostly of digits.

    Each item is converted to ``str``; commas and every suffix listed in
    ``unit_dict`` are removed, and the item counts as numeric when it contains
    at least one digit and digits make up at least half of the remaining text.

    Returns True when the share of such items is at least 0.9.
    An empty ``data_list`` returns False.
    """
    total = len(data_list)
    if total == 0:
        return False
    cnt = 0
    for data in data_list:
        text = str(data).replace(",", "")
        for unit in unit_dict:
            text = text.replace(unit, "")
        # Total number of digit characters across all digit runs.
        digit_count = sum(len(run) for run in re.findall(r'\d+', text))
        if digit_count > 0 and digit_count >= 0.5 * len(text):
            cnt += 1
    return cnt >= 0.9 * total
def extract_numeric(data_list):
    """
    Extract one float per item from a list of numbers or number-like strings.

    If every item converts with ``float()``, the converted values are used.
    Otherwise each item is converted to ``str``, commas are removed, the first
    signed integer/decimal found is taken, and it is multiplied by the value of
    the first ``unit_dict`` suffix contained in the text (1 when none is).

    Items without any numeric part (including nan/inf floats) are skipped; a
    warning is printed when that makes the result shorter than the input.

    Returns a list of floats in input order.
    """
    try:
        data_list = [float(d) for d in data_list]
    except (TypeError, ValueError, OverflowError):
        # At least one item is not a plain number; parse the original items.
        pass
    values = []
    for data in data_list:
        if isinstance(data, float):
            # Use the float directly: str() of very small or large floats uses
            # exponent notation ("1e-05"), which the regex below would read as 1.
            # Non-finite floats have no numeric part and are skipped.
            if np.isfinite(data):
                values.append(float(data))
            continue
        text = str(data).replace(",", "")
        matches = re.findall(r'([-]?([0-9]*[.])?[0-9]+)', text)
        if not matches:
            continue
        multiplier = 1
        for unit_key, unit_value in unit_dict.items():
            if unit_key in text:
                multiplier = unit_value
                break
        # The number and its multiplier are combined here, per item, so they
        # stay paired even when earlier items were skipped.
        values.append(float(matches[0][0]) * multiplier)
    if len(values) != len(data_list):
        print(f"Warning: extract_numeric() found different number of numeric part({len(values)}) and data list({len(data_list)})")
    return values
def numeric_features(data_list):
    """
    Summarise a non-empty list of numbers as a six-element feature vector.

    Returns ``np.array([mean, min, max, variance, variance / mean,
    unique_count / len(data_list)])``.

    The fifth entry is 0.0 when the mean is exactly zero, so the vector never
    contains nan/inf from a division by zero.

    Raises ValueError when ``data_list`` is empty.
    """
    count = len(data_list)
    if count == 0:
        raise ValueError("numeric_features() requires at least one value")
    mean = np.mean(data_list)
    minimum = np.min(data_list)
    maximum = np.max(data_list)
    variance = np.var(data_list)
    dispersion = variance / mean if mean != 0 else 0.0
    unique_ratio = len(set(data_list)) / count
    return np.array([mean, minimum, maximum, variance, dispersion, unique_ratio])
def is_url(data_list):
    """
    Check whether at least 90% of the items are strings that look like URLs.

    Non-string items never count as URLs but still count towards the total.
    An empty ``data_list`` returns False.

    NOTE(review): the pattern only requires "<chars>.<1-6 alphanumerics>", so
    plain decimals such as "3.14" also match; tighten it if such columns
    should not be classified as URLs.
    """
    total = len(data_list)
    if total == 0:
        return False
    url_pattern = re.compile(r'[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
    cnt = sum(
        1
        for data in data_list
        if isinstance(data, str) and url_pattern.search(data)
    )
    return cnt >= 0.9 * total
def is_date(data_list):
    """
    Check whether at least 90% of the items are strings that look like dates.

    A string counts as a date when it contains one of the characters
    "月", "日" or "年", or when ``parse_date`` parses it to a year between
    2000 and 2030 inclusive. Each item is counted at most once.

    Non-string items never count as dates but still count towards the total.
    An empty ``data_list`` returns False.
    """
    total = len(data_list)
    if total == 0:
        return False
    cnt = 0
    for data in data_list:
        if not isinstance(data, str):
            continue
        if "月" in data or "日" in data or "年" in data:
            cnt += 1
            # Already counted; do not let a successful parse count it again.
            continue
        try:
            date = parse_date(data)
        except Exception:
            # Best-effort parse of arbitrary cell text: any parser failure
            # means "not a date". KeyboardInterrupt/SystemExit still propagate.
            continue
        # Only accept dates near the present.
        if 2000 <= date.year <= 2030:
            cnt += 1
    return cnt >= 0.9 * total
def character_features(data_list):
    """
    Extract character-composition features from the given data.

    Each item is converted to ``str`` and four per-item ratios are computed
    relative to its length: whitespace (space, tab, newline), punctuation,
    special characters and digits. An empty string contributes 0.0 to all four.

    Returns an array of eight values: the mean of each ratio (whitespace,
    punctuation, special, numeric) followed by variance / mean of each ratio
    in the same order. 1e-12 is added to every ratio so the means are never
    exactly zero.
    """
    # NOTE(review): both collections contain repeated entries (e.g. "," and "/"
    # twice); possibly full-width variants were intended - confirm.
    punctuations = frozenset([",",".",";","!","?",",","。",";","!","?"])
    special_characters = frozenset(["/","/","\\","-","_","+","=","*","&","^","%","$","#","@","~","`","(",")","[","]","{","}","<",">","|","'","\""])
    whitespace_ratios = []
    punctuation_ratios = []
    special_character_ratios = []
    numeric_ratios = []
    for data in data_list:
        # Items may be numbers when the column has mixed types.
        text = str(data)
        length = len(text)
        if length == 0:
            whitespace_ratios.append(0.0)
            punctuation_ratios.append(0.0)
            special_character_ratios.append(0.0)
            numeric_ratios.append(0.0)
            continue
        whitespace_count = text.count(" ") + text.count("\t") + text.count("\n")
        whitespace_ratios.append(whitespace_count / length)
        punctuation_ratios.append(sum(1 for ch in text if ch in punctuations) / length)
        special_character_ratios.append(sum(1 for ch in text if ch in special_characters) / length)
        numeric_ratios.append(sum(1 for ch in text if ch.isdigit()) / length)
    epsilon = 1e-12
    ratio_columns = [
        np.array(ratios) + epsilon
        for ratios in (
            whitespace_ratios,
            punctuation_ratios,
            special_character_ratios,
            numeric_ratios,
        )
    ]
    means = [np.mean(column) for column in ratio_columns]
    dispersions = [np.var(column) / mean for column, mean in zip(ratio_columns, means)]
    return np.array(means + dispersions)
def deep_embedding(data_list):
    """
    Return the element-wise mean of ``model.encode`` outputs for the data.

    When the list holds 20 or more items, 20 of them are drawn with
    ``random.sample``; shorter lists are encoded in full. Every chosen item is
    converted to ``str`` before encoding.
    """
    sample_size = 20
    if len(data_list) >= sample_size:
        chosen = random.sample(data_list, sample_size)
    else:
        chosen = data_list
    vectors = np.array([model.encode(str(item)) for item in chosen])
    return np.mean(vectors, axis=0)
def extract_features(data_list):
    """
    Build the feature vector for one column (or any iterable of cell values).

    The returned 1-D array is the concatenation of, in order:
    a one-hot data-type flag over ("url", "numeric", "date", "string"),
    6 numeric features (all -1 unless the type is "numeric"),
    6 features of the string lengths of the items,
    8 character features and the deep embedding (8 values of -1 and 768 values
    of -999 when those are not computed).
    """
    # Drop NaN (the only value not equal to itself) and the "--" placeholder.
    data_list = [d for d in data_list if d == d and d != "--"]
    data_types = ("url", "numeric", "date", "string")

    # Classify the column; the first matching test wins.
    if is_url(data_list):
        data_type = "url"
    elif is_date(data_list):
        data_type = "date"
    elif strict_numeric(data_list) or mainly_numeric(data_list):
        data_type = "numeric"
    else:
        data_type = "string"

    # One-hot encoding of the detected type.
    data_type_feature = np.array(
        [1.0 if name == data_type else 0.0 for name in data_types]
    )

    # Value statistics only make sense for numeric columns.
    if data_type != "numeric":
        num_fts = np.full(6, -1)
    else:
        num_fts = numeric_features(extract_numeric(data_list))

    # Length statistics are computed for every column, whatever its type.
    lengths = [len(str(d)) for d in data_list]
    length_fts = numeric_features(lengths)

    # Text features apply to string columns and to columns that are mostly,
    # but not strictly, numeric.
    wants_text_features = data_type == "string" or (
        not strict_numeric(data_list) and mainly_numeric(data_list)
    )
    if wants_text_features:
        char_fts = character_features(data_list)
        deep_fts = deep_embedding(data_list)
    else:
        char_fts = np.full(8, -1)
        deep_fts = np.full(768, -999)

    return np.concatenate(
        (data_type_feature, num_fts, length_fts, char_fts, deep_fts)
    )
def make_self_features_from(table_df):
    """
    Extract features for every column of a table and stack them row-wise.

    table_df: a pandas DataFrame (not a file path; ``.columns`` is read and
    columns are indexed by label).

    Columns whose label contains "Unnamed:" are skipped.

    Returns a 2-D array with one row of ``extract_features`` output per kept
    column, or None when no column is kept.
    """
    rows = []
    for column in table_df.columns:
        # str() so that non-string labels (e.g. integer column names) do not
        # raise a TypeError on the substring test.
        if "Unnamed:" in str(column):
            continue
        rows.append(extract_features(table_df[column]).reshape(1, -1))
    if not rows:
        return None
    # Concatenate once at the end instead of re-copying the matrix per column.
    return np.concatenate(rows, axis=0)
if __name__ == '__main__':
    # make_self_features_from() reads ``.columns`` from its argument, so the
    # CSV has to be loaded into a DataFrame first rather than passing the path.
    table = load_table("Test Data/0archive/Table2.csv")
    features = make_self_features_from(table)
    print(features)
    print(features.shape)