This repository has been archived by the owner on May 11, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
utilities.py
71 lines (58 loc) · 2.75 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import glob
import os
import shutil
import time
import constants
import numpy as np
from typing import Any
from sklearn.model_selection import GridSearchCV
def create_feature_file(source_pattern=None, output_path=None):
    """
    Creates the compound feature file, which is a file containing every single subject's generated features
    compiled into one file.

    The header line is taken from the first non-empty subject file; the header line of every later file is
    dropped. Rows from consecutive files are always separated by exactly one newline, regardless of whether the
    individual subject files end with a trailing newline or not.

    :param source_pattern: Glob pattern (str) selecting the per-subject CSV files. Defaults to
        ``synth_data/extracted_features_seq_{constants.SEQUENCE_LENGTH}_2/*.csv``.
    :param output_path: Path (str) of the compound file to write. Defaults to ``constants.FEATURE_FILE``.
    :return: None
    """
    if source_pattern is None:
        source_pattern = f"synth_data/extracted_features_seq_{constants.SEQUENCE_LENGTH}_2/*.csv"
    if output_path is None:
        output_path = constants.FEATURE_FILE

    # glob lacks reliable ordering, so impose our own to keep the output deterministic. The output file itself is
    # excluded in case the pattern happens to match it: it is truncated on open and must never be read as input.
    output_abs = os.path.abspath(output_path)
    subject_files = sorted(
        path for path in glob.glob(source_pattern) if os.path.abspath(path) != output_abs
    )

    # NOTE: an existing output file is overwritten without asking for confirmation.
    header_written = False
    # True while the output is empty or its last byte is a newline, i.e. no separator is needed before more rows.
    ends_with_newline = True

    with open(output_path, 'wb') as outfile:
        for path in subject_files:
            with open(path, 'rb') as infile:
                # Once a header has been emitted, skip the header line of every following file
                if header_written:
                    infile.readline()

                print(f"copying subject {path}")

                # Nothing (left) to copy: empty file, or a file consisting of a header only
                if not infile.peek(1):
                    continue

                # Only insert a separator when the previous data did not already end its last row,
                # so rows are neither merged together nor separated by blank lines
                if not ends_with_newline:
                    outfile.write(b'\n')

                # Block copy rest of file from input to output without parsing
                shutil.copyfileobj(infile, outfile)
                header_written = True

                # Remember how this file ended; it is non-empty here, so seeking one byte back is safe
                infile.seek(-1, os.SEEK_END)
                ends_with_newline = infile.read(1) == b'\n'

    print(f"Finished. {output_path} created.")
def find_best_classifier(model: str, classifier: Any, param_grid: dict[str, Any], X_train: np.ndarray,
                         y_train: np.ndarray) -> Any:
    """
    Runs a cross-validated grid search over ``param_grid`` for the given classifier and returns the best estimator
    found. The elapsed search time and the winning hyperparameters are printed.

    :param model: The name of the model, only for debug messages
    :param classifier: The classifier to base the GridSearch off of
    :param param_grid: A dictionary of hyperparameters and their possible values
    :param X_train: Feature training data
    :param y_train: Training labels (targets) matching the rows of X_train
    :return: The classifier with the best set of hyperparameters found. Those hyperparameters are also printed out.
    """
    print(f"> Starting GridSearch for {model}")

    gridsearch = GridSearchCV(classifier, param_grid, cv=constants.CROSS_VALIDATION_STEPS, n_jobs=constants.N_JOBS,
                              verbose=constants.VERBOSE)

    # perf_counter is monotonic, so the measured duration cannot be skewed by system clock adjustments
    start_time = time.perf_counter()
    gridsearch.fit(X_train, y_train)
    elapsed = round(time.perf_counter() - start_time, constants.NUM_ROUNDING)
    print(f"> {model} GridSearch time: {elapsed}s")

    best_params = gridsearch.best_params_
    print(f"> Best parameters for {model} were: {best_params}")

    return gridsearch.best_estimator_
# When executed as a script (rather than imported), build the compound feature file.
if __name__ == "__main__":
    create_feature_file()