-
Notifications
You must be signed in to change notification settings - Fork 0
/
log_reg_small.py
76 lines (64 loc) · 2.76 KB
/
log_reg_small.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Logistic Regression Baseline for IMDB Dataset.
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from log_reg_util_small import create_dictionary, transform_text
import util
import matplotlib
from sklearn.linear_model import LogisticRegression
def log_reg(X_train, y_train, X_valid, y_valid, X_test, y_test, min_freq, step_size=0.01, max_iter=1000000, eps=1e-5):
    """Train binary logistic regression by batch gradient descent and report test metrics.

    The validation split steers training: when validation accuracy drops the
    learning rate is damped, and training stops early once the accuracy gain
    falls below ``eps`` or the parameters stop moving.

    Args:
        X_train, y_train: training feature matrix and 0/1 labels.
        X_valid, y_valid: validation split used for step-size control / early stop.
        X_test, y_test: held-out split used for the final report.
        min_freq: minimum word frequency used to build the dictionary (reporting only).
        step_size: initial gradient-descent learning rate.
        max_iter: maximum number of gradient steps.
        eps: tolerance for both the accuracy gain and the parameter-change norm.

    Returns:
        Tuple ``(accuracy, f1)`` on the test set.
    """
    print("Started")
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    # BUGFIX: the validation matrix must be scaled with the training-set
    # statistics too; previously X_valid stayed unscaled while theta was
    # trained on scaled features, corrupting the early-stopping signal.
    X_valid = sc.transform(X_valid)
    X_test = sc.transform(X_test)
    print("Scaled train")
    theta = np.zeros(X_train.shape[1])
    valid_acc = None
    for _ in range(max_iter):
        # Validation accuracy with the current parameters (sigmoid >= 0.5).
        preds_val = (1 / (1 + np.exp(-np.dot(X_valid, theta))) >= 0.5).astype(int)
        valid_acc_new = metrics.accuracy_score(y_valid, preds_val)
        if valid_acc is not None:
            if valid_acc_new < valid_acc:
                step_size *= 0.9  # accuracy regressed: damp the learning rate
            elif valid_acc_new - valid_acc < eps:
                break  # converged: validation accuracy no longer improving
        valid_acc = valid_acc_new
        orig_theta = theta.copy()
        # Batch gradient of the mean logistic loss.
        g_thetax = 1 / (1 + np.exp(-np.dot(X_train, theta)))
        update = np.dot(X_train.T, g_thetax - y_train) / X_train.shape[0]
        theta -= step_size * update
        if np.linalg.norm(theta - orig_theta) < eps:
            break  # parameters stopped moving
    print("Fit")
    predictions = (1 / (1 + np.exp(-np.dot(X_test, theta))) >= 0.5).astype(int)
    print("Predict")
    accuracy = metrics.accuracy_score(y_test, predictions)
    f1_score = metrics.f1_score(y_test, predictions)
    print(f'Minimum word frequency = {min_freq}, accuracy = {accuracy}, f1_score = {f1_score}')
    return accuracy, f1_score
def main():
    """Build bag-of-words features at several dictionary frequency cutoffs and
    train the logistic-regression baseline on each."""
    # Example of few-shot learning with IMDB and Rotten Tomatoes
    path_train = "imdb/imdb_data_train.csv"
    path_train_few = "rotten_tomatoes/rotten_tomatoes_train.csv"
    path_valid = "imdb/imdb_data_dev.csv"
    path_test = "imdb/imdb_data_test.csv"
    #reviews_train, labels_train = util.load_sentiment_dataset(path_train) #ZERO-SHOT
    reviews_train, labels_train = util.load_sentiment_dataset_few(path_train, path_train_few)  #FEW-SHOT
    reviews_valid, labels_valid = util.load_sentiment_dataset(path_valid)
    reviews_test, labels_test = util.load_sentiment_dataset(path_test)
    for cutoff in (50, 40, 30, 20):
        # Words rarer than `cutoff` are dropped from the vocabulary.
        vocab = create_dictionary(reviews_train, cutoff)
        print('Size of dictionary: ', len(vocab))
        matrix_train = transform_text(reviews_train, vocab)
        matrix_valid = transform_text(reviews_valid, vocab)
        matrix_test = transform_text(reviews_test, vocab)
        print("Transformed")
        log_reg(matrix_train, labels_train, matrix_valid, labels_valid, matrix_test, labels_test, cutoff)
if __name__ == "__main__":  # run only when executed as a script, not on import
    main()