-
Notifications
You must be signed in to change notification settings - Fork 0
/
machine_learning.py
69 lines (51 loc) · 2.37 KB
/
machine_learning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import random
from typing import TypeVar, List, Tuple
X = TypeVar('X')  # generic type to represent a data point


def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]:
    """Randomly split data into fractions [prob, 1 - prob].

    The caller's list is never modified: a shallow copy is shuffled and
    then cut at index ``int(len(data) * prob)``.

    Args:
        data: The data points to split.
        prob: Fraction of the data, in [0, 1], that goes to the first list.

    Returns:
        A ``(first, second)`` pair; ``first`` holds the leading
        ``int(len(data) * prob)`` shuffled points and ``second`` the rest.

    Raises:
        ValueError: If ``prob`` is outside [0, 1].
    """
    if not 0 <= prob <= 1:
        # A negative cutoff would slice from the end of the list and a
        # cutoff past the end would silently put everything in the first
        # list, so reject such fractions instead of returning a bad split.
        raise ValueError("prob must be between 0 and 1, got {!r}".format(prob))
    shuffled = list(data)                # shallow copy, because shuffle
    random.shuffle(shuffled)             # works in place
    cut = int(len(shuffled) * prob)      # use prob to find a cutoff
    return shuffled[:cut], shuffled[cut:]
# Sanity-check split_data on the integers 0 ... 999.
data = list(range(1000))
train, test = split_data(data, 0.75)

# A 75/25 split of 1000 points yields exactly 750 and 250 points.
assert len(train) == 750
assert len(test) == 250

# Every original point lands in exactly one half; only the order changes.
assert sorted(train + test) == data
Y = TypeVar('Y')  # generic type to represent output variables


def train_test_split(xs: List[X],
                     ys: List[Y],
                     test_pct: float) -> Tuple[List[X], List[X], List[Y], List[Y]]:
    """Split paired inputs and outputs into training and test sets.

    The indices, not the values, are shuffled and split, so every x stays
    paired with the y at the same position.

    Args:
        xs: Input data points.
        ys: Output values; ``ys[i]`` belongs to ``xs[i]``.
        test_pct: Fraction of the pairs to place in the test sets.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        ValueError: If ``xs`` and ``ys`` do not have the same length.
    """
    if len(xs) != len(ys):
        # Without this check a shorter ys fails later with a bare
        # IndexError and a longer ys has its extra values silently dropped.
        raise ValueError(
            "xs and ys must have the same length, got {} and {}".format(
                len(xs), len(ys)))

    # Generate the indices and split them.
    idxs = list(range(len(xs)))
    train_idxs, test_idxs = split_data(idxs, 1 - test_pct)

    return ([xs[i] for i in train_idxs],  # x_train
            [xs[i] for i in test_idxs],   # x_test
            [ys[i] for i in train_idxs],  # y_train
            [ys[i] for i in test_idxs])   # y_test
# Sanity-check train_test_split on a toy dataset where y is always 2 * x.
xs = list(range(1000))            # xs are 0 ... 999
ys = [2 * value for value in xs]  # each y_i is twice x_i
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.25)

# Holding out 25% leaves 750 training pairs and 250 test pairs.
assert len(x_train) == len(y_train) == 750
assert len(x_test) == len(y_test) == 250

# Each x must still be matched with its own y after the shuffle.
assert all(out == 2 * inp for inp, out in zip(x_train, y_train))
assert all(out == 2 * inp for inp, out in zip(x_test, y_test))
def accuracy(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of predictions that were correct.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tn: Number of true negatives.

    Returns:
        ``(tp + tn) / (tp + fp + fn + tn)``.

    Raises:
        ZeroDivisionError: If all four counts are zero.
    """
    correct = tp + tn
    total = tp + fp + fn + tn
    return correct / total
# Worked example: of 1,000,000 predictions, 70 + 981,070 = 981,140 are correct.
assert accuracy(tp=70, fp=4930, fn=13930, tn=981070) == 0.98114
def precision(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of positive predictions that were correct.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives (unused; kept so all the metric
            functions share one signature).
        tn: Number of true negatives (unused, same reason).

    Returns:
        ``tp / (tp + fp)``.

    Raises:
        ZeroDivisionError: If there were no positive predictions
            (``tp + fp == 0``).
    """
    return tp / (tp + fp)
# Worked example: 70 of the 70 + 4,930 = 5,000 positive predictions are right.
assert precision(tp=70, fp=4930, fn=13930, tn=981070) == 0.014
def recall(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of actual positives that were identified.

    Args:
        tp: Number of true positives.
        fp: Number of false positives (unused; kept so all the metric
            functions share one signature).
        fn: Number of false negatives.
        tn: Number of true negatives (unused, same reason).

    Returns:
        ``tp / (tp + fn)``.

    Raises:
        ZeroDivisionError: If there were no actual positives
            (``tp + fn == 0``).
    """
    return tp / (tp + fn)
# Worked example: 70 of the 70 + 13,930 = 14,000 actual positives are found.
assert recall(tp=70, fp=4930, fn=13930, tn=981070) == 0.005
def f1_score(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the F1 score, the harmonic mean of precision and recall.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tn: Number of true negatives.

    Returns:
        ``2 * p * r / (p + r)`` where ``p`` is the precision and ``r`` the
        recall, or 0.0 when both are zero.

    Raises:
        ZeroDivisionError: Propagated from ``precision`` or ``recall`` when
            ``tp + fp == 0`` or ``tp + fn == 0``.
    """
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)
    if p + r == 0:
        # No true positives at all: both metrics are 0, so the harmonic
        # mean is 0 rather than an undefined 0 / 0.
        return 0.0
    return 2 * p * r / (p + r)