-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter.py
144 lines (115 loc) · 4.41 KB
/
filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""This module is responsible for filtering the dataset in data/temperatures.tab.
Details can be found in the final report.
By Freeman Cheng and Idris Tarwala.
"""
import math
from dataclasses import dataclass
from typing import List, Tuple, Dict
@dataclass
class Dataset:
    """The rows of the temperature dataset that fall in one calendar month.

    Representation Invariants:
        - 1 <= self.month <= 12
        - len(self.data) > 0
    """
    # Calendar month number; filter_data numbers the months 1 through 12.
    month: int
    # Rows as read from the dataset: (first column as text, second column as float).
    data: List[Tuple[str, float]]
def read_dataset(path: str) -> List[Tuple[str, float]]:
    """Read a tab-delimited dataset file and return its rows.

    Every non-blank line yields one tuple: the first tab-separated field is
    kept as a string and the second field is converted to a float. Any
    further fields on the line are ignored. Blank lines (for example a
    trailing empty line at the end of the file) are skipped.

    Raises IndexError if a non-blank line has fewer than two fields, and
    ValueError if the second field cannot be converted to a float.

    Preconditions:
        - len(path) > 0
    """
    data = []
    # The file is only read, so open it read-only; 'r+' would additionally
    # require write permission and fail on read-only files.
    with open(path, 'r') as fp:
        for line in fp:
            if not line.strip():
                continue
            fields = line.split('\t')
            # float() tolerates the trailing newline left on the last field.
            data.append((fields[0], float(fields[1])))
    return data
def central_tendency(values: List[float]) -> Dict[str, float]:
    """Return the mean and the median of the given values.

    The result is a dictionary with the keys 'mean' and 'median'. When the
    number of values is even, the median is the average of the two middle
    values of the sorted list.

    Preconditions:
        - len(values) > 0

    >>> central_tendency([1.0, 6.0, 2.0]) == {'mean': 3.0, 'median': 2.0}
    True
    >>> central_tendency([17.0, 25.0, 4.0, 7.0]) == {'mean': 13.25, 'median': 12.0}
    True
    """
    ordered = list(values)
    ordered.sort()
    count = len(ordered)

    # The mean is taken over the sorted copy and computed before the median.
    average = sum(ordered) / count

    half, remainder = divmod(count, 2)
    if remainder:
        middle = ordered[half]
    else:
        middle = (ordered[half - 1] + ordered[half]) / 2

    return {'mean': average, 'median': middle}
def filter_data(dataset: List[Tuple[str, float]]) -> Dict[str, Dataset]:
    """Select the hottest, median, and coldest calendar months of a dataset.

    Rows are assigned to months purely by position: row i belongs to month
    i % 12 + 1, which is why the first row has to be a January. The twelve
    months are then compared by the mean of their temperatures.

    Returns a dictionary with the keys 'hottest', 'coldest' and 'median',
    each mapping to the Dataset of the selected month. Since there are twelve
    monthly means, their median is the average of the two middle means and is
    generally not one of the means itself, so 'median' is the month whose mean
    lies closest to that value. Ties are always resolved in favour of the
    earliest month.

    Every month needs at least one row (so at least 12 rows overall);
    otherwise central_tendency is called with an empty list.

    Preconditions:
        - len(dataset) > 0
        - int(dataset[0][0].split('-')[1]) == 1
    """
    avg_temperatures = []
    filtered_dataset = []
    for month in range(1, 12 + 1):
        # Stepping by 12 from the month's offset visits exactly the rows
        # with i % 12 == month - 1, without rescanning the whole dataset.
        rows = [dataset[i] for i in range(month - 1, len(dataset), 12)]
        filtered_dataset.append(Dataset(month, rows))
        temps = [row[1] for row in rows]
        avg_temperatures.append(central_tendency(temps)['mean'])

    # list.index returns the first match, i.e. the earliest month on ties.
    hottest_index = avg_temperatures.index(max(avg_temperatures))
    coldest_index = avg_temperatures.index(min(avg_temperatures))

    # With an even number of months the median of the means is not
    # necessarily one of them, so take the month whose mean is closest.
    med_t = central_tendency(avg_temperatures)['median']
    median_index = 0
    smallest_gap = math.inf
    for i, avg in enumerate(avg_temperatures):
        gap = abs(avg - med_t)
        if gap < smallest_gap:
            smallest_gap = gap
            median_index = i

    return {'hottest': filtered_dataset[hottest_index],
            'coldest': filtered_dataset[coldest_index],
            'median': filtered_dataset[median_index]}
# Runs at import time (it sits outside the __main__ guard), so importing this
# module reads and filters the dataset immediately. The relative path is
# resolved against the current working directory.
# NOTE(review): the module docstring refers to data/temperatures.tab while this
# opens 'temperatures.tab' - confirm which location is intended.
filtered_data = filter_data(read_dataset('temperatures.tab'))
if __name__ == '__main__':
    # When executed as a script, run the doctests in this module verbosely.
    import doctest
    # python_ta is only referenced by the commented-out block below.
    import python_ta
    import python_ta.contracts
    doctest.testmod(verbose=True)
    # Uncomment the block below to see the python_ta report.
    # python_ta.contracts.DEBUG_CONTRACTS = False
    # python_ta.contracts.check_all_contracts()
    # python_ta.check_all(config={
    #     'extra-imports': [
    #         'math',
    #         'dataclasses',
    #         'python_ta.contracts'
    #     ],  # the names (strs) of imported modules
    #     'allowed-io': [
    #         'read_dataset'
    #     ],  # the names (strs) of functions that call print/open/input
    #     'max-line-length': 150,
    #     'disable': ['R1705', 'C0200', 'E9997', 'E9988', 'E9969']
    # })