This repository has been archived by the owner on Jun 14, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathodourcollect_utils.py
233 lines (209 loc) · 12.1 KB
/
odourcollect_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# visit https://www.lfd.uci.edu/~gohlke/pythonlibs
# for win64 requirements for KEPLERGL (GDAL and FIONA)
import array
import pandas as pd
import os
from dotenv import load_dotenv
class OCMapConfig:
"""
Returns the contents of <conf_name>.conf in <conf_folder> subdir.
Intended for providing KeplerGL map configs previously stored in files.
"""
def __init__(self, conf_folder='mapconf'):
self.conf_folder = os.path.splitext(os.path.basename(conf_folder))[0]
if os.path.isdir(self.conf_folder):
pass
else:
raise FileNotFoundError(
'\"{}\" should be an existing subfolder, but is missing. '
'Ensure that it exists and contains at least one .conf file.'.format(self.conf_folder))
def get(self, conf_name):
# First, for security, strip off path separators and any bad intentioned stuff passed through confname
conf_name = os.path.splitext(os.path.basename(conf_name))[0]
# Next line should yield something like 'mapconf/mymap.conf'
map_conf_path = os.path.join(self.conf_folder, '%s.conf' % conf_name)
# Now... does it exist?
if os.path.isfile(map_conf_path):
with open(map_conf_path) as conf_file:
conf_contents = conf_file.read()
try:
return eval(conf_contents) # Checks if valid dict or causes SyntaxError
except SyntaxError as e:
raise SyntaxError('Contents of \"{}\" are not a valid dictionary'.format(map_conf_path)) from e
else:
raise FileNotFoundError(
'\"{}.conf\" should be an existing file in \"{}\" subfolder, but is missing. Ensure that you specified '
'the name of a valid config without extension.'.format(self.conf_folder, conf_name))
class GenericOCData:
"""
Loads CSV/XLSX file to a Pandas DataFrame. Fully agnostically.
OCObservationData and OCAnalysisData rely on it
"""
def __init__(self, file_name, file_type='auto'):
# Load CSV or XLSX
# TODO: What if we need to load data directly from a Pandas.DataFrame? This architechture precludes to do so!
self.file_name = file_name # just for information purposes
self.file_type = file_type
self.data = pd.DataFrame
if self.file_type == 'auto':
self.file_type = os.path.splitext(file_name)[1].split('.', -1)[1]
print('INFO: File type autodetection yields: {}'.format(self.file_type))
if self.file_type == 'csv':
self.data = pd.read_csv(file_name, na_filter=False, parse_dates=False, true_values='Yes',
false_values='No', dtype=object)
elif self.file_type == 'xlsx' or self.file_type == 'xls':
# REMEMBER that this only reads the FIRST sheet of excel file!
print('INFO: Remember that we only load data from 1st sheet of MS Excel Files')
self.data = pd.read_excel(file_name, na_filter=False, parse_dates=False, true_values='Yes',
false_values='No', dtype=object)
else:
raise AssertionError('file_type must be "csv" or "xlsx"')
def to_excel(self, filename, **kwargs):
print('Exporting data to file \"{}\"...'.format(filename)) # 1st version of this struct.
self.data.to_excel(filename, **kwargs)
class OCFilterProvider:
"""
INNER/NESTED class inside GenericOCData class, for storing and managing data prep routines (filters)
just right after it has been loaded. It enables the caller to perform data transformations on DataFrames just by
declaring an ordered list of stored routines (called "filters") that will be applied to the DataFrame.
Mainly used for data preparation of OCObservationData and OCAnalysisData DataFrames.
"""
def __init__(self, data_frame, filter_list):
"""
:type data_frame: pd.DataFrame
:type filter_list: list
"""
self.AVAILABLE = set([f for f in dir(self) if not f.startswith('_')])
if not set(filter_list).issubset(self.AVAILABLE):
# ALL filters specified HAVE to be available in this filter provider. Otherwise, raise Exception
raise ValueError('The list of OC data filters specified includes at least one not known.'
'Available filters:\n{}'.format(str(self.AVAILABLE)))
else:
self.filter_list = filter_list
self.__run__(data_frame)
def __run__(self, data_frame):
"""
:type data_frame: pd.DataFrame
:return: pd.DataFrame
"""
for data_filter in self.filter_list:
print('Running filters: \"{}\"...'.format(data_filter))
data_frame = getattr(self, data_filter)(data_frame)
print('Filter execution went OK')
return data_frame
@staticmethod
def fix_typos(ocdata):
# Detects the INTENTITY typo in OC's export files
# Should be filter #1 in OCObservationData
if 'Intentity' in ocdata:
print('INFO: Correcting typo in column "Intensity" until OCs developers fix it someday ;-)')
ocdata.rename(columns={'Intentity': 'Intensity'}, inplace=True) # and correct it
if 'day' in ocdata:
ocdata.rename(columns={'day': 'Day'}, inplace=True) # other minor typo
if 'time' in ocdata:
ocdata.rename(columns={'time': 'Time'}, inplace=True) # another one
return ocdata
@staticmethod
def fix_userids(ocdata):
# Export files generated by non-superadmins don't include user ids.
# This filter checks existence of column 'User'. If missing, injects a blank one to homogenize structure.
# Should be filter #2 in OCObservationData
if 'User' not in ocdata:
print('INFO: Data does NOT include user ids. Generate files as superadmin if you need them.')
ocdata['User'] = ""
return ocdata
@staticmethod
def odour_literals_to_numbers(ocdata):
# Odour intensity and annoyance conversion from literals to numbers
# Should be filter #3 in OCObservationData (relies on #1)
annoy_mapping = {"Extremely unpleasant": "-4",
"Very unpleasant": "-3",
"Unpleasant": "-2",
"Slightly unpleasant": "-1",
"Neutral": "0",
"Slightly pleasant": "1",
"Pleasant": "2",
"Very pleasant": "3",
"Extremely pleasant": "4",
"": "0"} # Empty string should not be there, but just in case
intensity_mapping = {"Extremely strong": "6",
"Very strong": "5",
"Strong": "4",
"Distinct": "3",
"Weak": "2",
"Very weak": "1",
"Not perceptible": "0",
"": "0"} # Empty string should not be there, but just in case
# EXPERIMENTAL: Translating durations from simple categoric useless data to pseudo-timedelta (in minutes)
duration_mapping = {"Continuous odor throughout the day": "1 day 00:00:00",
"Continuous odor in the last hour": "01:00:00",
"Punctual odor": "00:05:00",
"": "00:00:00"}
intensity_and_annoy_mapping = {"Intensity": intensity_mapping,
"Annoy": annoy_mapping,
"Duration": duration_mapping}
ocdata.replace(intensity_and_annoy_mapping, inplace=True)
# And now, some bastards ;)
# These are values that need to be detected/changed because of historic errors in OC.
# "Extremely pleasan" (without t) is a FIX in older format.
# "Moderate (un)pleasant" are similar cases. It's an historic mistake in literals used.
bastard_tone_mapping = {"Moderate unpleasant": "-3",
"Moderate pleasant": "3",
"Extremely pleasan": "4"}
ocdata.replace({"Annoy": bastard_tone_mapping}, inplace=True)
# Finally, we can return the data corrected and prepared
return ocdata
@staticmethod
def add_analyst_fields(ocdata):
# If missing, adds extra columns for data analyst work (respecting original citizen's data)
# The input for this filter should be OCObservationData (a regular OdourCollect observations export)
# Should be filter #4 in OCObservationData
for analystextrafield in ['Typeoverride', 'Subtypeoverride', 'Intensityoverride', 'Annoyoverride',
'Analystcomments']:
if analystextrafield in ocdata:
# It is not supposed for these columns to be already in OCObservationData data file structure.
raise AssertionError('The data provided already has extra fields for data analysts. That means that'
' it\'s not an observations file but an analysis file. Operation cancelled.')
else:
print('Upgrading data structure for analysis. Adding \"{}\" field...'.format(analystextrafield))
ocdata[analystextrafield] = ''
return ocdata
@staticmethod
def type_casting(ocdata):
# Category type casting for the different types of data that may be in the data structure.
# Should be convenient in both OCObservationData and OCAnalysisData data structures.
# Intended to be very last filter to be run.
category_types = ['Type', 'Subtype', 'Zone', 'Status', 'Origin', 'User', 'Intensityoverride',
'Annoyoverride', 'Duration'] # The xxxxoverride fields must allow null, so won't be int
datetime_types = ['Day']
timedelta_types = ['Time', 'Duration']
int_types = ['Intensity', 'Annoy']
for column in ocdata:
if column in category_types:
ocdata[column] = ocdata[column].astype('category')
if column in datetime_types:
ocdata[column] = pd.to_datetime(ocdata[column])
if column in timedelta_types:
# This requires preliminary arrangements. Check odour_literals_to_numbers() in FilterProvider
ocdata[column] = pd.to_timedelta(ocdata[column])/pd.Timedelta('60s')
ocdata[column] = ocdata[column].round()
if column in int_types:
# Yes, this is right. See https://github.com/pandas-dev/pandas/issues/25472
ocdata[column] = ocdata[column].str.replace(' ', '')
ocdata[column] = ocdata[column].astype(str)
ocdata[column] = ocdata[column].astype(int)
class OCObservationData(GenericOCData):
def __init__(self, file_name, file_type='auto'):
super().__init__(file_name, file_type)
self.filters = self.OCFilterProvider(self.data, ['fix_typos', 'fix_userids', 'odour_literals_to_numbers',
'add_analyst_fields', 'type_casting'])
class OCAnalysisData(GenericOCData):
def __init__(self, file_name, file_type='auto'):
super().__init__(file_name, file_type)
self.filters = self.OCFilterProvider(self.data, ['type_casting'])
if __name__ == '__main__':
# If called directly, consider self-test
load_dotenv()
mapbox_api_key = os.getenv('MAPBOX_API_KEY')
test_path = os.getenv('TEST_PATH')
ocmap = OCObservationData(test_path)