-
Notifications
You must be signed in to change notification settings - Fork 1
/
csv_readers.py
342 lines (287 loc) · 14 KB
/
csv_readers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
"""Contains classes that read CSV files output by pressure sensors."""
from datetime import datetime
from netCDF_Utils import edit_netcdf
import unit_conversion as uc
import numpy as np
import pandas as pd
import pytz
import re
def find_first(fname, expr):
    '''Return the 1-based line number of the first line of `fname` matching
    the regex `expr`, or None if no line matches.

    The 1-based result is handed straight to pandas' ``skiprows``, which then
    skips everything up to and including the matching (header) line.
    '''
    with open(fname, 'r') as text:
        for lineno, line in enumerate(text, start=1):
            if re.search(expr, line):
                return lineno
    # Explicit None: callers (e.g. Hobo.read) test for "marker not found".
    return None
class Hobo(edit_netcdf.NetCDFWriter):
    '''derived class for hobo csv files '''

    def __init__(self):
        self.timezone_marker = "time zone"
        super(Hobo, self).__init__()
        # Two timestamp layouts seen in HOBO exports; read() tries them in order.
        self.date_format_string = '%m/%d/%y %I:%M:%S %p'
        self.date_format_string2 = '%m/%d/%Y %H:%M'

    def read(self):
        '''load the data from in_filename

        Only the first two timestamps are parsed with strptime; the rest of
        the time axis is generated from the inferred sampling frequency,
        which is much faster than parsing every row.
        '''
        self.get_serial()
        # Data rows start after the column-header line, which contains '"#"'
        # in newer exports and a bare '#' in older ones.
        skip_index = find_first(self.in_filename, '"#"')
        if skip_index is None:
            skip_index = find_first(self.in_filename, '#')
        df = pd.read_table(self.in_filename, skiprows=skip_index, header=None,
                           engine='c', sep=',', usecols=(1, 2))
        df = df.dropna()
        # Try the 12-hour format first, then fall back to the 24-hour one
        # (narrowed from the original bare except).
        try:
            first_stamp = uc.datestring_to_ms(df.values[0][0], self.date_format_string,
                                              self.tz_info, self.daylight_savings)
            second_stamp = uc.datestring_to_ms(df.values[1][0], self.date_format_string,
                                               self.tz_info, self.daylight_savings)
        except (ValueError, TypeError):
            first_stamp = uc.datestring_to_ms(df.values[0][0], self.date_format_string2,
                                              self.tz_info, self.daylight_savings)
            second_stamp = uc.datestring_to_ms(df.values[1][0], self.date_format_string2,
                                               self.tz_info, self.daylight_savings)
        # Sampling frequency in Hz from the millisecond spacing of rows 0 and 1.
        self.frequency = 1000 / (second_stamp - first_stamp)
        try:
            start_ms = uc.datestring_to_ms(df[1][0], self.date_format_string,
                                           self.tz_info, self.daylight_savings)
        except (ValueError, TypeError, KeyError):
            start_ms = uc.datestring_to_ms(df[1][0], self.date_format_string2,
                                           self.tz_info, self.daylight_savings)
        self.utc_millisecond_data = uc.generate_ms(start_ms, df.shape[0], self.frequency)
        self.pressure_data = df[2].values * uc.PSI_TO_DBAR

    def get_serial(self):
        '''Scan the file for the first six-digit run and store it as the
        instrument serial (default "not found").'''
        self.instrument_serial = "not found"
        with open(self.in_filename, 'r') as text:
            for line in text:
                # Single search per line (the original ran the regex twice).
                match = re.search('[0-9]{6}', line)
                if match:
                    self.instrument_serial = match.group(0)
                    break
class House(edit_netcdf.NetCDFWriter):
    '''Processes files coming out of the USGS-made sensors'''

    def __init__(self):
        self.timezone_marker = "time zone"
        self.temperature_data = None
        super(House, self).__init__()
        self.frequency = 4  # fixed 4 Hz sample rate for this instrument
        # Trailing space is deliberate: the timestamp line ends with one.
        self.date_format_string = '%Y.%m.%d %H:%M:%S '

    def read(self):
        '''Load the data from in_filename'''
        # Data rows look like "dddd,dddd"; skip everything before the first one.
        skip_index = find_first(self.in_filename, '^[0-9]{4},[0-9]{4}$') - 1
        df = pd.read_table(self.in_filename, skiprows=skip_index, header=None,
                           engine='c', sep=',', names=('a', 'b'))
        # Keep only complete rows where both raw-voltage columns are present.
        valid = df[df.b.notnull()]
        self.pressure_data = np.array([
            uc.USGS_PROTOTYPE_V_TO_DBAR(np.float64(x))
            for x in valid.a])
        # kept as a plain list to preserve the original downstream type
        self.temperature_data = [
            uc.USGS_PROTOTYPE_V_TO_C(np.float64(x))
            for x in valid.b]
        with open(self.in_filename, 'r') as wavelog:
            for x in wavelog:
                # Dots escaped: the original pattern's bare '.' matched any
                # character, but the header date is literally 'YYYY.MM.DD'.
                if re.match(r'^[0-9]{4}\.[0-9]{2}\.[0-9]{2}', x):
                    start_ms = uc.datestring_to_ms(x, self.date_format_string)
                    self.utc_millisecond_data = uc.generate_ms(start_ms,
                                                               len(self.pressure_data),
                                                               self.frequency)
                    break
class Leveltroll(edit_netcdf.NetCDFWriter):
    '''derived class for leveltroll ascii files
    '''

    def __init__(self):
        # record layout of the raw (seconds, pressure) pairs
        self.numpy_dtype = np.dtype([("seconds", np.float32),
                                     ("pressure", np.float32)])
        self.record_start_marker = "date and time,seconds"
        self.timezone_marker = "time zone"
        super(Leveltroll, self).__init__()
        self.date_format_string = "%m/%d/%Y %I:%M:%S %p"
        self.temperature_data = None

    def read(self):
        '''load the data from in_filename

        Only two timestamps are parsed with strptime; the rest of the time
        axis is generated from the inferred frequency (much faster).
        '''
        self.get_serial()
        skip_index = find_first(self.in_filename, 'Date and Time,Seconds')
        data = pd.read_table(self.in_filename, skiprows=skip_index, header=None,
                             engine='c', sep=',', usecols=(0, 1, 2, 3))
        # NOTE(review): rows 1 and 2 (not 0 and 1) are parsed here, and the
        # generated axis starts at row 1's stamp while spanning len(data)
        # points. Looks like an off-by-one unless these exports carry an
        # extra sub-header row -- confirm against a sample file.
        self.data_start = uc.datestring_to_ms(data[0][1], self.date_format_string,
                                              self.tz_info, self.daylight_savings)
        self.data_start2 = uc.datestring_to_ms(data[0][2], self.date_format_string,
                                               self.tz_info, self.daylight_savings)
        # Frequency in Hz from the millisecond spacing of consecutive rows.
        self.frequency = 1 / ((self.data_start2 - self.data_start) / 1000)
        self.utc_millisecond_data = uc.generate_ms(self.data_start, len(data[0]),
                                                   self.frequency)
        self.pressure_data = data[3].values * uc.PSI_TO_DBAR

    def get_serial(self):
        '''Pull the six-digit serial off the "Serial Number" line, leaving
        the default "not found" if it is absent.'''
        self.instrument_serial = "not found"
        with open(self.in_filename, 'r') as text:
            for line in text:
                if re.search('Serial Number', line):
                    match = re.search('[0-9]{6}', line)
                    # Guard: the original raised AttributeError when the line
                    # contained no 6-digit run.
                    if match:
                        self.instrument_serial = match.group(0)
                    break
class MeasureSysLogger(edit_netcdf.NetCDFWriter):
    '''derived class for Measurement Systems cvs files
    '''

    def __init__(self):
        self.timezone_marker = "time zone"
        super(MeasureSysLogger, self).__init__()
        self.frequency = 4
        # 12-hour and 24-hour timestamp layouts; read() tries them in order.
        self.date_format_string = '%m/%d/%Y %I:%M:%S.%f %p'
        self.date_format_string2 = '%m/%d/%Y %H:%M:%S.%f'

    def read(self):
        '''load the data from in_filename

        only parse the initial datetime = much faster
        '''
        self.get_serial()
        skip_index = find_first(self.in_filename, '^ID') - 1
        # +1 skips the column-header row itself (there may also be
        # calibration header data above it).
        df = pd.read_table(self.in_filename, skiprows=skip_index + 1, header=None,
                           engine='c', sep=',', usecols=[3, 4, 5])

        def stamp(row, fmt):
            # [1:] drops the leading space the logger writes before each date.
            return uc.datestring_to_ms(df[3][row][1:], fmt, self.tz_info,
                                       self.daylight_savings)

        # The original duplicated this whole branch in a bare try/except;
        # the two copies differed only in the format string.
        try:
            fmt = self.date_format_string
            self.data_start = stamp(3, fmt)
            second_stamp = stamp(4, fmt)
            start_ms = stamp(0, fmt)
        except (ValueError, TypeError):
            fmt = self.date_format_string2
            self.data_start = stamp(3, fmt)
            second_stamp = stamp(4, fmt)
            start_ms = stamp(0, fmt)
        # Frequency in Hz from the spacing of two consecutive rows.
        self.frequency = 1000 / (second_stamp - self.data_start)
        self.pressure_data = df[5].values * uc.PSI_TO_DBAR
        self.utc_millisecond_data = uc.generate_ms(start_ms, df.shape[0],
                                                   self.frequency)

    def get_serial(self):
        '''Pull the seven-digit serial off the "Transducer Serial" line,
        leaving the default "not found" if it is absent.'''
        self.instrument_serial = "not found"
        with open(self.in_filename, 'r') as text:
            for line in text:
                if 'Transducer Serial' in line:
                    match = re.search("[0-9]{7}", line)
                    # Guard: the original raised AttributeError when the line
                    # contained no 7-digit run.
                    if match:
                        self.instrument_serial = match.group(0)
                    break
class RBRSolo(edit_netcdf.NetCDFWriter):
    '''derived class for RBR solo engineer text files, (exported via ruskin software)
    '''

    def __init__(self):
        self.timezone_marker = "time zone"
        super(RBRSolo, self).__init__()
        self.frequency = 4  # assumed fixed 4 Hz -- not read from the file
        self.date_format_string = '%d-%b-%Y %H:%M:%S.%f'

    def read(self):
        '''load the data from in_filename

        only parse the initial datetime = much faster
        '''
        # First data line starts with a 'DD-Mon-YYYY' date.
        skip_index = find_first(self.in_filename,
                                '^[0-9]{2}-[A-Z]{1}[a-z]{2,8}-[0-9]{4}')
        df = pd.read_csv(self.in_filename, skiprows=skip_index,
                         delim_whitespace=True, header=None, engine='c',
                         usecols=[0, 1, 2])
        # Column 0 holds the date, column 1 the time-of-day.
        # NOTE(review): unlike the other readers, no tz_info/daylight_savings
        # arguments are passed here -- confirm this is intentional.
        self.datestart = uc.datestring_to_ms('%s %s' % (df[0][0], df[1][0]),
                                             self.date_format_string)
        self.utc_millisecond_data = uc.generate_ms(self.datestart,
                                                   df.shape[0] - 1,
                                                   self.frequency)
        # Direct slice instead of the original element-by-element list
        # comprehension copy. The last row is excluded, matching the
        # original's [:-1] -- presumably a trailing footer line; verify.
        self.pressure_data = df[2].values[:-1]
class Waveguage(edit_netcdf.NetCDFWriter):
    """Reads in an ASCII file output by a Waveguage pressure sensor
    from Ocean Sensor Systems Inc.

    This class reads in data from a plaintext output file into a
    pandas Dataframe. This is then translated into numpy ndarrays
    and written to a netCDF binary file."""

    def __init__(self):
        super(Waveguage, self).__init__()

    def read(self):
        """Sets start_time to a datetime object, utc_millisecond_data
        to a numpy array of dtype=int64 and pressure_data to a numpy
        array of dtype float64."""
        data = self.get_data()
        chunks = self.get_pressure_chunks(data)
        timestamps = self.get_times(data)
        self.data_start_date = datetime.strftime(timestamps[0],
                                                 "%Y-%m-%dT%H:%M:%SZ")
        self.data_duration_time = timestamps[-1] - timestamps[0]
        # BUG FIX: the original stored the raw two-character header slice
        # (a *string*), which broke the arithmetic in get_ms_data() and
        # make_pressure_array(); _get_frequency() returns the int Hz value.
        self.frequency = self._get_frequency()
        self.utc_millisecond_data = self.get_ms_data(timestamps, chunks)
        raw_pressure = self.make_pressure_array(timestamps, chunks)
        # NOTE(review): scaling by 10 then adding the ATM_TO_DBAR constant
        # assumes gauge readings in bar -- confirm against the sensor manual.
        self.pressure_data = raw_pressure * 10.0 + uc.ATM_TO_DBAR
        return self.pressure_data, self.utc_millisecond_data

    def make_pressure_array(self, t, chunks):
        """Concatenate the pressure chunks into one array, padding the gap
        between timestamped bursts with self.fill_value."""
        def press_entries(t2, t1):
            # expected number of samples between two stamps at self.frequency
            seconds = (t2 - t1).total_seconds()
            return seconds * self.frequency
        final = np.zeros(0, dtype=np.float64)
        prev_stamp = None
        prev_press = None
        for stamp, press in zip(t, chunks):
            if prev_stamp:
                # gap = expected sample count minus samples actually recorded;
                # int() fix: np.zeros rejects the original float length.
                n = int(press_entries(stamp, prev_stamp)) - len(prev_press)
                narr = np.zeros(n, dtype=np.float64) + self.fill_value
                final = np.hstack((final, prev_press, narr))
            prev_stamp = stamp
            prev_press = press
        final = np.hstack((final, chunks[-1]))
        return final

    def get_pressure_chunks(self, data):
        """Group consecutive signed 7-character readings (e.g. '+12.345')
        into lists of float64, splitting on any non-reading entry."""
        master = [[]]
        i = 0
        for e in data:
            if e.startswith('+') or e.startswith('-'):
                if len(e) == 7:
                    master[i].append(np.float64(e))
            else:
                # non-reading entry terminates the current chunk
                if master[i] != []:
                    master.append([])
                    i += 1
        master.pop()
        return master

    def get_ms_data(self, timestamps, chunks):
        """Generates the time data using the initial timestamp in the
        file and the length of the pressure data array."""
        first_stamp = timestamps[0]
        last_stamp = timestamps[-1]

        def del_t_ms(t2, t1):
            return (t2 - t1).total_seconds() * 1000

        total_stamp_ms = del_t_ms(last_stamp, first_stamp)
        last_chunk = chunks[-1]
        # samples recorded after the final stamp extend the span
        last_chunk_ms = 1000 * len(last_chunk) / self.frequency
        total_ms = total_stamp_ms + last_chunk_ms
        first_date = timestamps[0]
        epoch_start = datetime(year=1970, month=1, day=1, tzinfo=pytz.utc)
        # int() fix: in-place += of a float onto an int64 array is rejected
        # by numpy's same-kind casting rules.
        offset = int((first_date - epoch_start).total_seconds() * 1e3)
        utc_ms_data = np.arange(total_ms, step=(1000 / self.frequency),
                                dtype='int64')
        utc_ms_data += offset
        return utc_ms_data

    def _get_frequency(self):
        """Parse the sampling frequency (Hz) from columns 25:27 of the
        file's first line."""
        with open(self.in_filename) as f:
            line = f.readline()
            freq = int(line[25:27])
        return freq

    def get_times(self, p):
        """Returns the time that the device started reading as a
        datetime object."""
        def make_stamps(p):
            # every 6 consecutive non-reading tokens form one timestamp
            added = ''
            result = []
            for i, s in enumerate(p):
                added += s
                if i % 6 == 5:
                    result.append(added)
                    added = ''
            return result

        def is_stamp_part(x):
            # anything that is not a signed pressure reading
            return not (x.startswith('+') or x.startswith('-'))

        c = p.map(is_stamp_part)
        p = p[c]
        # drop the file preamble and trailing token
        p = p[14:-1]
        stamps = make_stamps(p)
        date_format = 'Y%yM%mD%dH%HM%MS%S'
        # NOTE(review): self.tzinfo -- other readers use self.tz_info;
        # confirm the base class really defines both attributes.
        stamps = [datetime.strptime(stamp, date_format).replace(tzinfo=self.tzinfo)
                  for stamp in stamps]
        return stamps

    def get_data(self):
        """Reads the pressure data from the current file and returns
        it in a numpy array of dtype float64."""
        # lineterminator=',' makes every comma-separated token its own "row"
        data = pd.read_csv(self.in_filename, skiprows=0, header=None,
                           lineterminator=',', sep=',', engine='c',
                           names='p')
        data.p = data.p.apply(lambda x: x.strip())
        return data.p