-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathweather_loading.py
456 lines (312 loc) · 14.4 KB
/
weather_loading.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
import os
import pandas as pd
import warnings
import numpy as np
import re
class MissingDataError(Exception):
    """Raised when requested weather data (a station file or a time range) is not available."""
def rename_columns(data_ger):
    """
    Replaces the first 17 column headers of the raw dataframe with their
    English names. The mapping is purely positional, so the dataframe must
    have at least 17 columns (otherwise an IndexError is raised).
    INPUT
    -----
    data_ger : dataframe with the original (German) column headers
    OUTPUT
    ------
    data_eng : new dataframe with English column headers
    """
    english_names = (
        'Station ID',
        'Date',
        'Quality Level',
        'Air Temperature',
        'Vapor Pressure',
        'Degree of Coverage',
        'Air Pressure',
        'Rel Humidity',
        'Wind Speed',
        'Max Air Temp',
        'Min Air Temp',
        'Min Groundlvl Temp',
        'Max Wind Speed',
        'Precipitation',
        'Precipitation Ind',
        'Hrs of Sun',
        'Snow Depth',
    )
    original_names = data_ger.columns.values
    # Index by position (not zip) so that a too-short header still fails loudly.
    translation = {original_names[position]: english
                   for position, english in enumerate(english_names)}
    return data_ger.rename(columns=translation)
def clean_dataframe(df):
    """
    Cleans the raw weather data and renames the German columns to their
    English equivalent. The steps are: drop the 'eor' column, drop every row
    containing NaN, cast 'Station ID' to int, sort by 'Station ID' and
    'Date', replace -999 values by nan, turn 'Date' into a 'yyyymmdd' string
    and add 'Year', 'Month', 'Day' and 'City' columns.
    INPUT
    -----
    df : Raw dataframe
    OUTPUT
    ------
    df : Clean dataframe
    """
    if 'eor' in df:
        # Keyword form: the positional axis argument was removed from pandas.
        df = df.drop('eor', axis=1)
    df = df.dropna(axis=0)
    df = rename_columns(df)
    # Cast per row on the freshly renamed frame. Writing a scalar through
    # .iloc (as done before) overwrote every row with the first ID and could
    # leave a float column behind, which breaks the zero-padded city lookup.
    df['Station ID'] = df['Station ID'].astype(int)
    # DataFrame.sort() no longer exists; sort_values() is its replacement.
    df = df.sort_values(['Station ID', 'Date'])
    df = df.replace(to_replace=-999, value=float('nan'))
    df['Date'] = df['Date'].astype(int).astype(str)
    df['Year'] = df['Date'].str[0:4]
    df['Month'] = df['Date'].str[4:6]
    df['Day'] = df['Date'].str[6:8]
    ID_to_citynames, citynames_to_ID = get_cities()
    # The city list is keyed by 5-digit, zero-padded station ID strings.
    df['City'] = [ID_to_citynames[str(ID).zfill(5)] for ID in df['Station ID']]
    return df
def check_for_weather_data(era):
    """
    Check if there is data in the 'era' directory below the directory
    'downloaded_data' (relative to the current working directory).
    INPUT
    ------
    era: string specifying the subdirectory to check, either 'recent' or 'historical'
    OUTPUT
    ------
    no output
    RAISES
    ------
    OSError if 'downloaded_data' is missing, or if the era directory is
    missing or empty.
    """
    data_dir = 'downloaded_data'
    if not os.path.isdir(data_dir):
        # Adjacent literals instead of backslash continuations inside the
        # string, which glued words together ("downloadthe", "rightdirectory").
        raise OSError("There is no 'downloaded_data' directory.\n"
                      "You either have to download the weather data using "
                      "'download_weather_data' or move to the right directory.")
    era_dir = os.path.join(data_dir, era)
    # A missing and an empty era directory are reported the same way.
    if not os.path.isdir(era_dir) or not os.listdir(era_dir):
        raise OSError('You dont have the '+era+' data, download it first.')
def check_for_station(ID, era):
    """
    Verify that a data file for station ID exists in the directory of the
    given era (below 'downloaded_data' in the current working directory).
    INPUT
    -----
    ID : string with 5 digits specifying the station ID
    era : string specifying the era directory, either 'recent' or 'historical'
    OUTPUT
    ------
    no output; raises MissingDataError if the station file is not present
    """
    expected_file = get_txtfilename(ID, era)
    era_directory = os.path.join(os.getcwd(), 'downloaded_data', era)
    available_files = os.listdir(era_directory)
    if expected_file in available_files:
        return
    raise MissingDataError('There is no station '+ID+' in the '+era+' data.')
def get_txtfilename(ID, era):
    """ Build the name of a station's text file: '<era>_<ID>.txt' (both arguments are strings)."""
    pieces = (era, '_', ID, '.txt')
    return ''.join(pieces)
def load_station(ID,era):
    """
    Reads the text file of one station for the given era into a dataframe
    and discards its first column.
    INPUT
    -----
    ID : string with 5 digits specifying the station ID
    era : string specifying the era directory, either 'recent' or 'historical'
    OUTPUT
    ------
    df : dataframe with the station's data (first file column removed)
    """
    # Both checks raise if the data is not available locally.
    check_for_weather_data(era)
    check_for_station(ID, era)
    station_path = os.path.join('downloaded_data', era, get_txtfilename(ID, era))
    raw = pd.read_csv(station_path)
    leading_column = raw.columns[0]
    return raw.drop(leading_column, axis=1)
def get_timerange(df):
    """
    Reads the second column of the first and of the last row of a dataframe.
    INPUT
    ------
    df: a single dataframe whose second column holds the date
    OUTPUT
    ------
    tuple (time_from, time_to) with the entries of the first and last row
    """
    date_column = 1
    first_entry = df.iloc[0, date_column]
    last_entry = df.iloc[-1, date_column]
    return first_entry, last_entry
def merge_eras(df_hist, df_rec):
    """
    Stacks the historical data on top of the recent data and drops rows that
    are exact duplicates (the overlap of the two eras).
    INPUT
    ------
    df_hist: Historical data, loaded into a pandas dataframe
    df_rec: Recent data, loaded into a pandas dataframe
    OUTPUT
    ------
    df_no_overlap: One dataframe holding both eras, without duplicate rows.
        The original row indices are kept.
    """
    stacked = pd.concat([df_hist, df_rec], axis=0)
    return stacked.drop_duplicates()
def extract_times(df, time_from, time_to):
    """
    Selects the rows whose 'Date' lies between time_from and time_to
    (both bounds included). The bounds are converted to strings and compared
    with the 'Date' column as strings.
    INPUT
    -----
    df : dataframe with a 'Date' column of strings
    time_from : lower bound of the timespan
    time_to : upper bound of the timespan
    OUTPUT
    ------
    dataframe containing only the rows inside the timespan
    """
    lower_bound = str(time_from)
    upper_bound = str(time_to)
    dates = df['Date']
    inside_window = (dates <= upper_bound) & (dates >= lower_bound)
    return df[inside_window]
def get_cities(filename = os.path.join('downloaded_data','DWD_City_List.txt')):
    """
    Reads station IDs and city names from the fixed-width station list.
    INPUT
    -----
    filename: Filename of the txt-File that lists all cities and station IDs
    OUTPUT
    ------
    ID_to_citynames: Dictionary that maps station IDs (zero-padded 5 digit
        strings) to citynames
    citynames_to_ID: Dictionary that maps citynames to station IDs
    """
    ID_to_citynames = {}
    citynames_to_ID = {}
    # The file is opened as Latin-1 rather than with the utf-8 default;
    # the encoding keyword of open() requires Python 3.
    with open(filename, 'rt', encoding='Latin-1') as station_list:
        # The first two lines are skipped, they carry no station entries.
        station_list.readline()
        station_list.readline()
        for row in station_list:
            # Characters 0-10 hold the station number; rows where they are
            # not an integer are ignored.
            try:
                station_number = int(row[:11])
            except ValueError:
                continue
            station_id = str(station_number).zfill(5)
            # Characters 67-107 hold the station (city) name.
            city = row[67:108].strip()
            ID_to_citynames[station_id] = city
            citynames_to_ID[city] = station_id
    return ID_to_citynames, citynames_to_ID
def list_station_names():
    """Return a list with every station (city) name known to get_cities()."""
    _, citynames_to_ID = get_cities()
    return list(citynames_to_ID)
def check_multiple_stations(city):
    """
    Return all station names that contain 'city' as a substring
    (case-sensitive), in the order given by list_station_names().
    """
    return [station_name for station_name in list_station_names()
            if city in station_name]
def fuzzymatch(typo_station):
    """
    Returns the station-name that best matches the 'typo_station'. If there is
    no station-name that is close enough it gives 'None'.
    Each station name is cut (or space-padded) to the length of
    'typo_station'. The score is the fraction of positions with the same
    letter, ignoring case, plus a bonus of 0.2 if both strings consist of the
    same letters (anagram, e.g. swapped letters). A score of at least 0.8 is
    required for a match.
    INPUT
    ------
    typo_station: required station-name that was not found in the station-list
    OUTPUT
    ------
    station-name that was most likely meant or None if nothing really matches.
    """
    stations = list_station_names()
    letters = len(typo_station)
    # Nothing to compare: avoids a division by zero for an empty name and
    # np.max failing on an empty station list.
    if letters == 0 or not stations:
        return None
    # Lowercase the input once. The anagram test used to compare the raw
    # input with the lowercased candidate, so any input containing a capital
    # letter could never receive the anagram bonus.
    typo = typo_station.lower()
    typo_letters = sorted(typo)
    accuracy = []
    for name in stations:
        candidate = name[:letters].lower().ljust(letters)
        matching_fraction = sum(a == b for a, b in zip(typo, candidate)) / letters
        anagram_score = 0.2 if typo_letters == sorted(candidate) else 0.
        accuracy.append(matching_fraction + anagram_score)
    accuracy = np.array(accuracy)
    if np.max(accuracy) < 0.8:
        return None
    return stations[np.argmax(accuracy)]
def _resolve_station_ids(Cities_or_IDs, matching_stations):
    """Translate a list of station IDs and/or city names into a list of station IDs."""
    ID_to_citynames, citynames_to_ID = get_cities()
    not_found_message = ('You did not enter a correct ID or City. Call the '
                         'function get_cities() to see the mapping dictionaries')
    IDs = []
    for entry in Cities_or_IDs:
        # Entries made of digits only are taken as station IDs as they are.
        if entry.isdigit():
            IDs.append(entry)
            continue
        # Anything else has to be alphabetic once ? , $ . ! - are removed.
        if not re.sub(r'[?,$,.,!,-]', r'', entry).isalpha():
            raise TypeError(not_found_message)
        # The city dictionary is looked up with a capitalized first letter.
        city = entry.title()
        search_term = city
        if city not in citynames_to_ID:
            # Unknown spelling: fall back to the closest known station name.
            city = fuzzymatch(city)
            if city is None:
                raise TypeError(not_found_message)
            # For a fuzzy hit only the part before the first '-' is used
            # to search for related stations.
            search_term = city.split('-')[0]
        if matching_stations:
            IDs.extend(citynames_to_ID[name]
                       for name in check_multiple_stations(search_term))
        else:
            IDs.append(citynames_to_ID[city])
    return IDs


def _load_station_timespan(ID, time_from, time_to):
    """Load, merge and cut the recent and historical data of one station to the timespan."""
    current_dfs = {}
    # Start with an inverted range so that the first loaded era replaces it.
    timerange = ['99999999', '00000000']
    for era in ('recent', 'historical'):
        try:
            current_df = clean_dataframe(load_station(ID, era))
            tmin, tmax = get_timerange(current_df)
        except MissingDataError:
            print ('There is no '+era+' data for station '+ID)
            continue
        timerange = [min(tmin, timerange[0]), max(tmax, timerange[1])]
        current_dfs[era] = current_df
    if not current_dfs:
        raise MissingDataError('There is no data at all for station',ID)
    if len(current_dfs) > 1:
        merged_df = merge_eras(current_dfs['historical'], current_dfs['recent'])
    else:
        merged_df = next(iter(current_dfs.values()))
    first_available, last_available = timerange
    # The requested timespan lies completely before or after the data.
    if (time_from < first_available and time_to < first_available) \
            or (time_from > last_available and time_to > last_available):
        raise MissingDataError('Station',ID,': For the timerange you have chosen there is '
                               'no data available!')
    # Partial overlap is fine: clip to the available range and warn whenever
    # a bound actually moved (this also covers the boundary cases where one
    # requested bound equals the first or last available date).
    time_from_new = max(first_available, time_from)
    time_to_new = min(last_available, time_to)
    if time_from_new != time_from or time_to_new != time_to:
        warnings.warn('Station {ID}: Only the timerange from {timefrom} to {timeto} could'
                      ' be extracted!'.format(ID = ID, timefrom = time_from_new, timeto = time_to_new))
    # Copy the selection so the 'Date' assignment below does not write into
    # a slice of the merged dataframe.
    merged_df = extract_times(merged_df, time_from_new, time_to_new).copy()
    merged_df['Date'] = pd.to_datetime(merged_df['Date'])
    return merged_df.set_index('Date')


def load_dataframe(Cities_or_IDs, time_from, time_to, matching_stations = False):
    """
    Loops through the list of station IDs and loads the historical and recent
    data into dataframes.
    INPUT
    -----
    Cities_or_IDs : list of station IDs (5 digit strings) or corresponding list of cities;
                    a single string is accepted as well
    time_from : lower bound of the timespan to be returned, format 'yyyymmdd'
    time_to : upper bound of the timespan to be returned, format 'yyyymmdd'
    matching_stations : Boolean, False if you want a specific stations, True if you want a list
                        of dataframes for matching cities (e.g. Berlin gives you Berlin-Tempelhof,
                        Berlin-Alexanderplatz, etc.)
    OUTPUT
    ------
    dictionary that maps each station ID to its dataframe, indexed by 'Date'.
    Stations for which a MissingDataError occurs are reported with print()
    and left out of the dictionary.
    RAISES
    ------
    TypeError if an entry is neither a digit string nor a recognisable city name
    """
    #If a single city is entered, put it into a list
    if not isinstance(Cities_or_IDs, list):
        Cities_or_IDs = [Cities_or_IDs]
    # The bounds are compared with 'yyyymmdd' strings, so integer bounds
    # are converted instead of failing with a str/int comparison error.
    time_from = str(time_from)
    time_to = str(time_to)
    IDs = _resolve_station_ids(Cities_or_IDs, matching_stations)
    dict_of_stations = {}
    for ID in IDs:
        try:
            dict_of_stations[ID] = _load_station_timespan(ID, time_from, time_to)
        except MissingDataError:
            print ('There is no data for station '+ID+' or the ID name is missspelled')
    return dict_of_stations
'''
if __name__ == '__main__':
df = load_dataframe(['00001','00044'], '1970', '2015')
'''