#!/usr/bin/env python
# coding: utf-8

# Import the packages
import pandas as pd
import glob
import os
import numpy as np
import scipy.stats as scs
from scipy.stats import pearson3, skew, norm
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

class EVENT:
    def __init__(self, file_path, datapath, state, event_type):
        self.file_path = file_path
        self.datapath = datapath
        self.state = state
        self.event_type = event_type
        # self.station_id = station_id

    def load_files(self):
        print('Loading files...')
        all_files = glob.glob(os.path.join(self.file_path, "*.csv"))
        return all_files

    def load_data(self, all_files):
        print('Loading data from files...')
        temp_files = []
        for filename in tqdm(all_files):
            data_files = pd.read_csv(filename, index_col=None, header=0)
            temp_files.append(data_files)
        all_flow = pd.concat(temp_files, axis=0, ignore_index=True)
        all_flow = all_flow.dropna()  # Remove the missing values
        all_flow = all_flow[all_flow['USGS_flow'] > 0]  # Keep positive flows only
        # Convert the Datetime column in the dataframe to pandas datetime
        all_flow['Datetime'] = pd.to_datetime(all_flow['Datetime'])
        return all_flow

    def site_id(self, file_path, all_files):
        print('Getting site ids...')
        # Find the station ID for each file
        site_id = []
        # Loop through the file paths and extract the site code
        for fpath in all_files:
            # Split the file path into directory path and filename
            dir_path, file_name = os.path.split(fpath)
            # Split the filename into the site code string and the file extension
            site_id1, ext = os.path.splitext(file_name)
            # The site code is the part of the filename before the first underscore
            site_id.append(site_id1.split("_")[0])
        return site_id

    def site_code(self, site_id):
        print('Fixing applicable site codes...')
        # site_code = site_id (station ID) without the unnecessary zeros at the
        # beginning (keep the meaningful digits only, e.g., 968674521 instead of 00968674521)
        site_code = []
        for i in range(len(site_id)):
            site = str(site_id[i])
            if site[0] == '0':
                site_code.append(site[-7:])  # Adjust the slice to match the station ID length
            else:
                site_code.append(site[-8:])  # Adjust the slice to match the station ID length
        # Convert the codes from strings to integers
        site_code = [int(code) for code in site_code]
        return site_code

    def flood_values(self, all_flow, site_code, event_type):
        print(f'Calculating {event_type} values...')
        # Select either the annual maximum (flood) or minimum (drought) streamflow from the
        # entire dataset (all_flow) for each site_code, then calculate the exceedance
        # probability and the return period
        flood_values = {}
        # Subset the DataFrame based on the values in the 'USGS_ID' column
        for i in range(len(site_code)):
            subset = all_flow.loc[all_flow['USGS_ID'] == site_code[i]].copy()  # .copy() avoids SettingWithCopyWarning
            subset['USGS_flow'] = pd.to_numeric(subset['USGS_flow'], errors='coerce')
            if event_type == 'flood':
                max_data = subset.loc[subset.groupby(subset['Datetime'].dt.year)['USGS_flow'].idxmax().values]  # Annual maxima, selected by date of occurrence
            elif event_type == 'drought':
                max_data = subset.loc[subset.groupby(subset['Datetime'].dt.year)['USGS_flow'].idxmin().values]  # Annual minima, selected by date of occurrence
            else:
                raise ValueError("Invalid event_type. Use 'flood' or 'drought'.")
            max_data = max_data[["Datetime", "USGS_flow"]]
            max_data = max_data.reset_index(drop=True)
            # Both event types are ranked in ascending order of flow
            sort_data = max_data.sort_values('USGS_flow', ascending=True)
            sorted_data = sort_data['USGS_flow']
            sorted_date = sort_data['Datetime']
            # Calculate the exceedance probability (Weibull plotting position) and return period
            Pr = []
            Tp = []
            for j in range(1, len(sorted_data) + 1):
                Pr.append((j / (len(sorted_data) + 1)) * 100)
            for n in range(len(Pr)):
                Tp.append(1 / (Pr[n] / 100))
            # Save the above results as a dictionary; the 'Yearly max' key holds the
            # annual minima when event_type is 'drought'
            dict_list = [{'Date': sorted_date}, {'Yearly max': sorted_data}, {'Exceedance probability': Pr}, {'Return periods': Tp}]
            key = f'dict_{site_code[i]}'
            flood_values[key] = dict_list
        return flood_values
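
    # A quick worked example of the plotting-position math above (illustration
    # only, not part of the pipeline): with n = 4 annual values sorted ascending,
    #   Pr = [(j / (n + 1)) * 100 for j in range(1, n + 1)]  # [20.0, 40.0, 60.0, 80.0] percent
    #   Tp = [1 / (p / 100) for p in Pr]                     # [5.0, 2.5, 1.67, 1.25] years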

    def df_flood_events(self, flood_values):
        print('Getting flood events...')
        # Flatten the large dictionary above into a dataframe and save it as a csv
        # file for each state, containing all stations (site_id)
        frames = []
        # Iterate over the keys and values of the dictionary
        for station, data in flood_values.items():
            # Get the yearly max data from the dictionary
            yearly_max = data[1]['Yearly max']
            # Create a new dataframe with the yearly max data (reversed order)
            df_new = pd.DataFrame({'Yearly max': yearly_max[::-1]})
            # Recover the numeric station ID from the dictionary key
            site = str(station)
            numeric_site = ''.join(filter(str.isdigit, site))  # Filter out non-numeric characters
            if len(numeric_site) == 7:
                new_station = numeric_site  # Adjust with the following line depending on the station ID length
            else:
                new_station = numeric_site[-8:]  # Adjust with the above line depending on the station ID length
            df_new['Station'] = new_station
            # Get the other data from the dictionary
            exceedance_prob = data[2]['Exceedance probability']
            return_periods = data[3]['Return periods']
            date = data[0]['Date']
            # Add the other data to the dataframe
            df_new['Exceedance probability'] = exceedance_prob
            df_new['Return periods'] = return_periods
            df_new['Date'] = date
            frames.append(df_new)
        # Concatenate the per-station dataframes (DataFrame.append was removed in pandas 2.0)
        df_flood_events = pd.concat(frames, ignore_index=True)
        df_flood_events = df_flood_events[['Station', 'Date', 'Yearly max', 'Exceedance probability', 'Return periods']]
        df_flood_events.to_csv(f"{self.datapath}/LULC_Streamflow_SA/ROSET-AWS/SEED-ROSET/SEED_data/df_flood_events_{self.state}.csv", index=False)
        return df_flood_events

    def station_data(self, site_code):
        print('Loading station data...')
        # Read the csv just constructed by df_flood_events (the file name carries the state suffix)
        df = pd.read_csv(f"{self.datapath}/LULC_Streamflow_SA/ROSET-AWS/SEED-ROSET/SEED_data/df_flood_events_{self.state}.csv")
        # Select the rows from the imported csv file corresponding to one station id
        station_rows = df[df['Station'] == site_code[1]]
        station_data = station_rows['Yearly max']
        return station_data
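

# A minimal usage sketch, not part of the original module: the paths, state
# name, and the expected input columns ('USGS_ID', 'USGS_flow', 'Datetime')
# are assumptions and must match your own data layout.
if __name__ == "__main__":
    event = EVENT(file_path="/path/to/csv/files",  # folder of per-station csv files (hypothetical)
                  datapath="/path/to/output",      # root folder for the output csv (hypothetical)
                  state="Alabama",                 # used in the output file name (hypothetical)
                  event_type="flood")              # 'flood' or 'drought'
    all_files = event.load_files()
    all_flow = event.load_data(all_files)
    site_ids = event.site_id(event.file_path, all_files)
    codes = event.site_code(site_ids)
    values = event.flood_values(all_flow, codes, event.event_type)
    df_events = event.df_flood_events(values)
    print(df_events.head())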