# data.py
import os
from glob import glob
from typing import Sequence

import numpy as np
import pandas as pd
import wget

# Station coordinates as (latitude, longitude), matched exactly against
# the GHCN-Daily station list.
bikaner_coords = [28., 73.3]
jodhpur_coords = [26.3, 73.017]
seattle = [47.4444, -122.3139]
portland = [45.5908, -122.6003]


def download_ghcn_daily_india(years: Sequence):
    """Download GHCN-Daily by-year files and keep quality-controlled TMAX
    records for the Bikaner and Jodhpur stations."""
    path_to_directory = os.getenv("INDIA_DATA") or './data'
    stations = pd.read_table(f"{path_to_directory}/ghcnd-stations.txt", sep=r'\s+', usecols=[0, 1, 2, 3, 4, 5],
                             header=None, names=['STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'STATE', 'NAME'])
    stations = stations[((stations.LATITUDE == bikaner_coords[0]) & (stations.LONGITUDE == bikaner_coords[-1]))
                        | ((stations.LATITUDE == jodhpur_coords[0]) & (stations.LONGITUDE == jodhpur_coords[-1]))]
    for year in np.arange(*years, step=1):
        target_url = f"https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/{year}.csv.gz"
        if not os.path.isfile(f"{path_to_directory}/{year}.csv.gz"):
            try:
                print(f"Downloading {target_url} to {path_to_directory}")
                wget.download(target_url, path_to_directory)
                data = pd.read_csv(f"{path_to_directory}/{year}.csv.gz", compression='gzip', header=None,
                                   names=["STATION", "DATE", "ELEMENT", "VALUE", "M-FLAG", "Q-FLAG", "S-FLAG", "TIME"])
                data = data[data['ELEMENT'].isin(['TMAX'])].rename(columns={'VALUE': 'TMAX'}).merge(
                    stations, on='STATION').drop(columns=['ELEMENT'])
                data = data.assign(DATE=pd.to_datetime(data.DATE.astype(str)))
                # Drop records carrying any quality flag, then keep only stations with
                # at least 70% valid May-June TMAX values (out of 61 days) for this year.
                data = data[data['Q-FLAG'].isna()].assign(YEAR=data.DATE.dt.year).assign(MONTH=data.DATE.dt.month)
                count_station_year = data[(data['MONTH'] >= 5) & (data['MONTH'] <= 6)].groupby(['STATION']).TMAX.count()
                valid_stations = count_station_year[count_station_year >= 0.7 * 61].index
                data = data.set_index(['STATION', 'DATE', 'YEAR']).loc[valid_stations][
                    ['TMAX', 'TIME', 'STATE', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION']]
                if len(data):
                    data.to_csv(f"{path_to_directory}/{year}.csv")
            except Exception as err:
                print(f"---> Can't access {target_url}: {err}")
        else:
            print(f"Year {year} has already been downloaded to {path_to_directory}")


def download_ghcn_daily_portland_seattle(years: Sequence):
    """Download GHCN-Daily by-year files and keep quality-controlled TMAX
    records for the Seattle and Portland stations. Despite its name, the
    CANADA_DATA variable points at the same directory the companion loader
    get_ghcn_daily_canada_annualmax() reads from."""
    path_to_directory = os.getenv("CANADA_DATA") or './data'
    stations = pd.read_table(f"{path_to_directory}/ghcnd-stations.txt", sep=r'\s+', usecols=[0, 1, 2, 3, 4, 5],
                             header=None,
                             names=['STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'STATE', 'NAME'])
    stations = stations[((stations.LATITUDE == seattle[0]) & (stations.LONGITUDE == seattle[-1]))
                        | ((stations.LATITUDE == portland[0]) & (stations.LONGITUDE == portland[-1]))]
    for year in np.arange(*years, step=1):
        target_url = f"https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/{year}.csv.gz"
        if not os.path.isfile(f"{path_to_directory}/{year}.csv.gz"):
            try:
                print(f"Downloading {target_url} to {path_to_directory}")
                wget.download(target_url, path_to_directory)
                data = pd.read_csv(f"{path_to_directory}/{year}.csv.gz", compression='gzip', header=None,
                                   names=["STATION", "DATE", "ELEMENT", "VALUE", "M-FLAG", "Q-FLAG", "S-FLAG", "TIME"])
                data = data[data['ELEMENT'].isin(['TMAX'])].rename(columns={'VALUE': 'TMAX'}).merge(
                    stations, on='STATION').drop(columns=['ELEMENT'])
                data = data.assign(DATE=pd.to_datetime(data.DATE.astype(str)))
                # Drop records carrying any quality flag (no station completeness
                # filter here, unlike the India downloader).
                data = data[data['Q-FLAG'].isna()].assign(YEAR=data.DATE.dt.year).assign(MONTH=data.DATE.dt.month)
                data = data.set_index(['STATION', 'DATE', 'YEAR'])[
                    ['TMAX', 'TIME', 'STATE', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION']]
                if len(data):
                    data.to_csv(f"{path_to_directory}/{year}.csv")
            except Exception as err:
                print(f"---> Can't access {target_url}: {err}")
        else:
            print(f"Year {year} has already been downloaded to {path_to_directory}")


def get_ghcn_daily_india_annualmax():
    """Load the per-year station CSVs and return the annual maximum TMAX per
    station, in degrees Celsius."""
    path_to_directory = os.getenv('INDIA_DATA')
    csvs = ([pd.read_csv(f) for f in glob(f"{path_to_directory}/*.csv")]
            if path_to_directory is not None else [pd.read_csv("./data/phalodi.csv")])
    # GHCN-Daily reports TMAX in tenths of a degree Celsius; convert to degrees.
    csvs = [csv.assign(YEAR=pd.to_datetime(csv.DATE).dt.year).set_index(['STATION', 'DATE', 'YEAR']).assign(
        TMAX=lambda x: x['TMAX'] / 10) for csv in csvs]
    concatenated = pd.concat(csvs)
    station_series = concatenated.sort_values('DATE')['TMAX'].unstack('STATION')
    annual_max = station_series.groupby(level='YEAR').agg('max')
    return annual_max


def process_global_mean_surfacetemp():
    """Load the GISTEMP global annual mean surface temperature anomaly (the
    'J-D' column) and smooth it with a trailing four-year rolling mean."""
    path = os.getenv("GLOBAL_MEAN_TEMP_DATA") or "./data/GLB.Ts+dSST.csv"
    global_mean_temp = pd.read_csv(path, header=1)[['Year', 'J-D']].rename(
        columns={'J-D': 'TEMPANOMALY_GLOB', 'Year': 'YEAR'})
    global_mean_temp.YEAR = global_mean_temp.YEAR.astype(int)
    global_mean_temp.TEMPANOMALY_GLOB = global_mean_temp.TEMPANOMALY_GLOB.astype(float)
    global_mean_temp.YEAR = pd.to_datetime(global_mean_temp.YEAR.astype(str))
    # 1461 days = 4 * 365.25, i.e. a trailing four-year window over annual values.
    global_mean_temp = global_mean_temp.set_index('YEAR').rolling('1461D').mean()
    global_mean_temp.index = global_mean_temp.index.year
    return global_mean_temp
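
# Illustrative read (hypothetical values; assumes the GISTEMP CSV is in place).
# Each YEAR row holds the mean anomaly over that year and the three before it:
#
#     gmst = process_global_mean_surfacetemp()
#     gmst.loc[2020, 'TEMPANOMALY_GLOB']  # four-year mean anomaly ending in 2020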


def get_ghcn_daily_canada_annualmax():
    """Load the per-year station CSVs, compute the annual maximum TMAX per
    station (degrees Celsius), and join the smoothed global mean temperature
    anomaly on YEAR."""
    path_to_directory = os.getenv('CANADA_DATA')
    csvs = ([pd.read_csv(f) for f in glob(f"{path_to_directory}/*.csv")]
            if path_to_directory is not None else [pd.read_csv("./data/british_columbia.csv")])
    # GHCN-Daily reports TMAX in tenths of a degree Celsius; convert to degrees.
    csvs = [csv.assign(YEAR=pd.to_datetime(csv.DATE).dt.year).set_index(['STATION', 'DATE', 'YEAR']).assign(
        TMAX=lambda x: x['TMAX'] / 10) for csv in csvs]
    concatenated = pd.concat(csvs)
    station_series = concatenated.sort_values('DATE')['TMAX'].unstack('STATION')
    annual_max = station_series.groupby(level='YEAR').agg('max')
    global_mean_temp = process_global_mean_surfacetemp()
    full_dataset = annual_max.merge(global_mean_temp, left_index=True, right_index=True)
    return full_dataset
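

# Minimal end-to-end sketch (not part of the original module): the year range
# is illustrative, and the input files (ghcnd-stations.txt, GLB.Ts+dSST.csv)
# must already sit in the configured data directories.
if __name__ == "__main__":
    download_ghcn_daily_india((2000, 2023))
    download_ghcn_daily_portland_seattle((2000, 2023))
    print(get_ghcn_daily_india_annualmax().tail())
    print(get_ghcn_daily_canada_annualmax().tail())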