Update nasa_connector.py #77

Open · wants to merge 13 commits into base: master
39 changes: 39 additions & 0 deletions task_geo/dataset_builders/nasa/area_partition.py
@@ -0,0 +1,39 @@
import numpy as np


def area_partition(df_loc):
"""
Find a small number of small bboxes covering all the locations.

Parameters
----------
df_loc : pandas.DataFrame
Need to contain columns 'lat' and 'lon' with the coordinates.

Returns
-------
numpy.Array
Size is (number of boxes, 4).

"""

    # unique location points
    unique_locations = df_loc[['lat', 'lon']].drop_duplicates().dropna()

    # add columns with the bottom-left and top-right corners of the
    # 4.5-degree bbox containing each location
    unique_locations['bottom_left_lat'] = \
        np.floor(unique_locations.lat / 4.5) * 4.5
    unique_locations['bottom_left_lon'] = \
        np.floor(unique_locations.lon / 4.5) * 4.5
    unique_locations['top_right_lat'] = \
        unique_locations['bottom_left_lat'] + 4.5
    unique_locations['top_right_lon'] = \
        unique_locations['bottom_left_lon'] + 4.5

    bboxes = unique_locations[['bottom_left_lat',
                               'bottom_left_lon',
                               'top_right_lat',
                               'top_right_lon']]

    return bboxes.drop_duplicates().values
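
As a quick sanity check of the partition above, a minimal usage sketch (the sample coordinates are made up and nothing below is part of the module):

import pandas as pd

from task_geo.dataset_builders.nasa.area_partition import area_partition

# Two nearby points share one 4.5-degree box; the third point gets its own.
sample = pd.DataFrame({'lat': [40.7, 41.2, 48.9], 'lon': [-74.0, -73.5, 2.3]})
boxes = area_partition(sample)
# Expected: two rows of [min_lat, min_lon, max_lat, max_lon], roughly
# [40.5, -76.5, 45.0, -72.0] and [45.0, 0.0, 49.5, 4.5].
print(boxes)
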
104 changes: 99 additions & 5 deletions task_geo/dataset_builders/nasa/nasa_connector.py
@@ -4,6 +4,7 @@
import requests

from task_geo.dataset_builders.nasa.references import PARAMETERS
from task_geo.dataset_builders.nasa.area_partition import area_partition


def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str):
@@ -45,7 +46,88 @@ def nasa_data_loc(lat, lon, str_start_date, str_end_date, parms_str):
    return df


def nasa_connector(df_locations, start_date, end_date=None, parms=None):
def nasa_data_area(bbox, str_start_date, str_end_date, parms_list):
"""
Extract data for an area. The area is at most 10x10 degrees, the output is
at 1/2 degrees coordinates.

Parameters
----------
bbox : list
[min lat, min lon, max lat, max lon], half-degrees
max 10x10 degrees
str_start_date : string
str_end_date : string
parms_list : list

Returns
-------
df : pandas.DataFrame

"""
    base_url = "https://power.larc.nasa.gov/cgi-bin/v1/DataAccess.py"

    identifier = "identifier=Regional"
    parms_str = f"parameters={','.join(parms_list)}"
    user_community = "userCommunity=SSE"
    temporal_average = "tempAverage=DAILY"
    output_format = "outputList=JSON"
    user = "user=anonymous"

    # Assemble the Regional request; the bbox is rendered as
    # 'min_lat,min_lon,max_lat,max_lon' with no spaces.
    url = (
        f"{base_url}?request=execute&{identifier}&{parms_str}&"
        f"startDate={str_start_date}&endDate={str_end_date}&"
        f"bbox={str(bbox)[1:-1].replace('. ', '').replace(' ', '')}&"
        f"{temporal_average}&{output_format}&"
        f"{user_community}&{user}"
    )
    print(bbox)

    # The first response contains links to the generated output files;
    # the actual data lives in the linked JSON file.
    response = requests.get(url).json()
    data_json = requests.get(response['outputs']['json']).json()

    # One GeoJSON feature per grid point: build one frame per point with a
    # column per parameter plus its coordinates, then stack them.
    data = [
        pd.DataFrame({**{par: data_coord['properties']['parameter'][par]
                         for par in parms_list},
                      'lat': data_coord['geometry']['coordinates'][1],
                      'lon': data_coord['geometry']['coordinates'][0]
                      }) for data_coord in data_json['features']
    ]
    df = pd.concat(data)
    df.reset_index(inplace=True, drop=False)
    return df.rename(columns={'index': 'date'})
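
For reviewers unfamiliar with the Regional output, a minimal sketch of the reshaping done above, run on a mock payload; the mock mirrors only the fields this function reads, and the parameter name 'T2M' and the date keys are illustrative:

import pandas as pd

# Mock of data_json with a single grid point and one parameter.
data_json = {'features': [{
    'geometry': {'coordinates': [-74.25, 40.75]},
    'properties': {'parameter': {'T2M': {'20200301': 5.1, '20200302': 6.0}}},
}]}
parms_list = ['T2M']

data = [
    pd.DataFrame({**{par: feat['properties']['parameter'][par]
                     for par in parms_list},
                  'lat': feat['geometry']['coordinates'][1],
                  'lon': feat['geometry']['coordinates'][0]})
    for feat in data_json['features']
]
df = pd.concat(data).reset_index().rename(columns={'index': 'date'})
# df now has columns ['date', 'T2M', 'lat', 'lon'], one row per date.
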


def match_grid_point(locations, df_data):
"""
Match data from the grid to the single locations.

Parameters
----------
locations : pd.DataFrame
Unique locations.
df_data : pd.DataFrame
The grid data.

Returns
-------
pd.DataFrame
Output dataset.

"""
    data = []
    for row in locations.itertuples():
        # Snap the location to the nearest grid point; the half-degree grid
        # is centered on the .25 / .75 offsets.
        lat = 0.5 * round(2 * (row.lat - 0.25)) + 0.25
        lon = 0.5 * round(2 * (row.lon - 0.25)) + 0.25
        df_loc = df_data[(df_data.lat == lat) & (df_data.lon == lon)].copy()
        df_loc.lat = row.lat
        df_loc.lon = row.lon

        data.append(df_loc)
    return pd.concat(data).reset_index(drop=True)
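
A quick worked example of the snapping arithmetic above, with a made-up coordinate:

lat, lon = 40.7, -74.1

# Grid points sit at the .25 / .75 offsets; round to the nearest one.
grid_lat = 0.5 * round(2 * (lat - 0.25)) + 0.25  # 0.5 * round(80.9) + 0.25 = 40.75
grid_lon = 0.5 * round(2 * (lon - 0.25)) + 0.25  # 0.5 * round(-148.7) + 0.25 = -74.25
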


def nasa_connector(df_locations, start_date, end_date=None, parms=None,
                   precision='area'):
"""Retrieve meteorologic data from NASA.

Given a dataset with columns country, region, sub_region, lon, and lat, for
@@ -60,6 +142,9 @@ def nasa_connector(df_locations, start_date, end_date=None, parms=None):
        end_date(datetime): End date for the time series (optional)
        parms(list of strings): Desired data, accepted values are
            'temperature', 'humidity', and 'pressure' (optional)
        precision(string): Either 'area' (default) for lower precision but a
            much faster running time, or 'point' for higher precision but a
            much slower running time (optional)

    Return:
    ------
@@ -85,7 +170,16 @@ def nasa_connector(df_locations, start_date, end_date=None, parms=None):
    all_parms = list(itertools.chain.from_iterable([PARAMETERS[p] for p in parms]))
    parms_str = f"parameters={','.join(all_parms)}"

    return pd.concat([
        nasa_data_loc(row.lat, row.lon, str_start_date, str_end_date, parms_str)
        for row in locations.itertuples()
    ])
    if precision == 'point':
        return pd.concat([
            nasa_data_loc(row.lat, row.lon, str_start_date, str_end_date, parms_str)
            for row in locations.itertuples()
        ])
    else:
        df_data = pd.concat(
            [nasa_data_area(list(bbox), str_start_date,
                            str_end_date, all_parms)
             for bbox in area_partition(locations)]
        )
        df_data.reset_index(drop=True, inplace=True)
        return match_grid_point(locations, df_data)
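
Finally, a hedged usage sketch of the new precision switch; the location, dates, and column layout below follow the docstring and are made up, and running it performs live requests against the POWER API:

from datetime import datetime

import pandas as pd

from task_geo.dataset_builders.nasa.nasa_connector import nasa_connector

locations = pd.DataFrame({
    'country': ['US'], 'region': ['New York'], 'sub_region': ['New York County'],
    'lat': [40.7], 'lon': [-74.0],
})

# Default: one Regional request per 4.5-degree box, half-degree grid precision.
df_area = nasa_connector(locations, datetime(2020, 3, 1), datetime(2020, 3, 7))

# Slower: one request per exact location.
df_point = nasa_connector(locations, datetime(2020, 3, 1), datetime(2020, 3, 7),
                          precision='point')
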