Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
akarich73 committed Oct 14, 2024
1 parent 3cc9a46 commit 024f9e5
Show file tree
Hide file tree
Showing 11 changed files with 724 additions and 17 deletions.
16 changes: 12 additions & 4 deletions .idea/barra2-dl.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions barra2_dl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from . import globals, helpers, downloaders
104 changes: 104 additions & 0 deletions barra2_dl/downloaders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""
This module contains the barra2 download function(s).
"""
import requests
from datetime import datetime, timedelta
from pathlib import Path
import calendar
from .helpers import list_months
from .globals import LatLonPoint, LatLonBBox, barra2_index


def download_file(url: str,
folder_path: str | Path,
file_name: str,
create_folder: bool = False) -> None:
"""Download the file from the url and saves it as folder_path/filename.
If the downloads folder does not exist, it will be created due to the create_folder=True argument.
Args:
url: The URL of the file to be downloaded.
folder_path: The path where the file should be saved.
file_name: The name to save the downloaded file as.
create_folder: If True, creates the folder if it does not exist; otherwise, exits if the folder doesn't exist.
Returns:
None
"""
folder = Path(folder_path)
file = folder / file_name

# Check if the folder exists
if not folder.exists():
if create_folder:
folder.mkdir(parents=True)
print(f"The folder '{folder_path}' was created.")
else:
print(f"The folder '{folder_path}' does not exist. Exiting...")
return

# Check if the file already exists
if file.exists():
print(f"The file '{file_name}' already exists in the folder '{folder_path}'.")
else:
# Download the URL to the file
response = requests.get(url)
file.write_bytes(response.content)
print(f"File '{file_name}' has been downloaded to '{folder_path}'.")

return


def barra2_point_downloader(base_url: str,
                            barra2_var: list,
                            lat_lon_point: LatLonPoint,
                            start_datetime: str | datetime,
                            end_datetime: str | datetime,
                            fileout_prefix: str,
                            fileout_folder: str = 'cache',
                            fileout_type: str = 'csv_file') -> None:
    """Download point data for every requested variable and month.

    One request is issued per (variable, month) pair, where the months come
    from ``list_months(start_datetime, end_datetime, freq="MS")``. Each
    request covers the first hour of the month up to its last hour.

    Args:
        base_url (str): URL template with ``{var}``, ``{year}`` and ``{month}``
            placeholders. Use from barra2-dl.globals or set explicitly.
        barra2_var (list): Variable names to download. Use from
            barra2-dl.globals or set explicitly.
        lat_lon_point (LatLonPoint): Mapping with ``'lat'`` and ``'lon'`` keys.
        start_datetime (str | datetime): Start of the download period.
        end_datetime (str | datetime): End of the download period.
        fileout_prefix (str): Prefix for downloaded files, e.g. a location reference.
        fileout_folder (str): Relative or absolute path for downloaded files;
            it is created when missing.
        fileout_type (str): Value sent as the ``accept`` query parameter,
            e.g. 'csv_file'. The saved file name always ends in ``.csv``.

    Returns:
        None. Files are written into fileout_folder as
        f'{fileout_prefix}_{var}_{time_start[:10]}_{time_end[:10]}.csv'

    Todo:
        Add set list of output format options
    """
    # each variable lives under its own url, and each file holds one month
    for var in barra2_var:
        for month_start in list_months(start_datetime, end_datetime, freq="MS"):
            year, month = month_start.year, month_start.month

            # last hourly step of the month: one hour before the next month starts
            days_in_month = calendar.monthrange(year, month)[1]
            month_end = month_start + timedelta(days=days_in_month, hours=-1)

            time_start = month_start.isoformat() + 'Z'
            time_end = month_end.isoformat() + 'Z'

            # fill the url template, then append the subset query
            query = "&".join([
                f"var={var}",
                f"latitude={lat_lon_point['lat']}",
                f"longitude={lat_lon_point['lon']}",
                f"time_start={time_start}",
                f"time_end={time_end}",
                f"accept={fileout_type}",
            ])
            url = f"{base_url.format(var=var, year=year, month=month)}?{query}"

            fileout_name = f'{fileout_prefix}_{var}_{time_start[:10]}_{time_end[:10]}.csv'
            download_file(url, fileout_folder, fileout_name, create_folder=True)




62 changes: 62 additions & 0 deletions barra2_dl/globals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""
This module contains global or default variables required to download barra2-dl data from thredds.nci.org.au.
"""
from typing import TypedDict

# -----------------------------------------------------------------------------
# CLASSES
# -----------------------------------------------------------------------------

class LatLonPoint(TypedDict):
    """A single geographic point keyed by ``lat`` and ``lon``.

    Attributes:
        lat (float): latitude of the point.
        lon (float): longitude of the point.
    """
    lat: float
    lon: float


class LatLonBBox(TypedDict):
    """A bounding box described by its four edges.

    Attributes:
        north (float): latitude of the northern edge.
        south (float): latitude of the southern edge.
        east (float): longitude of the eastern edge.
        west (float): longitude of the western edge.

    Todo:
        Add checks to make sure co-ordinates are correct with respect to each other.
    """
    north: float
    south: float
    east: float
    west: float

# -----------------------------------------------------------------------------
# VARIABLES
# -----------------------------------------------------------------------------

# Default bounding box for AUS-11 requests.
# NOTE(review): the name suggests the full AUS-11 extents, but these values
# describe a 1 degree by 1 degree box - confirm this is intended.
barra2_aus11_lat_lon_bbox = LatLonBBox(north=-23.0, west=133.0, east=134.0, south=-24)

# Base thredds url template for BARRA2 11km 1hour reanalysis data.
# The {var}, {year} and {month} placeholders are filled in with str.format;
# {month:02d} zero-pads the month to two digits.
barra2_aus11_csv_url = ("https://thredds.nci.org.au/thredds/ncss/grid/ob53/output/reanalysis/AUS-11/BOM/ERA5"
"/historical/hres/BARRA-R2/v1/1hr/{var}/latest/"
"{var}_AUS-11_ERA5_historical_hres_BOM_BARRA-R2_v1_1hr_{year}{month:02d}-{year}{month:02d}.nc")

# Column names shared by every downloaded file; used as the key when joining separate files
barra2_index = ['time', 'station', 'latitude[unit="degrees_north"]', 'longitude[unit="degrees_east"]']

# Default list of BARRA2 variables to download: eastward wind (ua*) and
# northward wind (va*) at 50m, 100m and 150m, plus air temperature at 50m (ta50m)
barra2_var_wind_all = ["ua50m", "va50m", "ua100m", "va100m", "ua150m", "va150m", "ta50m"]

# Reduced variable list (50m only) for quick tests
barra2_var_wind_50m = ["ua50m", "va50m", "ta50m"]

# Output file format sent with point requests. Todo: add list of output format options
point_output_format = "csv_file"
# grid_output_format = "netcdf3"
164 changes: 164 additions & 0 deletions barra2_dl/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
"""
This module contains helper functions.
"""

import pandas as pd
import fnmatch
from pathlib import Path
from typing import List


def list_months(start_datetime: str, end_datetime: str, freq: str = 'MS', **kwargs) -> list:
    """Generate the list of period starts between two datetimes for the url file loop.

    With the default ``freq='MS'`` the month containing ``start_datetime`` is
    always included: a mid-month start is moved back to the first day of its
    month (keeping its time of day). Without this, ``pandas.date_range`` would
    begin at the *next* month start and silently drop the first month.

    Args:
        start_datetime: str or datetime-like, left bound for generating dates.
        end_datetime: str or datetime-like, right bound for generating dates.
        freq: pandas frequency string; 'MS' (month start) by default. Any other
            value is passed through to ``pandas.date_range`` unchanged.
        **kwargs: Extra keyword arguments forwarded to ``pandas.date_range``.

    Returns:
        list: ``pandas.Timestamp`` objects, one per generated period start.
    """
    if freq == 'MS' and start_datetime is not None:
        # snap to the first of the month so the starting month is not skipped
        start_datetime = pd.Timestamp(start_datetime).replace(day=1)

    return pd.date_range(start=start_datetime, end=end_datetime, freq=freq, **kwargs).tolist()


def list_csv_files(folder_path):
    """
    Collect the names of the CSV files directly inside a folder.

    Args:
        folder_path (str): The path to the folder containing the CSV files.

    Returns:
        list: File names (not full paths) of every ``*.csv`` entry in the folder.
    """
    names = []
    for entry in Path(folder_path).glob('*.csv'):
        names.append(entry.name)
    return names


def filter_list_using_wildcard(input_list: list[str], pattern: str):
    """
    Keep only the strings that match a shell-style wildcard pattern.

    Args:
        input_list (list[str]): The list of strings to be filtered.
        pattern (str): The wildcard pattern, as understood by ``fnmatch``.

    Returns:
        list: The matching strings, in their original order.
    """
    return [candidate for candidate in input_list if fnmatch.fnmatch(candidate, pattern)]


def merge_csv_files_to_dataframe(filein_folder: str,
filename_pattern: str = '*.csv',
index_for_join: str = None) -> pd.DataFrame:
"""
Merge csv files from a folder based on optional filename wildcard using fnmatch.
If filename wildcard is omitted all csv files in the folder will be merged.
If fileout_folder is omitted the merged file will be saved in the filein_folder.
Args:
filein_folder (str): Optional
filename_pattern (str):
index_for_join (str):
Returns:
return_type: None.
Todo:
Change from using os to pathlib
"""

# todo add .csv check for filename_prefix


# list all csv files in folder
csv_files = list_csv_files(filein_folder)

# filter csv files
csv_files_filtered = filter_list_using_wildcard(csv_files, filename_pattern)

# initiate dataframe for combined csv results
df_combined = pd.DataFrame()

for file in Path(filein_folder).glob(filename_pattern):
if df_combined.empty:
# read csv file without indexing to retain time as column for join
df_combined = pd.read_csv(file)
else:
# read next file into new df
df_add = pd.read_csv(file)
# combine on index join if not None, otherwise just concat together
if index_for_join is not None:
df_combined = df_combined.join(df_add.set_index(index_for_join),on=index_for_join)
else:
df_combined = pd.concat([df_combined, df_add], ignore_index = True)

return df_combined


def export_dataframe_to_csv(dataframe: pd.DataFrame,
fileout_folder: str | Path,
fileout_name: str,
create_folder: bool = True) -> None:
"""
Export a DataFrame to a CSV file in the specified folder with the given file name.
Args:
dataframe (pd.DataFrame): The Pandas DataFrame to export.
fileout_folder (str or Path): The path to the folder where the CSV file will be saved.
fileout_name (str): The name of the CSV file to save.
create_folder (bool): If True, creates the folder if it does not exist; otherwise, exits if the folder doesn't exist.
Returns:
Path: The path of the saved CSV file.
"""
fileout_folder = Path(fileout_folder)
# Check if the folder exists
if not fileout_folder.exists():
if create_folder:
fileout_folder.mkdir(parents=True)
print(f"The folder '{fileout_folder}' was created.")
else:
print(f"The folder '{fileout_folder}' does not exist. Exiting...")
return

# Define the full path for the CSV file
fileout_path_name = fileout_folder / fileout_name

# Export the DataFrame to CSV
dataframe.to_csv(fileout_path_name, index=False)

return fileout_path_name


def get_timestamp_range_list(dataframe: pd.DataFrame, timestamp_column: str) -> List[pd.Timestamp]:
    """
    Get the first and last timestamp found in a column of the DataFrame.

    The column is converted to datetimes on a copy, so the caller's DataFrame
    is left untouched.

    Args:
        dataframe (pd.DataFrame): The DataFrame containing the timestamp column.
        timestamp_column (str): The name of the timestamp column in the DataFrame.

    Returns:
        list: ``[first_timestamp, last_timestamp]`` after sorting the column.

    Raises:
        ValueError: If ``timestamp_column`` is not a column of ``dataframe``.
        IndexError: If the DataFrame has no rows.
    """
    if timestamp_column not in dataframe.columns:
        raise ValueError(f"Column '{timestamp_column}' does not exist in the DataFrame.")

    # Convert and sort only the column itself; assigning back into `dataframe`
    # would silently change the caller's data.
    timestamps = pd.to_datetime(dataframe[timestamp_column]).sort_values()

    return [timestamps.iloc[0], timestamps.iloc[-1]]
Loading

0 comments on commit 024f9e5

Please sign in to comment.