Initial commit

akarich73 · Oct 14, 2024 · 024f9e5 · 024f9e5
1 parent 3cc9a46
commit 024f9e5
Show file tree

Hide file tree

Showing 11 changed files with 724 additions and 17 deletions.
diff --git a/.idea/barra2-dl.iml b/.idea/barra2-dl.iml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/barra2_dl/__init__.py b/barra2_dl/__init__.py
@@ -0,0 +1 @@
+from . import globals, helpers, downloaders
diff --git a/barra2_dl/downloaders.py b/barra2_dl/downloaders.py
@@ -0,0 +1,104 @@
+"""
+This module contains the barra2 download function(s).
+"""
+import requests
+from datetime import datetime, timedelta
+from pathlib import Path
+import calendar
+from .helpers import list_months
+from .globals import LatLonPoint, LatLonBBox, barra2_index
+
+
+def download_file(url: str,
+                  folder_path: str | Path,
+                  file_name: str,
+                  create_folder: bool = False) -> None:
+    """Download the file from the url and save it as folder_path/file_name.
+
+    An existing file is never overwritten, so repeated runs only fetch what is
+    missing. The response body is written only after the server has answered
+    with a success status; otherwise an HTTP error page would be stored under
+    the data file name and then be skipped as "already exists" on the next run.
+
+    Args:
+        url: The URL of the file to be downloaded.
+        folder_path: The path where the file should be saved.
+        file_name: The name to save the downloaded file as.
+        create_folder: If True, creates the folder (including missing parents) if it
+            does not exist; otherwise prints a message and returns without downloading.
+
+    Returns:
+        None
+
+    Raises:
+        requests.HTTPError: If the server responds with a 4xx/5xx status.
+        requests.RequestException: If the connection fails or times out.
+    """
+    folder = Path(folder_path)
+    file = folder / file_name
+
+    # Check if the folder exists
+    if not folder.exists():
+        if not create_folder:
+            print(f"The folder '{folder_path}' does not exist. Exiting...")
+            return
+        # exist_ok guards against another process creating the folder in between
+        folder.mkdir(parents=True, exist_ok=True)
+        print(f"The folder '{folder_path}' was created.")
+
+    # Check if the file already exists
+    if file.exists():
+        print(f"The file '{file_name}' already exists in the folder '{folder_path}'.")
+        return
+
+    # (connect, read) timeouts so a stalled server cannot hang the download forever
+    response = requests.get(url, timeout=(10, 300))
+    # fail loudly on 4xx/5xx instead of saving the error body as the data file
+    response.raise_for_status()
+    file.write_bytes(response.content)
+    print(f"File '{file_name}' has been downloaded to '{folder_path}'.")
+
+
+def barra2_point_downloader(base_url: str,
+                            barra2_var: list,
+                            lat_lon_point: LatLonPoint,
+                            start_datetime: str | datetime,
+                            end_datetime: str | datetime,
+                            fileout_prefix: str,
+                            fileout_folder: str = 'cache',
+                            fileout_type: str = 'csv_file') -> None:
+    """Download barra2 point data, one file per variable and per month.
+
+    For every variable in barra2_var and every date returned by
+    list_months(start_datetime, end_datetime, freq="MS"), the base url is
+    formatted with the variable, year and month, the point and time window are
+    appended as query parameters, and the result is passed to download_file.
+
+    Args:
+        base_url (str): Url template with {var}, {year} and {month} fields.
+            Use from barra2-dl.globals or set explicitly
+        barra2_var (list): Variable names to download. Use from barra2-dl.globals or set explicitly
+        lat_lon_point (LatLonPoint: TypedDict): Point to extract, as custom class from
+            barra2-dl.globals or as Dict{'lat':float, 'lon':float}
+        start_datetime (str | datetime): Used to define start of inclusive download period
+        end_datetime (str | datetime): Used to define end of inclusive download period
+        fileout_prefix (str): Prefix for downloaded file. E.g. location reference.
+        fileout_folder (str): Relative or absolute path for downloaded files; created if missing
+        fileout_type (str): Value sent as the 'accept' url parameter, e.g. 'csv_file'.
+            The saved file name ends in '.csv' whatever value is given here.
+
+    Returns:
+        None. Files are saved into fileout_folder as
+        f'{fileout_prefix}_{var}_{time_start[:10]}_{time_end[:10]}.csv'
+
+    Todo:
+        Add set list of output format options
+
+    """
+    # each variable lives under its own url, so it is requested separately
+    for var in barra2_var:
+        # each BARRA2 file covers a single month, hence one request per month
+        for month_start in list_months(start_datetime, end_datetime, freq="MS"):
+            _, days_in_month = calendar.monthrange(month_start.year, month_start.month)
+            # the request window ends one hour short of a full month after its start
+            month_end = month_start + timedelta(days=days_in_month) + timedelta(hours=-1)
+            time_start = month_start.isoformat() + 'Z'
+            time_end = month_end.isoformat() + 'Z'
+
+            # fill the url template, then append the request parameters
+            url = base_url.format(var=var, year=month_start.year, month=month_start.month)
+            url += f"?var={var}&latitude={lat_lon_point['lat']}&longitude={lat_lon_point['lon']}&time_start={time_start}&time_end={time_end}&accept={fileout_type}"
+
+            fileout_name = f'{fileout_prefix}_{var}_{time_start[:10]}_{time_end[:10]}.csv'
+            download_file(url, fileout_folder, fileout_name, create_folder=True)
+
+    return
+
+
+
+
diff --git a/barra2_dl/globals.py b/barra2_dl/globals.py
@@ -0,0 +1,62 @@
+"""
+This module contains global or default variables required to download barra2-dl data from thredds.nci.org.au.
+"""
+from typing import TypedDict
+
+# -----------------------------------------------------------------------------
+# CLASSES
+# -----------------------------------------------------------------------------
+
+class LatLonPoint(TypedDict):
+    """TypedDict describing a single point by its latitude and longitude.
+
+    Being a TypedDict, an instance is a plain dict and is read with
+    point['lat'] / point['lon'].
+
+    Attributes:
+        lat (float): latitude.
+        lon (float): longitude.
+
+    """
+    lat: float
+    lon: float
+
+
+class LatLonBBox(TypedDict):
+    """TypedDict to store a north south east west bounding box by latitude and longitude.
+
+    Attributes:
+        north (float): latitude.
+        south (float): latitude.
+        east (float): longitude.
+        west (float): longitude.
+
+    Todo:
+        Add checks to make sure co-ordinates are correct with respect to each other.
+        No such validation is performed at present.
+    """
+    north: float
+    south: float
+    east: float
+    west: float
+
+# -----------------------------------------------------------------------------
+# VARIABLES
+# -----------------------------------------------------------------------------
+
+# barra2_aus11_extents
+# NOTE(review): the values below span a 1 degree x 1 degree box, although the name
+# suggests the extents of the whole AUS-11 domain - confirm which is intended.
+barra2_aus11_lat_lon_bbox = LatLonBBox(north=-23.0, west=133.0, east=134.0, south=-24)
+
+# base thredds url for BARRA2 11km 1hour reanalysis data
+# {var}, {year} and {month} are placeholders for str.format; month is zero-padded to two digits
+barra2_aus11_csv_url = ("https://thredds.nci.org.au/thredds/ncss/grid/ob53/output/reanalysis/AUS-11/BOM/ERA5"
+                        "/historical/hres/BARRA-R2/v1/1hr/{var}/latest/"
+                        "{var}_AUS-11_ERA5_historical_hres_BOM_BARRA-R2_v1_1hr_{year}{month:02d}-{year}{month:02d}.nc")
+
+# index for barra2 used to join separate files
+barra2_index = ['time', 'station', 'latitude[unit="degrees_north"]', 'longitude[unit="degrees_east"]']
+
+# default list of BARRA2 variables to download: eastward wind (ua*) and northward wind (va*)
+# at 50m, 100m and 150m, plus air temperature at 50m (ta50m)
+barra2_var_wind_all = ["ua50m", "va50m", "ua100m", "va100m", "ua150m", "va150m", "ta50m"]
+
+# optional limited variables to test
+barra2_var_wind_50m = ["ua50m", "va50m", "ta50m"]
+
+# output file format
+# todo: add list of output format options
+point_output_format = "csv_file"
+# grid_output_format = "netcdf3"
diff --git a/barra2_dl/helpers.py b/barra2_dl/helpers.py
@@ -0,0 +1,164 @@
+"""
+This module contains helper functions.
+"""
+
+import pandas as pd
+import fnmatch
+from pathlib import Path
+from typing import List
+
+
+def list_months(start_datetime: str, end_datetime: str, freq: str = 'MS', **kwargs) -> list:
+    """Generate the list of dates between start and end datetime for the url file loop.
+
+    Thin wrapper around pandas.date_range that returns a plain list.
+
+    Args:
+        start_datetime: str or datetime-like, Left bound for generating dates.
+        end_datetime: str or datetime-like, Right bound for generating dates.
+        freq: pandas frequency string passed to date_range. Defaults to 'MS'.
+        **kwargs: Any further keyword arguments are forwarded to pandas.date_range.
+
+    Returns:
+        list: The generated dates, as returned by DatetimeIndex.tolist().
+    """
+    date_index = pd.date_range(start=start_datetime, end=end_datetime, freq=freq, **kwargs)
+    return date_index.tolist()
+
+
+def list_csv_files(folder_path):
+    """
+    Return the names of the entries matching '*.csv' directly inside a folder.
+
+    Sub-folders are not searched, and only the names (not the full paths)
+    are returned.
+
+    Args:
+        folder_path (str): The path to the folder containing the CSV files.
+
+    Returns:
+        list: A list of CSV file names in the folder.
+    """
+    return [entry.name for entry in Path(folder_path).glob('*.csv')]
+
+
+def filter_list_using_wildcard(input_list: list[str], pattern:str):
+    """
+    Keep only the strings that match a shell-style wildcard pattern.
+
+    Matching is delegated to fnmatch.filter, so the order of the input list
+    is preserved in the result.
+
+    Args:
+        input_list (list[str]): The list of strings to be filtered.
+        pattern (str): The wildcard pattern to filter the list.
+
+    Returns:
+        list: A list of strings that match the wildcard pattern.
+    """
+    return fnmatch.filter(input_list, pattern)
+
+
+def merge_csv_files_to_dataframe(filein_folder: str | Path,
+                    filename_pattern: str = '*.csv',
+                    index_for_join: str | list[str] | None = None) -> pd.DataFrame:
+    """
+    Merge csv files from a folder into a single DataFrame.
+
+    Files are selected with Path.glob using the optional filename wildcard.
+    If the wildcard is omitted all csv files in the folder are merged.
+    Files are processed in sorted path order so the result is reproducible.
+
+    Args:
+        filein_folder (str | Path): Folder containing the csv files.
+        filename_pattern (str): Wildcard used to select files. Defaults to '*.csv'.
+        index_for_join (str | list[str] | None): Column name(s) to join on. When given,
+            each further file is left-joined onto the result on these column(s), adding
+            its other columns. When None, the files are concatenated row-wise.
+
+    Returns:
+        pd.DataFrame: The merged data; an empty DataFrame if no file matches.
+
+    Todo:
+        Add .csv check for filename_pattern
+    """
+    # glob order is unspecified, so sort to make the merge order deterministic
+    matching_files = sorted(Path(filein_folder).glob(filename_pattern))
+
+    # initiate dataframe for combined csv results
+    df_combined = pd.DataFrame()
+
+    for file in matching_files:
+        # read csv file without indexing to retain time as column for join
+        df_add = pd.read_csv(file)
+        if df_combined.empty:
+            # nothing collected yet: this file becomes the base of the result
+            df_combined = df_add
+        elif index_for_join is not None:
+            # combine on index join, adding the new file's columns
+            df_combined = df_combined.join(df_add.set_index(index_for_join), on=index_for_join)
+        else:
+            # no join key given, so just stack the rows
+            df_combined = pd.concat([df_combined, df_add], ignore_index=True)
+
+    return df_combined
+
+
+def export_dataframe_to_csv(dataframe: pd.DataFrame,
+                            fileout_folder: str | Path,
+                            fileout_name: str,
+                            create_folder: bool = True) -> Path | None:
+    """
+    Export a DataFrame to a CSV file in the specified folder with the given file name.
+
+    The DataFrame index is not written to the file.
+
+    Args:
+        dataframe (pd.DataFrame): The Pandas DataFrame to export.
+        fileout_folder (str or Path): The path to the folder where the CSV file will be saved.
+        fileout_name (str): The name of the CSV file to save.
+        create_folder (bool): If True, creates the folder (including missing parents) if it
+            does not exist; otherwise prints a message and returns None without writing.
+
+    Returns:
+        Path | None: The path of the saved CSV file, or None if the folder does not
+        exist and create_folder is False.
+    """
+    fileout_folder = Path(fileout_folder)
+
+    # Check if the folder exists
+    if not fileout_folder.exists():
+        if not create_folder:
+            print(f"The folder '{fileout_folder}' does not exist. Exiting...")
+            return None
+        # exist_ok guards against another process creating the folder in between
+        fileout_folder.mkdir(parents=True, exist_ok=True)
+        print(f"The folder '{fileout_folder}' was created.")
+
+    # Define the full path for the CSV file
+    fileout_path_name = fileout_folder / fileout_name
+
+    # Export the DataFrame to CSV
+    dataframe.to_csv(fileout_path_name, index=False)
+
+    return fileout_path_name
+
+
+def get_timestamp_range_list(dataframe: pd.DataFrame, timestamp_column: str) -> List[pd.Timestamp]:
+    """
+    Get the first and last timestamp in the specified column of the DataFrame.
+
+    The column is converted with pandas.to_datetime on a copy, so the caller's
+    DataFrame is left untouched. Missing values (NaT) are ignored when
+    looking for the earliest and latest timestamp.
+
+    Args:
+        dataframe (pd.DataFrame): The DataFrame containing the timestamp column.
+        timestamp_column (str): The name of the timestamp column in the DataFrame.
+
+    Returns:
+        list: A two-element list [earliest timestamp, latest timestamp].
+
+    Raises:
+        ValueError: If the column does not exist or the DataFrame has no rows.
+    """
+    if timestamp_column not in dataframe.columns:
+        raise ValueError(f"Column '{timestamp_column}' does not exist in the DataFrame.")
+
+    # convert into a new Series rather than assigning back into the caller's frame
+    timestamps = pd.to_datetime(dataframe[timestamp_column])
+
+    if timestamps.empty:
+        raise ValueError(f"Column '{timestamp_column}' contains no rows.")
+
+    # min/max give the extremes directly; no need to sort the whole frame
+    return [timestamps.min(), timestamps.max()]