create_test_data
nkshaw23 committed Oct 24, 2024
1 parent 704340f commit b2f1cf2
Showing 2 changed files with 223 additions and 0 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
"mypy_boto3_s3",
"moto",
"polars",
"s3fs",
"tenacity",
]
name = "dri-utils"
222 changes: 222 additions & 0 deletions src/driutils/test_data/create_test_cosmos_data.py
@@ -0,0 +1,222 @@
"""Script to generate test cosmos data.
Currently used for benchmarking duckdb queries.
Data created per minute for user defined sites and date range.
Can be exported into three different s3 bucket structures:
1) Original format (no partitioning): /YYYY-MM/YYYY-MM-DD.parquet
2) Current format (partitioned by date): /date=YYYY-MM-DD/data.parquet
3) Proposed format (partitioned by date and site): /sire=site/date=YYYY-MM-DD/data.parquet
As discussed, use case for loading from multiple dataset types
(precip, soilmet) unlikely due to different resolutions.
So assuming we will just be querying one dataset at a time.
Notes:
You need to have an aws-vault session running to connect to s3
You (might) need extended permissions to write the test data to s3.
"""

import random
from datetime import date, datetime, timedelta
from typing import Tuple, Union

import duckdb
import polars as pl
import s3fs


def sterilize_dates(
    start_date: Union[date, datetime], end_date: Union[date, datetime]
) -> Tuple[datetime, datetime]:
    """
    Configures and validates start and end dates.

    Args:
        start_date: The start date.
        end_date: The end date.

    Returns:
        A tuple containing the start date and the end date as datetimes.

    Raises:
        UserWarning: If the start date is after the end date.
    """
    # Ensure the start_date is not after the end_date
    if start_date > end_date:
        raise UserWarning(f"Start date must come before end date: {start_date} > {end_date}")

    # If start_date is a plain date, convert it to a datetime at the start of the day.
    # isinstance(x, date) is also true for datetimes, so check against datetime explicitly.
    if not isinstance(start_date, datetime):
        start_date = datetime.combine(start_date, datetime.min.time())

    # If end_date is a plain date, convert it to a datetime covering the entire day
    if not isinstance(end_date, datetime):
        end_date = datetime.combine(end_date, datetime.max.time())

    return start_date, end_date
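
# Illustrative example of sterilize_dates (not part of the original script):
# plain dates are expanded to cover the whole day, e.g.
#   sterilize_dates(date(2024, 3, 28), date(2024, 3, 28))
#   -> (datetime(2024, 3, 28, 0, 0), datetime(2024, 3, 28, 23, 59, 59, 999999))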


def write_parquet_s3(bucket: str, key: str, data: pl.DataFrame) -> None:
    """Writes a dataframe to s3 as a parquet file at s3://{bucket}/{key}."""
    fs = s3fs.S3FileSystem()
    destination = f"s3://{bucket}/{key}"
    with fs.open(destination, mode="wb") as f:
        data.write_parquet(f)


def build_test_precip_data(
start_date: date, end_date: date, interval: timedelta, sites: list, schema: pl.Schema
) -> pl.DataFrame:
"""
Builds test cosmos data.
For each site, and for each datetime object at the specified interval between
the start and end date, random data is generated. The dataframe is initialised with
the supplied schema, which is taken from the dataset for which you want to create
test data.
Args:
start_date: The start date.
end_date: The end date.
interval: Interval to seperate datetime objects between the start and end date
sites: cosmos sites
schema: required schema
Returns:
A dataframe of random test data.
"""
# Create empty dataframe with the required schema
test_data = pl.DataFrame(schema=schema)

# Format dates
    start_date, end_date = sterilize_dates(start_date, end_date)

# Build datetime range series
datetime_range = pl.datetime_range(start_date, end_date, interval, eager=True).alias("time")

# Attach each datetime to each site
array = {"time": [], "SITE_ID": []}

for site in sites:
array["time"].append(datetime_range)
array["SITE_ID"].append(site)

date_site_data = pl.DataFrame(array).explode("time")

test_data = pl.concat([test_data, date_site_data], how="diagonal")

# Number of required rows
required_rows = test_data.select(pl.len()).item()

    # Fill the remaining columns with random values, skipping the ones already generated
    value_schema = {name: dtype for name, dtype in schema.items() if name not in ("time", "SITE_ID")}

    for column, dtype in value_schema.items():
        if dtype == pl.Float64:
            col_values = pl.Series(column, [random.uniform(1, 50) for _ in range(required_rows)])
        elif dtype == pl.Int64:
            col_values = pl.Series(column, [random.randrange(1, 255) for _ in range(required_rows)])
        else:
            # Leave any non-numeric columns as nulls
            continue

        test_data.replace_column(test_data.get_column_index(column), col_values)

return test_data
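
# Sizing example (illustrative): one full day at a 1-minute interval yields 1440
# timestamps per site, so e.g. 10 sites over one day gives 14,400 rows.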


def export_test_data(bucket: str, data: pl.DataFrame, structure: str = "partitioned_date") -> None:
"""Export the test data.
Data can be exported to various s3 structures:
'date': cosmos/dataset_type/YYYY-MM/YYYY-MM-DD.parquet (original format)
'date_partitioned': cosmos/dataset_type/date=YYYY-MM-DD/data.parquet (current format)
'date_site_partitioned': cosmos/dataset_type/site=site/date=YYYY-MM-DD/data.parquet (proposed format)
Args:
bucket: Name of the s3 bucket
data: Test data to be exported
structure: s3 structure. Defaults to date_partitioned (current structure)
Raises:
ValueError if invalid structure string is provided.
"""
# Save out in required structure
# Validate user input
valid_structures = ["date", "partitioned_date", "partitioned_date_site"]
if structure not in valid_structures:
        raise ValueError(f"Incorrect structure argument entered; should be one of {valid_structures}")

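    # Note: polars group_by yields (key, DataFrame) pairs where the key is a tuple,
    # hence group[0][0] below to extract the date object itself.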
groups = [(group[0][0], group[1]) for group in data.group_by(pl.col("time").dt.date())]

for date_obj, df in groups:
if structure == "date":
day = date_obj.strftime("%Y-%m-%d")
month = date_obj.strftime("%Y-%m")
key = f"cosmos/dataset=PRECIP_1MIN_2024_LOOPED/{month}/{day}.parquet"

print(df)

write_parquet_s3(bucket, key, df)

        elif structure == "partitioned_date":
day = date_obj.strftime("%Y-%m-%d")
key = f"cosmos/dataset=PRECIP_1MIN_2024_LOOPED/date={day}/data.parquet"

print(df)

write_parquet_s3(bucket, key, df)

        elif structure == "partitioned_date_site":
            site_groups = [(group[0][0], group[1]) for group in df.group_by(pl.col("SITE_ID"))]

            for site, site_df in site_groups:
day = date_obj.strftime("%Y-%m-%d")
key = f"cosmos/dataset=PRECIP_1MIN_2024_LOOPED/site={site}/date={day}/data.parquet"

print(site_df)

write_parquet_s3(bucket, key, site_df)


if __name__ == "__main__":
# Setup basic duckdb connection
conn = duckdb.connect()

conn.execute("""
INSTALL httpfs;
LOAD httpfs;
SET force_download = true;
    SET enable_profiling = 'query_tree';
""")

# Add s3 connection details
conn.execute("""
CREATE SECRET aws_secret (
TYPE S3,
PROVIDER CREDENTIAL_CHAIN,
CHAIN 'sts'
);
""")

# Load single file to get list of unique sites, and the dataset schema
bucket = "ukceh-fdri-staging-timeseries-level-0"
key = "cosmos/dataset=PRECIP_1MIN_2024_LOOPED/date=2024-01-01/*.parquet"

query = f"""SELECT * FROM read_parquet('s3://{bucket}/{key}', hive_partitioning=false)"""
df = conn.execute(query).pl()

sites = set(df.get_column("SITE_ID"))
schema = df.schema

# Build test data
test_data = build_test_precip_data(date(2024, 3, 28), date(2024, 3, 29), timedelta(minutes=1), sites, schema)

# Export test data based on required s3 structure
export_test_data(bucket, test_data, "partitioned_date_site")
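
    # Illustrative benchmark query against the proposed layout (assumes the export above
    # has completed and points at the same bucket; not part of the original script):
    #   SELECT SITE_ID, count(*)
    #   FROM read_parquet('s3://<bucket>/cosmos/dataset=PRECIP_1MIN_2024_LOOPED/site=*/date=*/data.parquet',
    #                     hive_partitioning=true)
    #   WHERE date = '2024-03-28'
    #   GROUP BY SITE_ID;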
