From 59a849bd3b79cf49a056bd0ec81478cd402852ac Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 23 Mar 2024 20:55:21 +0530 Subject: [PATCH 01/16] Simplest version of the SST Demo. --- demo/sst | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 demo/sst diff --git a/demo/sst b/demo/sst new file mode 100644 index 0000000..6e38b6e --- /dev/null +++ b/demo/sst @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +"""Demo of calculating global average sea surface temperature (SST) with SQL. + +Please run the following to access the ERA5 dataset: +``` +gcloud auth application-default login +``` +""" +import xarray as xr +import xarray_sql as qr + +# TODO(alxmrs): Add coiled or dask cluster code. + +era5_ds = xr.open_zarr( + 'gs://gcp-public-data-arco-era5/ar/' + '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', + chunks={'time': 240, 'level': 1} +) +print('dataset opened.') +# TODO(alxmrs): Slice to small time range based on script args. +era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( + level=1000, # surface level only. +) + +# chunk sizes determined from VM memory limit of 16 GB. +c = qr.Context() +c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) + +print('beginning query.') +df = c.sql(""" +SELECT + DATE("time") as date, + AVG("sea_surface_temperature") as daily_avg_sst +FROM + "era5" +GROUP BY + DATE("time") +""") + +# TODO(alxmrs): time slice should be in file name. +df.to_csv('global_avg_sst_*.cvs') \ No newline at end of file From 40ac2c80c5cee1c4a2181fc5465b57d6518732fc Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 23 Mar 2024 21:17:09 +0530 Subject: [PATCH 02/16] Added coiled cluster config. --- demo/sst | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/demo/sst b/demo/sst index 6e38b6e..178bfa4 100644 --- a/demo/sst +++ b/demo/sst @@ -9,7 +9,16 @@ gcloud auth application-default login import xarray as xr import xarray_sql as qr -# TODO(alxmrs): Add coiled or dask cluster code. +from coiled import Cluster + +cluster = Cluster( + region='us-central1', + worker_memory='16 GiB', + spot_policy='spot_with_fallback', + arm=True, +) +client = cluster.get_client() +cluster.adapt(minimum=1, maximum=100) era5_ds = xr.open_zarr( 'gs://gcp-public-data-arco-era5/ar/' @@ -22,7 +31,7 @@ era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( level=1000, # surface level only. ) -# chunk sizes determined from VM memory limit of 16 GB. +# chunk sizes determined from VM memory limit of 16 GiB. c = qr.Context() c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) From b29719a2ed1d7c1face0763e62802b497330aa06 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 23 Mar 2024 22:50:39 +0530 Subject: [PATCH 03/16] Added script arguments. --- demo/sst | 50 --------------------------------------- demo/sst.py | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 3 +++ 3 files changed, 66 insertions(+), 50 deletions(-) delete mode 100644 demo/sst create mode 100755 demo/sst.py diff --git a/demo/sst b/demo/sst deleted file mode 100644 index 178bfa4..0000000 --- a/demo/sst +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -"""Demo of calculating global average sea surface temperature (SST) with SQL. 
- -Please run the following to access the ERA5 dataset: -``` -gcloud auth application-default login -``` -""" -import xarray as xr -import xarray_sql as qr - -from coiled import Cluster - -cluster = Cluster( - region='us-central1', - worker_memory='16 GiB', - spot_policy='spot_with_fallback', - arm=True, -) -client = cluster.get_client() -cluster.adapt(minimum=1, maximum=100) - -era5_ds = xr.open_zarr( - 'gs://gcp-public-data-arco-era5/ar/' - '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', - chunks={'time': 240, 'level': 1} -) -print('dataset opened.') -# TODO(alxmrs): Slice to small time range based on script args. -era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( - level=1000, # surface level only. -) - -# chunk sizes determined from VM memory limit of 16 GiB. -c = qr.Context() -c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) - -print('beginning query.') -df = c.sql(""" -SELECT - DATE("time") as date, - AVG("sea_surface_temperature") as daily_avg_sst -FROM - "era5" -GROUP BY - DATE("time") -""") - -# TODO(alxmrs): time slice should be in file name. -df.to_csv('global_avg_sst_*.cvs') \ No newline at end of file diff --git a/demo/sst.py b/demo/sst.py new file mode 100755 index 0000000..1482ec5 --- /dev/null +++ b/demo/sst.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +"""Demo of calculating global average sea surface temperature (SST) with SQL. + +Please run the following to set up cloud resources: +``` +gcloud auth application-default login +coiled setup +``` +""" +import argparse +import xarray as xr +import xarray_sql as qr + +parser = argparse.ArgumentParser() +parser.add_argument('--start', type=str, default='2020-01-01', help='start time ISO string') +parser.add_argument('--end', type=str, default='2020-01-02', help='end time ISO string') +parser.add_argument('--cluster', action='store_true', help='deploy on coiled cluster') + +args = parser.parse_args() + +if args.cluster: + from coiled import Cluster + + cluster = Cluster( + region='us-central1', + worker_memory='16 GiB', + spot_policy='spot_with_fallback', + arm=True, + ) + client = cluster.get_client() + cluster.adapt(minimum=1, maximum=100) +else: + from dask.distributed import LocalCluster + cluster = LocalCluster(processes=False) + client = cluster.get_client() + +era5_ds = xr.open_zarr( + 'gs://gcp-public-data-arco-era5/ar/' + '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', + chunks={'time': 240, 'level': 1} +) +print('dataset opened.') +era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( + time=slice(args.start, args.end), + level=1000, # surface level only. +) + +c = qr.Context() +# chunk sizes determined from VM memory limit of 16 GiB. +c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) + +print('beginning query.') +df = c.sql(""" +SELECT + DATE("time") as date, + AVG("sea_surface_temperature") as daily_avg_sst +FROM + "era5" +GROUP BY + DATE("time") +""") + +df.to_csv(f'global_avg_sst_{args.start}-{args.end}_*.cvs') diff --git a/pyproject.toml b/pyproject.toml index 2d6eeb9..2242507 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,9 @@ dev = [ "pyink", "py-spy" ] +demo = [ + "coiled" +] [project.urls] Homepage = "https://github.com/alxmrs/xarray-sql" From e63ac4e90a15ab573abb826bcd848c12c1e11063 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sun, 24 Mar 2024 16:45:12 +0530 Subject: [PATCH 04/16] SST demo works with local fake data. 
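
For a quick end-to-end check that needs no cloud credentials, the fake-data
path can be exercised against the local cluster. A minimal sketch of that
workflow, reusing the `local_data` helper from this patch (the `COUNT(*)`
query is illustrative only, not part of the demo):

```
import xarray_sql as qr

# Random weather for one day, chunked like the real ARCO-ERA5 reads.
ds = local_data('2020-01-01', '2020-01-02').chunk({'time': 240, 'level': 1})

c = qr.Context()
c.create_table(
    'era5',
    ds[['sea_surface_temperature']].sel(level=1000),
    chunks=dict(time=24),
)
print(c.sql('SELECT COUNT(*) FROM "era5"').compute())
```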
--- demo/sst.py | 94 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 75 insertions(+), 19 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 1482ec5..6b83e75 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -11,10 +11,58 @@ import xarray as xr import xarray_sql as qr + +def local_data(start: str, end: str) -> xr.Dataset: + import numpy as np + import pandas as pd + + np.random.seed(42) + + lat = np.linspace(-90, 90, num=720) + lon = np.linspace(-180, 180, num=1440) + time = pd.date_range(start, end, freq='H') + level = np.array([1000, 500], dtype=np.int32) + reference_time = pd.Timestamp(start) + + temperature = 15 + 8 * np.random.randn(720, 1440, len(time), len(level)) + precipitation = 10 * np.random.rand(720, 1440, len(time), len(level)) + + return xr.Dataset( + data_vars=dict( + sea_surface_temperature=( + ['lat', 'lon', 'time', 'level'], + temperature, + ), + precipitation=(['lat', 'lon', 'time', 'level'], precipitation), + ), + coords=dict( + lat=lat, + lon=lon, + time=time, + level=level, + reference_time=reference_time, + ), + attrs=dict(description='Random weather.'), + ) + + parser = argparse.ArgumentParser() -parser.add_argument('--start', type=str, default='2020-01-01', help='start time ISO string') -parser.add_argument('--end', type=str, default='2020-01-02', help='end time ISO string') -parser.add_argument('--cluster', action='store_true', help='deploy on coiled cluster') +parser.add_argument( + '--start', type=str, default='2020-01-01', help='start time ISO string' +) +parser.add_argument( + '--end', type=str, default='2020-01-02', help='end time ISO string' +) +parser.add_argument( + '--cluster', + action='store_true', + help='deploy on coiled cluster, default: local cluster', +) +parser.add_argument( + '--fake', + action='store_true', + help='use local dummy data, default: ARCO-ERA5 data', +) args = parser.parse_args() @@ -22,27 +70,32 @@ from coiled import Cluster cluster = Cluster( - region='us-central1', - worker_memory='16 GiB', - spot_policy='spot_with_fallback', - arm=True, + region='us-central1', + worker_memory='16 GiB', + spot_policy='spot_with_fallback', + arm=True, ) client = cluster.get_client() cluster.adapt(minimum=1, maximum=100) else: from dask.distributed import LocalCluster + cluster = LocalCluster(processes=False) client = cluster.get_client() -era5_ds = xr.open_zarr( - 'gs://gcp-public-data-arco-era5/ar/' - '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', - chunks={'time': 240, 'level': 1} -) +if args.fake: + era5_ds = local_data(args.start, args.end).chunk({'time': 240, 'level': 1}) +else: + era5_ds = xr.open_zarr( + 'gs://gcp-public-data-arco-era5/ar/' + '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', + chunks={'time': 240, 'level': 1}, + ) + print('dataset opened.') era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( - time=slice(args.start, args.end), - level=1000, # surface level only. + time=slice(args.start, args.end), + level=1000, # surface level only. ) c = qr.Context() @@ -50,14 +103,17 @@ c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) print('beginning query.') -df = c.sql(""" +# TODO(alxmrs): `DATE` function is not supported in Apache Calcite out-of-the-box. 
+df = c.sql( + """ SELECT - DATE("time") as date, + "time", AVG("sea_surface_temperature") as daily_avg_sst FROM "era5" GROUP BY - DATE("time") -""") + "time" +""" +) -df.to_csv(f'global_avg_sst_{args.start}-{args.end}_*.cvs') +df.to_csv(f'global_avg_sst_{args.start}_to_{args.end}_*.cvs') From f39acf25e7d78bb7c87c3158828a0c87e16ddc36 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Thu, 28 Mar 2024 11:45:17 +0530 Subject: [PATCH 05/16] Renamed a method; added memory-optimized cluster option. --- demo/sst.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 6b83e75..0fe1274 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -12,7 +12,7 @@ import xarray_sql as qr -def local_data(start: str, end: str) -> xr.Dataset: +def rand_wx(start: str, end: str) -> xr.Dataset: import numpy as np import pandas as pd @@ -58,6 +58,11 @@ def local_data(start: str, end: str) -> xr.Dataset: action='store_true', help='deploy on coiled cluster, default: local cluster', ) +parser.add_argument( + '--memory-opt-cluster', + action='store_true', + help='deploy on memory-optimized coiled cluster, default: local cluster', +) parser.add_argument( '--fake', action='store_true', @@ -75,8 +80,21 @@ def local_data(start: str, end: str) -> xr.Dataset: spot_policy='spot_with_fallback', arm=True, ) + client = cluster.get_client() cluster.adapt(minimum=1, maximum=100) +elif args.mem_opt_cluster: + from coiled import Cluster + + cluster = Cluster( + region='us-central1', + spot_policy='spot_with_fallback', + worker_vm_types=['m3-ultramem-32'], + arm=True, + ) + + client = cluster.get_client() + cluster.adapt(minimum=1, maximum=50) else: from dask.distributed import LocalCluster @@ -84,7 +102,7 @@ def local_data(start: str, end: str) -> xr.Dataset: client = cluster.get_client() if args.fake: - era5_ds = local_data(args.start, args.end).chunk({'time': 240, 'level': 1}) + era5_ds = rand_wx(args.start, args.end).chunk({'time': 240, 'level': 1}) else: era5_ds = xr.open_zarr( 'gs://gcp-public-data-arco-era5/ar/' From 6256487a0a3619e5d1b729f40fa1c5ae83f980e7 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Thu, 28 Mar 2024 14:08:16 +0530 Subject: [PATCH 06/16] Details focused updates. - Using the v3 ARCO-ERA5 dataset that has the full range of data. - Looking up VM instance types to see what's appropriate - Choosing chunks based on resource and dataset size math. - Writing output to parquet when running on a cluster. 
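
Expanding on the chunk-size bullet above: one hourly step of the 0.25-degree
ARCO-ERA5 SST field is a 721 x 1440 float32 array, roughly 4 MiB, so the chunk
sizes quoted in the script comments follow directly from the `time` chunk
length. A sketch of that arithmetic (grid dimensions assumed from the
0.25-degree layout):

```
# Rough chunk-size math for 0.25-degree ERA5 SST (721 x 1440 grid, float32).
lat, lon, itemsize = 721, 1440, 4
for time_chunk in (48, 96, 192, 240, 720):
  mib = lat * lon * itemsize * time_chunk / 2**20
  print(f'time={time_chunk}: ~{mib:.0f} MiB per chunk')
# Matches the sizes quoted in the script to within rounding.
```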
--- demo/sst.py | 79 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 0fe1274..78b8b6a 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -8,21 +8,20 @@ ``` """ import argparse + +import numpy as np import xarray as xr import xarray_sql as qr -def rand_wx(start: str, end: str) -> xr.Dataset: - import numpy as np - import pandas as pd - +def rand_wx(start_time: str, end_time: str) -> xr.Dataset: + """Produce a random ARCO-ERA5-like weather dataset.""" np.random.seed(42) lat = np.linspace(-90, 90, num=720) lon = np.linspace(-180, 180, num=1440) - time = pd.date_range(start, end, freq='H') + time = xr.date_range(start_time, end_time, freq='H') level = np.array([1000, 500], dtype=np.int32) - reference_time = pd.Timestamp(start) temperature = 15 + 8 * np.random.randn(720, 1440, len(time), len(level)) precipitation = 10 * np.random.rand(720, 1440, len(time), len(level)) @@ -40,18 +39,22 @@ def rand_wx(start: str, end: str) -> xr.Dataset: lon=lon, time=time, level=level, - reference_time=reference_time, ), attrs=dict(description='Random weather.'), ) +def tfmt(time: np.datetime64, unit='h') -> str: + """Returns a bucket-friendly date string from a numpy datetime.""" + return np.datetime_as_string(time, unit=unit).replace(':', '') + + parser = argparse.ArgumentParser() parser.add_argument( - '--start', type=str, default='2020-01-01', help='start time ISO string' + '--start', type=str, default='1940-01-01', help='start time ISO string' ) parser.add_argument( - '--end', type=str, default='2020-01-02', help='end time ISO string' + '--end', type=str, default='1940-01-02', help='end time ISO string' ) parser.add_argument( '--cluster', @@ -76,25 +79,23 @@ def rand_wx(start: str, end: str) -> xr.Dataset: cluster = Cluster( region='us-central1', - worker_memory='16 GiB', spot_policy='spot_with_fallback', - arm=True, + worker_mv_types=['t2a-standard-16'], # 4 GiBs RAM per CPU, ARM. ) client = cluster.get_client() cluster.adapt(minimum=1, maximum=100) elif args.mem_opt_cluster: - from coiled import Cluster + from coiled import Cluster - cluster = Cluster( - region='us-central1', - spot_policy='spot_with_fallback', - worker_vm_types=['m3-ultramem-32'], - arm=True, - ) + cluster = Cluster( + region='us-central1', + spot_policy='spot_with_fallback', + worker_vm_types=['m3-ultramem-32'], # 30.5 GiBs RAM per CPU, x86. + ) - client = cluster.get_client() - cluster.adapt(minimum=1, maximum=50) + client = cluster.get_client() + cluster.adapt(minimum=1, maximum=25) else: from dask.distributed import LocalCluster @@ -105,20 +106,37 @@ def rand_wx(start: str, end: str) -> xr.Dataset: era5_ds = rand_wx(args.start, args.end).chunk({'time': 240, 'level': 1}) else: era5_ds = xr.open_zarr( - 'gs://gcp-public-data-arco-era5/ar/' - '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', + 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3/', chunks={'time': 240, 'level': 1}, ) + assert np.datetime64(args.start) >= np.datetime64( + '1940-01-01' + ), 'ARCO-ERA5 does not go back before 1940-01-01!' + + assert ( + np.datetime64(args.end) <= era5_ds.time[-1].values + ), f'ARCO-ERA5 does not run until {args.end}!' + print('dataset opened.') -era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( + +era5_sst_ds = era5_ds.sel( time=slice(args.start, args.end), level=1000, # surface level only. 
-) +).sea_surface_temperature + +print(f'sst_size={era5_sst_ds.nbytes / 2**40}TiBs') c = qr.Context() -# chunk sizes determined from VM memory limit of 16 GiB. -c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) +# `time=48` produces 190 MiB chunks +# `time=96` produces 380 MiB chunks +# `time=192` produces 760 MiB chunks +# `time=240` produces 950 MiB chunks +# `time=720` produces 2851 MiB chunks --> utilizes 30 GiBs memory per CPU. +time_chunks = 96 # four day chunks. +if args.mem_opt_cluster: + time_chunks = 720 # one month chunks. +c.create_table('era5', era5_sst_ds, chunks=dict(time=time_chunks)) print('beginning query.') # TODO(alxmrs): `DATE` function is not supported in Apache Calcite out-of-the-box. @@ -134,4 +152,11 @@ def rand_wx(start: str, end: str) -> xr.Dataset: """ ) -df.to_csv(f'global_avg_sst_{args.start}_to_{args.end}_*.cvs') +# Store the results for visualization later on. +start, end = tfmt(era5_sst_ds.time[0].values), tfmt(era5_sst_ds.time[-1].values) +now = tfmt(np.datetime64('now'), 's') +results_name = f'global_avg_sst_{start}_to_{end}.{now}' +if args.cluster or args.mem_opt_cluster: + df.to_parquet(f'gs://xarray-sql-experiments/{results_name}/') +else: + df.to_csv(results_name + '_*.csv') From 9dfaebfbd30b88bb6cddaa6a6c112e16f5ff39ca Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 30 Mar 2024 13:21:21 +0530 Subject: [PATCH 07/16] Fixed issues found with fake data. --- demo/sst.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 78b8b6a..99f5134 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -85,7 +85,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: client = cluster.get_client() cluster.adapt(minimum=1, maximum=100) -elif args.mem_opt_cluster: +elif args.memory_opt_cluster: from coiled import Cluster cluster = Cluster( @@ -120,10 +120,10 @@ def tfmt(time: np.datetime64, unit='h') -> str: print('dataset opened.') -era5_sst_ds = era5_ds.sel( +era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( time=slice(args.start, args.end), level=1000, # surface level only. -).sea_surface_temperature +) print(f'sst_size={era5_sst_ds.nbytes / 2**40}TiBs') @@ -134,7 +134,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: # `time=240` produces 950 MiB chunks # `time=720` produces 2851 MiB chunks --> utilizes 30 GiBs memory per CPU. time_chunks = 96 # four day chunks. -if args.mem_opt_cluster: +if args.memory_opt_cluster: time_chunks = 720 # one month chunks. c.create_table('era5', era5_sst_ds, chunks=dict(time=time_chunks)) @@ -156,7 +156,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: start, end = tfmt(era5_sst_ds.time[0].values), tfmt(era5_sst_ds.time[-1].values) now = tfmt(np.datetime64('now'), 's') results_name = f'global_avg_sst_{start}_to_{end}.{now}' -if args.cluster or args.mem_opt_cluster: +if args.cluster or args.memory_opt_cluster: df.to_parquet(f'gs://xarray-sql-experiments/{results_name}/') else: df.to_csv(results_name + '_*.csv') From 9f901bba6bc7db6f7d2167283c293ea0130b38c1 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 30 Mar 2024 17:58:30 +0530 Subject: [PATCH 08/16] Fix cluster VM argument. --- demo/sst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 99f5134..f531471 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -80,7 +80,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: cluster = Cluster( region='us-central1', spot_policy='spot_with_fallback', - worker_mv_types=['t2a-standard-16'], # 4 GiBs RAM per CPU, ARM. 
+ worker_vm_types='t2a-standard-16', # 4 GiBs RAM per CPU, ARM. ) client = cluster.get_client() @@ -91,7 +91,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: cluster = Cluster( region='us-central1', spot_policy='spot_with_fallback', - worker_vm_types=['m3-ultramem-32'], # 30.5 GiBs RAM per CPU, x86. + worker_vm_types='m3-ultramem-32', # 30.5 GiBs RAM per CPU, x86. ) client = cluster.get_client() From 433bc6e986d263441c5852d8d2d4fd1368e76150 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 30 Mar 2024 18:42:59 +0530 Subject: [PATCH 09/16] Safer alternative to time ranges. --- demo/sst.py | 50 +++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index f531471..0e034ec 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -13,14 +13,25 @@ import xarray as xr import xarray_sql as qr - -def rand_wx(start_time: str, end_time: str) -> xr.Dataset: +# Instead of letting users choose arbitrary time frames, we only allow +# the following choices. This design prevents users from accidentally +# processing way more data than they might have meant to. We don't +# want to bankrupt folks because they were off a few digits. +TIMEFRAMES = { + 'day': slice('1940-01-01', '1940-01-02'), + 'month': slice('1940-01-01', '1940-02-01'), + 'year': slice('1940-01-01', '1941-01-01'), + 'all': slice('1940-01-01', '2023-11-01'), +} + + +def rand_wx(times) -> xr.Dataset: """Produce a random ARCO-ERA5-like weather dataset.""" np.random.seed(42) lat = np.linspace(-90, 90, num=720) lon = np.linspace(-180, 180, num=1440) - time = xr.date_range(start_time, end_time, freq='H') + time = xr.date_range(times.start, times.stop, freq='H') level = np.array([1000, 500], dtype=np.int32) temperature = 15 + 8 * np.random.randn(720, 1440, len(time), len(level)) @@ -44,18 +55,8 @@ def rand_wx(start_time: str, end_time: str) -> xr.Dataset: ) -def tfmt(time: np.datetime64, unit='h') -> str: - """Returns a bucket-friendly date string from a numpy datetime.""" - return np.datetime_as_string(time, unit=unit).replace(':', '') - - parser = argparse.ArgumentParser() -parser.add_argument( - '--start', type=str, default='1940-01-01', help='start time ISO string' -) -parser.add_argument( - '--end', type=str, default='1940-01-02', help='end time ISO string' -) +parser.add_argument('--timeframe', choices=TIMEFRAMES.keys(), default='day') parser.add_argument( '--cluster', action='store_true', @@ -73,6 +74,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: ) args = parser.parse_args() +timeframe = TIMEFRAMES[args.timeframe] if args.cluster: from coiled import Cluster @@ -103,29 +105,20 @@ def tfmt(time: np.datetime64, unit='h') -> str: client = cluster.get_client() if args.fake: - era5_ds = rand_wx(args.start, args.end).chunk({'time': 240, 'level': 1}) + era5_ds = rand_wx(timeframe).chunk({'time': 240, 'level': 1}) else: era5_ds = xr.open_zarr( 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3/', chunks={'time': 240, 'level': 1}, ) - assert np.datetime64(args.start) >= np.datetime64( - '1940-01-01' - ), 'ARCO-ERA5 does not go back before 1940-01-01!' - - assert ( - np.datetime64(args.end) <= era5_ds.time[-1].values - ), f'ARCO-ERA5 does not run until {args.end}!' - print('dataset opened.') era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( - time=slice(args.start, args.end), - level=1000, # surface level only. 
+ time=timeframe, level=1000 ) -print(f'sst_size={era5_sst_ds.nbytes / 2**40}TiBs') +print(f'sst_size={era5_sst_ds.nbytes / 2**40:.5f}TiBs') c = qr.Context() # `time=48` produces 190 MiB chunks @@ -153,9 +146,8 @@ def tfmt(time: np.datetime64, unit='h') -> str: ) # Store the results for visualization later on. -start, end = tfmt(era5_sst_ds.time[0].values), tfmt(era5_sst_ds.time[-1].values) -now = tfmt(np.datetime64('now'), 's') -results_name = f'global_avg_sst_{start}_to_{end}.{now}' +now = np.datetime_as_string(np.datetime64('now'), unit='s').replace(':', '') +results_name = f'global_avg_sst_{timeframe.start}_to_{timeframe.stop}.{now}' if args.cluster or args.memory_opt_cluster: df.to_parquet(f'gs://xarray-sql-experiments/{results_name}/') else: From 530d7a0d6089b385ccb48fef7990683ce4bed97a Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 30 Mar 2024 18:53:55 +0530 Subject: [PATCH 10/16] Choices for cluster; simplifying output name. --- demo/sst.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 0e034ec..6475e20 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -24,6 +24,8 @@ 'all': slice('1940-01-01', '2023-11-01'), } +CLUSTERS = ['local', 'arm', 'mem-opt'] + def rand_wx(times) -> xr.Dataset: """Produce a random ARCO-ERA5-like weather dataset.""" @@ -59,13 +61,10 @@ def rand_wx(times) -> xr.Dataset: parser.add_argument('--timeframe', choices=TIMEFRAMES.keys(), default='day') parser.add_argument( '--cluster', - action='store_true', - help='deploy on coiled cluster, default: local cluster', -) -parser.add_argument( - '--memory-opt-cluster', - action='store_true', - help='deploy on memory-optimized coiled cluster, default: local cluster', + choices=CLUSTERS, + default='local', + help='Choose the Dask cluster type. ' + 'Either: a local cluster, ARM VMs or memory-optimized VMs in GCP via Coiled.', ) parser.add_argument( '--fake', @@ -76,7 +75,7 @@ def rand_wx(times) -> xr.Dataset: args = parser.parse_args() timeframe = TIMEFRAMES[args.timeframe] -if args.cluster: +if args.cluster == 'arm': from coiled import Cluster cluster = Cluster( @@ -87,7 +86,7 @@ def rand_wx(times) -> xr.Dataset: client = cluster.get_client() cluster.adapt(minimum=1, maximum=100) -elif args.memory_opt_cluster: +elif args.cluster == 'mem-opt': from coiled import Cluster cluster = Cluster( @@ -127,7 +126,7 @@ def rand_wx(times) -> xr.Dataset: # `time=240` produces 950 MiB chunks # `time=720` produces 2851 MiB chunks --> utilizes 30 GiBs memory per CPU. time_chunks = 96 # four day chunks. -if args.memory_opt_cluster: +if args.cluster == 'mem-opt': time_chunks = 720 # one month chunks. c.create_table('era5', era5_sst_ds, chunks=dict(time=time_chunks)) @@ -146,9 +145,9 @@ def rand_wx(times) -> xr.Dataset: ) # Store the results for visualization later on. -now = np.datetime_as_string(np.datetime64('now'), unit='s').replace(':', '') -results_name = f'global_avg_sst_{timeframe.start}_to_{timeframe.stop}.{now}' -if args.cluster or args.memory_opt_cluster: - df.to_parquet(f'gs://xarray-sql-experiments/{results_name}/') -else: +now = np.datetime64('now', 's').astype(int) +results_name = f'global_avg_sst_{args.timeframe}_{now}' +if args.cluster == 'local': df.to_csv(results_name + '_*.csv') +else: + df.to_parquet(f'gs://xarray-sql-experiments/{results_name}/') From 0256bff1118c244382bbb223e99b67731338813b Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 30 Mar 2024 19:17:51 +0530 Subject: [PATCH 11/16] Added a "small" cluster as an option. 
Added more docs for setting up cloud resources and how to run.
---
 demo/sst.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/demo/sst.py b/demo/sst.py
index 6475e20..ecaf980 100755
--- a/demo/sst.py
+++ b/demo/sst.py
@@ -4,7 +4,12 @@
 Please run the following to set up cloud resources:
 ```
 gcloud auth application-default login
-coiled setup
+coiled login
+coiled setup gcp --region us-central1
+```
+To run the demo:
+```
+./demo/sst.py --timeframe month --cluster small
 ```
 """
 import argparse
@@ -24,7 +29,7 @@
     'all': slice('1940-01-01', '2023-11-01'),
 }
 
-CLUSTERS = ['local', 'arm', 'mem-opt']
+CLUSTERS = ['local', 'small', 'arm', 'mem-opt']
 
 
 def rand_wx(times) -> xr.Dataset:
@@ -75,13 +80,23 @@ def rand_wx(times) -> xr.Dataset:
 args = parser.parse_args()
 timeframe = TIMEFRAMES[args.timeframe]
 
-if args.cluster == 'arm':
+if args.cluster == 'small':
   from coiled import Cluster
 
   cluster = Cluster(
       region='us-central1',
       spot_policy='spot_with_fallback',
-      worker_vm_types='t2a-standard-16',  # 4 GiBs RAM per CPU, ARM.
+      n_workers=8,
+  )
+
+  client = cluster.get_client()
+elif args.cluster == 'arm':
+  from coiled import Cluster
+
+  cluster = Cluster(
+      region='us-central1',
+      spot_policy='spot_with_fallback',
+      arm=True,
   )
 
   client = cluster.get_client()

From d0b3eada12de33bf2e46183a3cbb7ff1a24e7593 Mon Sep 17 00:00:00 2001
From: Alex Merose
Date: Sat, 30 Mar 2024 19:35:37 +0530
Subject: [PATCH 12/16] Fixed bug found from testing Zarr on local cluster.

---
 demo/sst.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/demo/sst.py b/demo/sst.py
index ecaf980..1f069b6 100755
--- a/demo/sst.py
+++ b/demo/sst.py
@@ -47,7 +47,7 @@ def rand_wx(times) -> xr.Dataset:
   return xr.Dataset(
       data_vars=dict(
           sea_surface_temperature=(
-              ['lat', 'lon', 'time', 'level'],
+              ['lat', 'lon', 'time'],
               temperature,
           ),
           precipitation=(['lat', 'lon', 'time', 'level'], precipitation),
@@ -129,10 +129,10 @@ def rand_wx(times) -> xr.Dataset:
 
 print('dataset opened.')
 
 era5_sst_ds = era5_ds[['sea_surface_temperature']].sel(
-    time=timeframe, level=1000
+    time=timeframe
 )
 
-print(f'sst_size={era5_sst_ds.nbytes / 2**40:.5f}TiBs')
+print(f'sst_size={era5_sst_ds.nbytes / 2**30:.5f} GiBs')

From 1517f853e339f50d4014461112c18be62ffb92f7 Mon Sep 17 00:00:00 2001
From: Alex Merose
Date: Mon, 8 Apr 2024 16:44:53 +0530
Subject: [PATCH 13/16] Adding "fake" prefix to results file name when fake data is used.

---
 demo/sst.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/demo/sst.py b/demo/sst.py
index 1f069b6..c179478 100755
--- a/demo/sst.py
+++ b/demo/sst.py
@@ -162,6 +162,8 @@ def rand_wx(times) -> xr.Dataset:
 # Store the results for visualization later on.
 now = np.datetime64('now', 's').astype(int)
 results_name = f'global_avg_sst_{args.timeframe}_{now}'
+if args.fake:
+  results_name = 'fake_' + results_name
 if args.cluster == 'local':
   df.to_csv(results_name + '_*.csv')
 else:

From 4a18c85faca9c9686334e0a2e96bf452df55742a Mon Sep 17 00:00:00 2001
From: Alex Merose
Date: Mon, 8 Apr 2024 17:08:18 +0530
Subject: [PATCH 14/16] dask-expr is needed to run on Coiled; error found on deployment.
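
The exact failure isn't captured here, but the fix is to ship dask-expr with
the package. A hypothetical pre-deployment check (not part of this patch)
that would catch the missing dependency:

```
# dask-expr must be importable wherever dask-sql runs.
import dask_expr  # raises ImportError if the extra dependency is absent
import dask_sql

print(dask_expr.__version__, dask_sql.__version__)
```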
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 2242507..24b8d4f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ classifiers = [
 dependencies = [
   "xarray",
   "dask-sql",
+  "dask-expr",
 ]
 
 [project.optional-dependencies]

From 2edab31e7ad904c0acd22a654f2363540b69c7ff Mon Sep 17 00:00:00 2001
From: Alex Merose
Date: Sat, 13 Apr 2024 09:47:44 +0400
Subject: [PATCH 15/16] Turning off dask-expr for dask-sql to work.

---
 demo/sst.py      |  2 ++
 pyproject.toml   | 11 +++++++----
 xarray_sql/df.py |  2 +-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/demo/sst.py b/demo/sst.py
index c179478..2da54b0 100755
--- a/demo/sst.py
+++ b/demo/sst.py
@@ -3,6 +3,7 @@
 
 Please run the following to set up cloud resources:
 ```
+pip install ".[demo]"
 gcloud auth application-default login
 coiled login
 coiled setup gcp --region us-central1
@@ -62,6 +63,7 @@ def rand_wx(times) -> xr.Dataset:
   )
 
 
+# TODO(alxmrs): Make spot instances a flag.
 parser = argparse.ArgumentParser()
 parser.add_argument('--timeframe', choices=TIMEFRAMES.keys(), default='day')
 parser.add_argument(
diff --git a/pyproject.toml b/pyproject.toml
index 24b8d4f..3d35110 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,22 +28,25 @@ classifiers = [
 dependencies = [
   "xarray",
   "dask-sql",
-  "dask-expr",
 ]
 
 [project.optional-dependencies]
-test = [
-  "pytest",
+io = [
   "xarray[io]",
   "gcsfs",
 ]
+test = [
+  "xarray_sql[io]",
+  "pytest",
+]
 dev = [
   "xarray_sql[test]",
   "pyink",
   "py-spy"
 ]
 demo = [
-  "coiled"
+  "xarray_sql[io]",
+  "coiled",
 ]
diff --git a/xarray_sql/df.py b/xarray_sql/df.py
index 1844328..a2c594c 100644
--- a/xarray_sql/df.py
+++ b/xarray_sql/df.py
@@ -15,7 +15,7 @@
 
 # Turn on Dask-Expr
 dask.config.set({'dataframe.query-planning-warning': False})
-dask.config.set({'dataframe.query-planning': True})
+dask.config.set({'dataframe.query-planning': False})
 
 # Turn on Copy-On-Write (needs Pandas 2.0).
 pd.options.mode.copy_on_write = True

From da8084f0210142cee476b0a364ad574abc35524c Mon Sep 17 00:00:00 2001
From: Alex Merose
Date: Sun, 9 Jun 2024 15:17:50 +0200
Subject: [PATCH 16/16] Reformatted sst script.

---
 demo/sst.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/demo/sst.py b/demo/sst.py
index 2da54b0..a0f16d3 100755
--- a/demo/sst.py
+++ b/demo/sst.py
@@ -96,9 +96,9 @@ def rand_wx(times) -> xr.Dataset:
   from coiled import Cluster
 
   cluster = Cluster(
-      region='us-central1',
-      spot_policy='spot_with_fallback',
-      arm=True,
+    region='us-central1',
+    spot_policy='spot_with_fallback',
+    arm=True,
   )
 
   client = cluster.get_client()
@@ -130,10 +130,7 @@ def rand_wx(times) -> xr.Dataset:
 
 print('dataset opened.')
 
-era5_sst_ds = era5_ds[['sea_surface_temperature']].sel(
-    time=timeframe
-)
+era5_sst_ds = era5_ds[['sea_surface_temperature']].sel(time=timeframe)
 
 print(f'sst_size={era5_sst_ds.nbytes / 2**30:.5f} GiBs')
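
A note on the `dataframe.query-planning` flip in PATCH 15: setting it to
`False` makes Dask fall back to its legacy DataFrame implementation instead
of dask-expr, which is the code path dask-sql worked with here. The option is
read when `dask.dataframe` is first imported, so it must be set before that
import, as in this sketch (assuming a 2024-era Dask where the option is still
honored):

```
import dask

# Must run before the first `import dask.dataframe`, which is when the
# query-planning option is read.
dask.config.set({'dataframe.query-planning': False})

import dask.dataframe as dd

# The legacy (non-dask-expr) DataFrame implementation is now active.
print(dask.config.get('dataframe.query-planning'))  # False
```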