From 59a849bd3b79cf49a056bd0ec81478cd402852ac Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 23 Mar 2024 20:55:21 +0530 Subject: [PATCH 01/16] Simplest version of the SST Demo. --- demo/sst | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 demo/sst diff --git a/demo/sst b/demo/sst new file mode 100644 index 0000000..6e38b6e --- /dev/null +++ b/demo/sst @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +"""Demo of calculating global average sea surface temperature (SST) with SQL. + +Please run the following to access the ERA5 dataset: +``` +gcloud auth application-default login +``` +""" +import xarray as xr +import xarray_sql as qr + +# TODO(alxmrs): Add coiled or dask cluster code. + +era5_ds = xr.open_zarr( + 'gs://gcp-public-data-arco-era5/ar/' + '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', + chunks={'time': 240, 'level': 1} +) +print('dataset opened.') +# TODO(alxmrs): Slice to small time range based on script args. +era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( + level=1000, # surface level only. +) + +# chunk sizes determined from VM memory limit of 16 GB. +c = qr.Context() +c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) + +print('beginning query.') +df = c.sql(""" +SELECT + DATE("time") as date, + AVG("sea_surface_temperature") as daily_avg_sst +FROM + "era5" +GROUP BY + DATE("time") +""") + +# TODO(alxmrs): time slice should be in file name. +df.to_csv('global_avg_sst_*.cvs') \ No newline at end of file From 40ac2c80c5cee1c4a2181fc5465b57d6518732fc Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 23 Mar 2024 21:17:09 +0530 Subject: [PATCH 02/16] Added coiled cluster config. --- demo/sst | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/demo/sst b/demo/sst index 6e38b6e..178bfa4 100644 --- a/demo/sst +++ b/demo/sst @@ -9,7 +9,16 @@ gcloud auth application-default login import xarray as xr import xarray_sql as qr -# TODO(alxmrs): Add coiled or dask cluster code. +from coiled import Cluster + +cluster = Cluster( + region='us-central1', + worker_memory='16 GiB', + spot_policy='spot_with_fallback', + arm=True, +) +client = cluster.get_client() +cluster.adapt(minimum=1, maximum=100) era5_ds = xr.open_zarr( 'gs://gcp-public-data-arco-era5/ar/' @@ -22,7 +31,7 @@ era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( level=1000, # surface level only. ) -# chunk sizes determined from VM memory limit of 16 GB. +# chunk sizes determined from VM memory limit of 16 GiB. c = qr.Context() c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) From b29719a2ed1d7c1face0763e62802b497330aa06 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 23 Mar 2024 22:50:39 +0530 Subject: [PATCH 03/16] Added script arguments. --- demo/sst | 50 --------------------------------------- demo/sst.py | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 3 +++ 3 files changed, 66 insertions(+), 50 deletions(-) delete mode 100644 demo/sst create mode 100755 demo/sst.py diff --git a/demo/sst b/demo/sst deleted file mode 100644 index 178bfa4..0000000 --- a/demo/sst +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -"""Demo of calculating global average sea surface temperature (SST) with SQL. 
- -Please run the following to access the ERA5 dataset: -``` -gcloud auth application-default login -``` -""" -import xarray as xr -import xarray_sql as qr - -from coiled import Cluster - -cluster = Cluster( - region='us-central1', - worker_memory='16 GiB', - spot_policy='spot_with_fallback', - arm=True, -) -client = cluster.get_client() -cluster.adapt(minimum=1, maximum=100) - -era5_ds = xr.open_zarr( - 'gs://gcp-public-data-arco-era5/ar/' - '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', - chunks={'time': 240, 'level': 1} -) -print('dataset opened.') -# TODO(alxmrs): Slice to small time range based on script args. -era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( - level=1000, # surface level only. -) - -# chunk sizes determined from VM memory limit of 16 GiB. -c = qr.Context() -c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) - -print('beginning query.') -df = c.sql(""" -SELECT - DATE("time") as date, - AVG("sea_surface_temperature") as daily_avg_sst -FROM - "era5" -GROUP BY - DATE("time") -""") - -# TODO(alxmrs): time slice should be in file name. -df.to_csv('global_avg_sst_*.cvs') \ No newline at end of file diff --git a/demo/sst.py b/demo/sst.py new file mode 100755 index 0000000..1482ec5 --- /dev/null +++ b/demo/sst.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +"""Demo of calculating global average sea surface temperature (SST) with SQL. + +Please run the following to set up cloud resources: +``` +gcloud auth application-default login +coiled setup +``` +""" +import argparse +import xarray as xr +import xarray_sql as qr + +parser = argparse.ArgumentParser() +parser.add_argument('--start', type=str, default='2020-01-01', help='start time ISO string') +parser.add_argument('--end', type=str, default='2020-01-02', help='end time ISO string') +parser.add_argument('--cluster', action='store_true', help='deploy on coiled cluster') + +args = parser.parse_args() + +if args.cluster: + from coiled import Cluster + + cluster = Cluster( + region='us-central1', + worker_memory='16 GiB', + spot_policy='spot_with_fallback', + arm=True, + ) + client = cluster.get_client() + cluster.adapt(minimum=1, maximum=100) +else: + from dask.distributed import LocalCluster + cluster = LocalCluster(processes=False) + client = cluster.get_client() + +era5_ds = xr.open_zarr( + 'gs://gcp-public-data-arco-era5/ar/' + '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', + chunks={'time': 240, 'level': 1} +) +print('dataset opened.') +era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( + time=slice(args.start, args.end), + level=1000, # surface level only. +) + +c = qr.Context() +# chunk sizes determined from VM memory limit of 16 GiB. +c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) + +print('beginning query.') +df = c.sql(""" +SELECT + DATE("time") as date, + AVG("sea_surface_temperature") as daily_avg_sst +FROM + "era5" +GROUP BY + DATE("time") +""") + +df.to_csv(f'global_avg_sst_{args.start}-{args.end}_*.cvs') diff --git a/pyproject.toml b/pyproject.toml index 2d6eeb9..2242507 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,9 @@ dev = [ "pyink", "py-spy" ] +demo = [ + "coiled" +] [project.urls] Homepage = "https://github.com/alxmrs/xarray-sql" From e63ac4e90a15ab573abb826bcd848c12c1e11063 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sun, 24 Mar 2024 16:45:12 +0530 Subject: [PATCH 04/16] SST demo works with local fake data. 
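
For a quick end-to-end check that needs no cloud credentials, the fake-data
path can be exercised against the local cluster. A minimal sketch of that
workflow, reusing the `local_data` helper from this patch (the `COUNT(*)`
query is illustrative only, not part of the demo):

```
import xarray_sql as qr

# Random weather for one day, chunked like the real ARCO-ERA5 reads.
ds = local_data('2020-01-01', '2020-01-02').chunk({'time': 240, 'level': 1})

c = qr.Context()
c.create_table(
    'era5',
    ds[['sea_surface_temperature']].sel(level=1000),
    chunks=dict(time=24),
)
print(c.sql('SELECT COUNT(*) FROM "era5"').compute())
```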
--- demo/sst.py | 94 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 75 insertions(+), 19 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 1482ec5..6b83e75 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -11,10 +11,58 @@ import xarray as xr import xarray_sql as qr + +def local_data(start: str, end: str) -> xr.Dataset: + import numpy as np + import pandas as pd + + np.random.seed(42) + + lat = np.linspace(-90, 90, num=720) + lon = np.linspace(-180, 180, num=1440) + time = pd.date_range(start, end, freq='H') + level = np.array([1000, 500], dtype=np.int32) + reference_time = pd.Timestamp(start) + + temperature = 15 + 8 * np.random.randn(720, 1440, len(time), len(level)) + precipitation = 10 * np.random.rand(720, 1440, len(time), len(level)) + + return xr.Dataset( + data_vars=dict( + sea_surface_temperature=( + ['lat', 'lon', 'time', 'level'], + temperature, + ), + precipitation=(['lat', 'lon', 'time', 'level'], precipitation), + ), + coords=dict( + lat=lat, + lon=lon, + time=time, + level=level, + reference_time=reference_time, + ), + attrs=dict(description='Random weather.'), + ) + + parser = argparse.ArgumentParser() -parser.add_argument('--start', type=str, default='2020-01-01', help='start time ISO string') -parser.add_argument('--end', type=str, default='2020-01-02', help='end time ISO string') -parser.add_argument('--cluster', action='store_true', help='deploy on coiled cluster') +parser.add_argument( + '--start', type=str, default='2020-01-01', help='start time ISO string' +) +parser.add_argument( + '--end', type=str, default='2020-01-02', help='end time ISO string' +) +parser.add_argument( + '--cluster', + action='store_true', + help='deploy on coiled cluster, default: local cluster', +) +parser.add_argument( + '--fake', + action='store_true', + help='use local dummy data, default: ARCO-ERA5 data', +) args = parser.parse_args() @@ -22,27 +70,32 @@ from coiled import Cluster cluster = Cluster( - region='us-central1', - worker_memory='16 GiB', - spot_policy='spot_with_fallback', - arm=True, + region='us-central1', + worker_memory='16 GiB', + spot_policy='spot_with_fallback', + arm=True, ) client = cluster.get_client() cluster.adapt(minimum=1, maximum=100) else: from dask.distributed import LocalCluster + cluster = LocalCluster(processes=False) client = cluster.get_client() -era5_ds = xr.open_zarr( - 'gs://gcp-public-data-arco-era5/ar/' - '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', - chunks={'time': 240, 'level': 1} -) +if args.fake: + era5_ds = local_data(args.start, args.end).chunk({'time': 240, 'level': 1}) +else: + era5_ds = xr.open_zarr( + 'gs://gcp-public-data-arco-era5/ar/' + '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', + chunks={'time': 240, 'level': 1}, + ) + print('dataset opened.') era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( - time=slice(args.start, args.end), - level=1000, # surface level only. + time=slice(args.start, args.end), + level=1000, # surface level only. ) c = qr.Context() @@ -50,14 +103,17 @@ c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) print('beginning query.') -df = c.sql(""" +# TODO(alxmrs): `DATE` function is not supported in Apache Calcite out-of-the-box. 
+df = c.sql( + """ SELECT - DATE("time") as date, + "time", AVG("sea_surface_temperature") as daily_avg_sst FROM "era5" GROUP BY - DATE("time") -""") + "time" +""" +) -df.to_csv(f'global_avg_sst_{args.start}-{args.end}_*.cvs') +df.to_csv(f'global_avg_sst_{args.start}_to_{args.end}_*.cvs') From f39acf25e7d78bb7c87c3158828a0c87e16ddc36 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Thu, 28 Mar 2024 11:45:17 +0530 Subject: [PATCH 05/16] Renamed a method; added memory-optimized cluster option. --- demo/sst.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 6b83e75..0fe1274 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -12,7 +12,7 @@ import xarray_sql as qr -def local_data(start: str, end: str) -> xr.Dataset: +def rand_wx(start: str, end: str) -> xr.Dataset: import numpy as np import pandas as pd @@ -58,6 +58,11 @@ def local_data(start: str, end: str) -> xr.Dataset: action='store_true', help='deploy on coiled cluster, default: local cluster', ) +parser.add_argument( + '--memory-opt-cluster', + action='store_true', + help='deploy on memory-optimized coiled cluster, default: local cluster', +) parser.add_argument( '--fake', action='store_true', @@ -75,8 +80,21 @@ def local_data(start: str, end: str) -> xr.Dataset: spot_policy='spot_with_fallback', arm=True, ) + client = cluster.get_client() cluster.adapt(minimum=1, maximum=100) +elif args.mem_opt_cluster: + from coiled import Cluster + + cluster = Cluster( + region='us-central1', + spot_policy='spot_with_fallback', + worker_vm_types=['m3-ultramem-32'], + arm=True, + ) + + client = cluster.get_client() + cluster.adapt(minimum=1, maximum=50) else: from dask.distributed import LocalCluster @@ -84,7 +102,7 @@ def local_data(start: str, end: str) -> xr.Dataset: client = cluster.get_client() if args.fake: - era5_ds = local_data(args.start, args.end).chunk({'time': 240, 'level': 1}) + era5_ds = rand_wx(args.start, args.end).chunk({'time': 240, 'level': 1}) else: era5_ds = xr.open_zarr( 'gs://gcp-public-data-arco-era5/ar/' From 6256487a0a3619e5d1b729f40fa1c5ae83f980e7 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Thu, 28 Mar 2024 14:08:16 +0530 Subject: [PATCH 06/16] Details focused updates. - Using the v3 ARCO-ERA5 dataset that has the full range of data. - Looking up VM instance types to see what's appropriate - Choosing chunks based on resource and dataset size math. - Writing output to parquet when running on a cluster. 
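
Expanding on the chunk-size bullet above: one hourly step of the 0.25-degree
ARCO-ERA5 SST field is a 721 x 1440 float32 array, roughly 4 MiB, so the chunk
sizes quoted in the script comments follow directly from the `time` chunk
length. A sketch of that arithmetic (grid dimensions assumed from the
0.25-degree layout):

```
# Rough chunk-size math for 0.25-degree ERA5 SST (721 x 1440 grid, float32).
lat, lon, itemsize = 721, 1440, 4
for time_chunk in (48, 96, 192, 240, 720):
  mib = lat * lon * itemsize * time_chunk / 2**20
  print(f'time={time_chunk}: ~{mib:.0f} MiB per chunk')
# Matches the sizes quoted in the script to within rounding.
```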
--- demo/sst.py | 79 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 0fe1274..78b8b6a 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -8,21 +8,20 @@ ``` """ import argparse + +import numpy as np import xarray as xr import xarray_sql as qr -def rand_wx(start: str, end: str) -> xr.Dataset: - import numpy as np - import pandas as pd - +def rand_wx(start_time: str, end_time: str) -> xr.Dataset: + """Produce a random ARCO-ERA5-like weather dataset.""" np.random.seed(42) lat = np.linspace(-90, 90, num=720) lon = np.linspace(-180, 180, num=1440) - time = pd.date_range(start, end, freq='H') + time = xr.date_range(start_time, end_time, freq='H') level = np.array([1000, 500], dtype=np.int32) - reference_time = pd.Timestamp(start) temperature = 15 + 8 * np.random.randn(720, 1440, len(time), len(level)) precipitation = 10 * np.random.rand(720, 1440, len(time), len(level)) @@ -40,18 +39,22 @@ def rand_wx(start: str, end: str) -> xr.Dataset: lon=lon, time=time, level=level, - reference_time=reference_time, ), attrs=dict(description='Random weather.'), ) +def tfmt(time: np.datetime64, unit='h') -> str: + """Returns a bucket-friendly date string from a numpy datetime.""" + return np.datetime_as_string(time, unit=unit).replace(':', '') + + parser = argparse.ArgumentParser() parser.add_argument( - '--start', type=str, default='2020-01-01', help='start time ISO string' + '--start', type=str, default='1940-01-01', help='start time ISO string' ) parser.add_argument( - '--end', type=str, default='2020-01-02', help='end time ISO string' + '--end', type=str, default='1940-01-02', help='end time ISO string' ) parser.add_argument( '--cluster', @@ -76,25 +79,23 @@ def rand_wx(start: str, end: str) -> xr.Dataset: cluster = Cluster( region='us-central1', - worker_memory='16 GiB', spot_policy='spot_with_fallback', - arm=True, + worker_mv_types=['t2a-standard-16'], # 4 GiBs RAM per CPU, ARM. ) client = cluster.get_client() cluster.adapt(minimum=1, maximum=100) elif args.mem_opt_cluster: - from coiled import Cluster + from coiled import Cluster - cluster = Cluster( - region='us-central1', - spot_policy='spot_with_fallback', - worker_vm_types=['m3-ultramem-32'], - arm=True, - ) + cluster = Cluster( + region='us-central1', + spot_policy='spot_with_fallback', + worker_vm_types=['m3-ultramem-32'], # 30.5 GiBs RAM per CPU, x86. + ) - client = cluster.get_client() - cluster.adapt(minimum=1, maximum=50) + client = cluster.get_client() + cluster.adapt(minimum=1, maximum=25) else: from dask.distributed import LocalCluster @@ -105,20 +106,37 @@ def rand_wx(start: str, end: str) -> xr.Dataset: era5_ds = rand_wx(args.start, args.end).chunk({'time': 240, 'level': 1}) else: era5_ds = xr.open_zarr( - 'gs://gcp-public-data-arco-era5/ar/' - '1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2', + 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3/', chunks={'time': 240, 'level': 1}, ) + assert np.datetime64(args.start) >= np.datetime64( + '1940-01-01' + ), 'ARCO-ERA5 does not go back before 1940-01-01!' + + assert ( + np.datetime64(args.end) <= era5_ds.time[-1].values + ), f'ARCO-ERA5 does not run until {args.end}!' + print('dataset opened.') -era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( + +era5_sst_ds = era5_ds.sel( time=slice(args.start, args.end), level=1000, # surface level only. 
-) +).sea_surface_temperature + +print(f'sst_size={era5_sst_ds.nbytes / 2**40}TiBs') c = qr.Context() -# chunk sizes determined from VM memory limit of 16 GiB. -c.create_table('era5', era5_sst_ds, chunks=dict(time=24)) +# `time=48` produces 190 MiB chunks +# `time=96` produces 380 MiB chunks +# `time=192` produces 760 MiB chunks +# `time=240` produces 950 MiB chunks +# `time=720` produces 2851 MiB chunks --> utilizes 30 GiBs memory per CPU. +time_chunks = 96 # four day chunks. +if args.mem_opt_cluster: + time_chunks = 720 # one month chunks. +c.create_table('era5', era5_sst_ds, chunks=dict(time=time_chunks)) print('beginning query.') # TODO(alxmrs): `DATE` function is not supported in Apache Calcite out-of-the-box. @@ -134,4 +152,11 @@ def rand_wx(start: str, end: str) -> xr.Dataset: """ ) -df.to_csv(f'global_avg_sst_{args.start}_to_{args.end}_*.cvs') +# Store the results for visualization later on. +start, end = tfmt(era5_sst_ds.time[0].values), tfmt(era5_sst_ds.time[-1].values) +now = tfmt(np.datetime64('now'), 's') +results_name = f'global_avg_sst_{start}_to_{end}.{now}' +if args.cluster or args.mem_opt_cluster: + df.to_parquet(f'gs://xarray-sql-experiments/{results_name}/') +else: + df.to_csv(results_name + '_*.csv') From 9dfaebfbd30b88bb6cddaa6a6c112e16f5ff39ca Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 30 Mar 2024 13:21:21 +0530 Subject: [PATCH 07/16] Fixed issues found with fake data. --- demo/sst.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 78b8b6a..99f5134 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -85,7 +85,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: client = cluster.get_client() cluster.adapt(minimum=1, maximum=100) -elif args.mem_opt_cluster: +elif args.memory_opt_cluster: from coiled import Cluster cluster = Cluster( @@ -120,10 +120,10 @@ def tfmt(time: np.datetime64, unit='h') -> str: print('dataset opened.') -era5_sst_ds = era5_ds.sel( +era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( time=slice(args.start, args.end), level=1000, # surface level only. -).sea_surface_temperature +) print(f'sst_size={era5_sst_ds.nbytes / 2**40}TiBs') @@ -134,7 +134,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: # `time=240` produces 950 MiB chunks # `time=720` produces 2851 MiB chunks --> utilizes 30 GiBs memory per CPU. time_chunks = 96 # four day chunks. -if args.mem_opt_cluster: +if args.memory_opt_cluster: time_chunks = 720 # one month chunks. c.create_table('era5', era5_sst_ds, chunks=dict(time=time_chunks)) @@ -156,7 +156,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: start, end = tfmt(era5_sst_ds.time[0].values), tfmt(era5_sst_ds.time[-1].values) now = tfmt(np.datetime64('now'), 's') results_name = f'global_avg_sst_{start}_to_{end}.{now}' -if args.cluster or args.mem_opt_cluster: +if args.cluster or args.memory_opt_cluster: df.to_parquet(f'gs://xarray-sql-experiments/{results_name}/') else: df.to_csv(results_name + '_*.csv') From 9f901bba6bc7db6f7d2167283c293ea0130b38c1 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 30 Mar 2024 17:58:30 +0530 Subject: [PATCH 08/16] Fix cluster VM argument. --- demo/sst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 99f5134..f531471 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -80,7 +80,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: cluster = Cluster( region='us-central1', spot_policy='spot_with_fallback', - worker_mv_types=['t2a-standard-16'], # 4 GiBs RAM per CPU, ARM. 
+ worker_vm_types='t2a-standard-16', # 4 GiBs RAM per CPU, ARM. ) client = cluster.get_client() @@ -91,7 +91,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: cluster = Cluster( region='us-central1', spot_policy='spot_with_fallback', - worker_vm_types=['m3-ultramem-32'], # 30.5 GiBs RAM per CPU, x86. + worker_vm_types='m3-ultramem-32', # 30.5 GiBs RAM per CPU, x86. ) client = cluster.get_client() From 433bc6e986d263441c5852d8d2d4fd1368e76150 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 30 Mar 2024 18:42:59 +0530 Subject: [PATCH 09/16] Safer alternative to time ranges. --- demo/sst.py | 50 +++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index f531471..0e034ec 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -13,14 +13,25 @@ import xarray as xr import xarray_sql as qr - -def rand_wx(start_time: str, end_time: str) -> xr.Dataset: +# Instead of letting users choose arbitrary time frames, we only allow +# the following choices. This design prevents users from accidentally +# processing way more data than they might have meant to. We don't +# want to bankrupt folks because they were off a few digits. +TIMEFRAMES = { + 'day': slice('1940-01-01', '1940-01-02'), + 'month': slice('1940-01-01', '1940-02-01'), + 'year': slice('1940-01-01', '1941-01-01'), + 'all': slice('1940-01-01', '2023-11-01'), +} + + +def rand_wx(times) -> xr.Dataset: """Produce a random ARCO-ERA5-like weather dataset.""" np.random.seed(42) lat = np.linspace(-90, 90, num=720) lon = np.linspace(-180, 180, num=1440) - time = xr.date_range(start_time, end_time, freq='H') + time = xr.date_range(times.start, times.stop, freq='H') level = np.array([1000, 500], dtype=np.int32) temperature = 15 + 8 * np.random.randn(720, 1440, len(time), len(level)) @@ -44,18 +55,8 @@ def rand_wx(start_time: str, end_time: str) -> xr.Dataset: ) -def tfmt(time: np.datetime64, unit='h') -> str: - """Returns a bucket-friendly date string from a numpy datetime.""" - return np.datetime_as_string(time, unit=unit).replace(':', '') - - parser = argparse.ArgumentParser() -parser.add_argument( - '--start', type=str, default='1940-01-01', help='start time ISO string' -) -parser.add_argument( - '--end', type=str, default='1940-01-02', help='end time ISO string' -) +parser.add_argument('--timeframe', choices=TIMEFRAMES.keys(), default='day') parser.add_argument( '--cluster', action='store_true', @@ -73,6 +74,7 @@ def tfmt(time: np.datetime64, unit='h') -> str: ) args = parser.parse_args() +timeframe = TIMEFRAMES[args.timeframe] if args.cluster: from coiled import Cluster @@ -103,29 +105,20 @@ def tfmt(time: np.datetime64, unit='h') -> str: client = cluster.get_client() if args.fake: - era5_ds = rand_wx(args.start, args.end).chunk({'time': 240, 'level': 1}) + era5_ds = rand_wx(timeframe).chunk({'time': 240, 'level': 1}) else: era5_ds = xr.open_zarr( 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3/', chunks={'time': 240, 'level': 1}, ) - assert np.datetime64(args.start) >= np.datetime64( - '1940-01-01' - ), 'ARCO-ERA5 does not go back before 1940-01-01!' - - assert ( - np.datetime64(args.end) <= era5_ds.time[-1].values - ), f'ARCO-ERA5 does not run until {args.end}!' - print('dataset opened.') era5_sst_ds = era5_ds[['sea_surface_temperature']].sel( - time=slice(args.start, args.end), - level=1000, # surface level only. 
+ time=timeframe, level=1000 ) -print(f'sst_size={era5_sst_ds.nbytes / 2**40}TiBs') +print(f'sst_size={era5_sst_ds.nbytes / 2**40:.5f}TiBs') c = qr.Context() # `time=48` produces 190 MiB chunks @@ -153,9 +146,8 @@ def tfmt(time: np.datetime64, unit='h') -> str: ) # Store the results for visualization later on. -start, end = tfmt(era5_sst_ds.time[0].values), tfmt(era5_sst_ds.time[-1].values) -now = tfmt(np.datetime64('now'), 's') -results_name = f'global_avg_sst_{start}_to_{end}.{now}' +now = np.datetime_as_string(np.datetime64('now'), unit='s').replace(':', '') +results_name = f'global_avg_sst_{timeframe.start}_to_{timeframe.stop}.{now}' if args.cluster or args.memory_opt_cluster: df.to_parquet(f'gs://xarray-sql-experiments/{results_name}/') else: From 530d7a0d6089b385ccb48fef7990683ce4bed97a Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 30 Mar 2024 18:53:55 +0530 Subject: [PATCH 10/16] Choices for cluster; simplifying output name. --- demo/sst.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/demo/sst.py b/demo/sst.py index 0e034ec..6475e20 100755 --- a/demo/sst.py +++ b/demo/sst.py @@ -24,6 +24,8 @@ 'all': slice('1940-01-01', '2023-11-01'), } +CLUSTERS = ['local', 'arm', 'mem-opt'] + def rand_wx(times) -> xr.Dataset: """Produce a random ARCO-ERA5-like weather dataset.""" @@ -59,13 +61,10 @@ def rand_wx(times) -> xr.Dataset: parser.add_argument('--timeframe', choices=TIMEFRAMES.keys(), default='day') parser.add_argument( '--cluster', - action='store_true', - help='deploy on coiled cluster, default: local cluster', -) -parser.add_argument( - '--memory-opt-cluster', - action='store_true', - help='deploy on memory-optimized coiled cluster, default: local cluster', + choices=CLUSTERS, + default='local', + help='Choose the Dask cluster type. ' + 'Either: a local cluster, ARM VMs or memory-optimized VMs in GCP via Coiled.', ) parser.add_argument( '--fake', @@ -76,7 +75,7 @@ def rand_wx(times) -> xr.Dataset: args = parser.parse_args() timeframe = TIMEFRAMES[args.timeframe] -if args.cluster: +if args.cluster == 'arm': from coiled import Cluster cluster = Cluster( @@ -87,7 +86,7 @@ def rand_wx(times) -> xr.Dataset: client = cluster.get_client() cluster.adapt(minimum=1, maximum=100) -elif args.memory_opt_cluster: +elif args.cluster == 'mem-opt': from coiled import Cluster cluster = Cluster( @@ -127,7 +126,7 @@ def rand_wx(times) -> xr.Dataset: # `time=240` produces 950 MiB chunks # `time=720` produces 2851 MiB chunks --> utilizes 30 GiBs memory per CPU. time_chunks = 96 # four day chunks. -if args.memory_opt_cluster: +if args.cluster == 'mem-opt': time_chunks = 720 # one month chunks. c.create_table('era5', era5_sst_ds, chunks=dict(time=time_chunks)) @@ -146,9 +145,9 @@ def rand_wx(times) -> xr.Dataset: ) # Store the results for visualization later on. -now = np.datetime_as_string(np.datetime64('now'), unit='s').replace(':', '') -results_name = f'global_avg_sst_{timeframe.start}_to_{timeframe.stop}.{now}' -if args.cluster or args.memory_opt_cluster: - df.to_parquet(f'gs://xarray-sql-experiments/{results_name}/') -else: +now = np.datetime64('now', 's').astype(int) +results_name = f'global_avg_sst_{args.timeframe}_{now}' +if args.cluster == 'local': df.to_csv(results_name + '_*.csv') +else: + df.to_parquet(f'gs://xarray-sql-experiments/{results_name}/') From 0256bff1118c244382bbb223e99b67731338813b Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Sat, 30 Mar 2024 19:17:51 +0530 Subject: [PATCH 11/16] Added a "small" cluster as an option. 
Added more docs for setting up cloud resources and how to run.
---
 demo/sst.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/demo/sst.py b/demo/sst.py
index 6475e20..ecaf980 100755
--- a/demo/sst.py
+++ b/demo/sst.py
@@ -4,7 +4,12 @@
 Please run the following to set up cloud resources:
 ```
 gcloud auth application-default login
-coiled setup
+coiled login
+coiled setup gcp --region us-central1
+```
+To run the demo:
+```
+./demo/sst.py --timeframe month --cluster small
 ```
 """
 import argparse
@@ -24,7 +29,7 @@
     'all': slice('1940-01-01', '2023-11-01'),
 }
 
-CLUSTERS = ['local', 'arm', 'mem-opt']
+CLUSTERS = ['local', 'small', 'arm', 'mem-opt']
 
 
 def rand_wx(times) -> xr.Dataset:
@@ -75,13 +80,23 @@ def rand_wx(times) -> xr.Dataset:
 args = parser.parse_args()
 timeframe = TIMEFRAMES[args.timeframe]
 
-if args.cluster == 'arm':
+if args.cluster == 'small':
   from coiled import Cluster
 
   cluster = Cluster(
       region='us-central1',
       spot_policy='spot_with_fallback',
-      worker_vm_types='t2a-standard-16',  # 4 GiBs RAM per CPU, ARM.
+      n_workers=8,
+  )
+
+  client = cluster.get_client()
+elif args.cluster == 'arm':
+  from coiled import Cluster
+
+  cluster = Cluster(
+      region='us-central1',
+      spot_policy='spot_with_fallback',
+      arm=True,
   )
 
   client = cluster.get_client()

From d0b3eada12de33bf2e46183a3cbb7ff1a24e7593 Mon Sep 17 00:00:00 2001
From: Alex Merose
Date: Sat, 30 Mar 2024 19:35:37 +0530
Subject: [PATCH 12/16] Fixed bug found from testing Zarr on local cluster.

---
 demo/sst.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/demo/sst.py b/demo/sst.py
index ecaf980..1f069b6 100755
--- a/demo/sst.py
+++ b/demo/sst.py
@@ -47,7 +47,7 @@ def rand_wx(times) -> xr.Dataset:
   return xr.Dataset(
       data_vars=dict(
           sea_surface_temperature=(
-              ['lat', 'lon', 'time', 'level'],
+              ['lat', 'lon', 'time'],
               temperature,
           ),
           precipitation=(['lat', 'lon', 'time', 'level'], precipitation),
@@ -129,10 +129,10 @@ def rand_wx(times) -> xr.Dataset:
 
 print('dataset opened.')
 
 era5_sst_ds = era5_ds[['sea_surface_temperature']].sel(
-    time=timeframe, level=1000
+    time=timeframe
 )
 
-print(f'sst_size={era5_sst_ds.nbytes / 2**40:.5f}TiBs')
+print(f'sst_size={era5_sst_ds.nbytes / 2**30:.5f} GiBs')

From 1517f853e339f50d4014461112c18be62ffb92f7 Mon Sep 17 00:00:00 2001
From: Alex Merose
Date: Mon, 8 Apr 2024 16:44:53 +0530
Subject: [PATCH 13/16] Adding "fake" prefix to results file name when fake data is used.

---
 demo/sst.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/demo/sst.py b/demo/sst.py
index 1f069b6..c179478 100755
--- a/demo/sst.py
+++ b/demo/sst.py
@@ -162,6 +162,8 @@ def rand_wx(times) -> xr.Dataset:
 # Store the results for visualization later on.
 now = np.datetime64('now', 's').astype(int)
 results_name = f'global_avg_sst_{args.timeframe}_{now}'
+if args.fake:
+  results_name = 'fake_' + results_name
 if args.cluster == 'local':
   df.to_csv(results_name + '_*.csv')
 else:

From 4a18c85faca9c9686334e0a2e96bf452df55742a Mon Sep 17 00:00:00 2001
From: Alex Merose
Date: Mon, 8 Apr 2024 17:08:18 +0530
Subject: [PATCH 14/16] dask-expr is needed to run on Coiled; error found on deployment.
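
The exact failure isn't captured here, but the fix is to ship dask-expr with
the package. A hypothetical pre-deployment check (not part of this patch)
that would catch the missing dependency:

```
# dask-expr must be importable wherever dask-sql runs.
import dask_expr  # raises ImportError if the extra dependency is absent
import dask_sql

print(dask_expr.__version__, dask_sql.__version__)
```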
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 2242507..24b8d4f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ classifiers = [
 dependencies = [
   "xarray",
   "dask-sql",
+  "dask-expr",
 ]
 
 [project.optional-dependencies]

From 2edab31e7ad904c0acd22a654f2363540b69c7ff Mon Sep 17 00:00:00 2001
From: Alex Merose
Date: Sat, 13 Apr 2024 09:47:44 +0400
Subject: [PATCH 15/16] Turning off dask-expr for dask-sql to work.

---
 demo/sst.py      |  2 ++
 pyproject.toml   | 11 +++++++----
 xarray_sql/df.py |  2 +-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/demo/sst.py b/demo/sst.py
index c179478..2da54b0 100755
--- a/demo/sst.py
+++ b/demo/sst.py
@@ -3,6 +3,7 @@
 
 Please run the following to set up cloud resources:
 ```
+pip install ".[demo]"
 gcloud auth application-default login
 coiled login
 coiled setup gcp --region us-central1
@@ -62,6 +63,7 @@ def rand_wx(times) -> xr.Dataset:
   )
 
 
+# TODO(alxmrs): Make spot instances a flag.
 parser = argparse.ArgumentParser()
 parser.add_argument('--timeframe', choices=TIMEFRAMES.keys(), default='day')
 parser.add_argument(
diff --git a/pyproject.toml b/pyproject.toml
index 24b8d4f..3d35110 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,22 +28,25 @@ classifiers = [
 dependencies = [
   "xarray",
   "dask-sql",
-  "dask-expr",
 ]
 
 [project.optional-dependencies]
-test = [
-  "pytest",
+io = [
   "xarray[io]",
   "gcsfs",
 ]
+test = [
+  "xarray_sql[io]",
+  "pytest",
+]
 dev = [
   "xarray_sql[test]",
   "pyink",
   "py-spy"
 ]
 demo = [
-  "coiled"
+  "xarray_sql[io]",
+  "coiled",
 ]
diff --git a/xarray_sql/df.py b/xarray_sql/df.py
index 1844328..a2c594c 100644
--- a/xarray_sql/df.py
+++ b/xarray_sql/df.py
@@ -15,7 +15,7 @@
 
 # Turn on Dask-Expr
 dask.config.set({'dataframe.query-planning-warning': False})
-dask.config.set({'dataframe.query-planning': True})
+dask.config.set({'dataframe.query-planning': False})
 
 # Turn on Copy-On-Write (needs Pandas 2.0).
 pd.options.mode.copy_on_write = True

From da8084f0210142cee476b0a364ad574abc35524c Mon Sep 17 00:00:00 2001
From: Alex Merose
Date: Sun, 9 Jun 2024 15:17:50 +0200
Subject: [PATCH 16/16] Reformatted sst script.

---
 demo/sst.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/demo/sst.py b/demo/sst.py
index 2da54b0..a0f16d3 100755
--- a/demo/sst.py
+++ b/demo/sst.py
@@ -96,9 +96,9 @@ def rand_wx(times) -> xr.Dataset:
   from coiled import Cluster
 
   cluster = Cluster(
-      region='us-central1',
-      spot_policy='spot_with_fallback',
-      arm=True,
+    region='us-central1',
+    spot_policy='spot_with_fallback',
+    arm=True,
   )
 
   client = cluster.get_client()
@@ -130,10 +130,7 @@ def rand_wx(times) -> xr.Dataset:
 
 print('dataset opened.')
 
-era5_sst_ds = era5_ds[['sea_surface_temperature']].sel(
-    time=timeframe
-)
+era5_sst_ds = era5_ds[['sea_surface_temperature']].sel(time=timeframe)
 
 print(f'sst_size={era5_sst_ds.nbytes / 2**30:.5f} GiBs')
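
A note on the `dataframe.query-planning` flip in PATCH 15: setting it to
`False` makes Dask fall back to its legacy DataFrame implementation instead
of dask-expr, which is the code path dask-sql worked with here. The option is
read when `dask.dataframe` is first imported, so it must be set before that
import, as in this sketch (assuming a 2024-era Dask where the option is still
honored):

```
import dask

# Must run before the first `import dask.dataframe`, which is when the
# query-planning option is read.
dask.config.set({'dataframe.query-planning': False})

import dask.dataframe as dd

# The legacy (non-dask-expr) DataFrame implementation is now active.
print(dask.config.get('dataframe.query-planning'))  # False
```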