ProjectPythia · norlandrhagen · Jul 18, 2023
diff --git a/notebooks/foundations/03_kerchunk_dask.ipynb b/notebooks/foundations/03_kerchunk_dask.ipynb
@@ -80,6 +80,7 @@
    "outputs": [],
    "source": [
     "import logging\n",
+    "import glob\n",
     "\n",
     "from distributed import Client\n",
     "\n",
@@ -135,6 +136,7 @@
     "import dask\n",
     "import fsspec\n",
     "import ujson\n",
+    "import s3fs\n",
     "import xarray as xr\n",
     "from kerchunk.combine import MultiZarrToZarr\n",
     "from kerchunk.hdf import SingleHdf5ToZarr"
@@ -182,7 +184,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "file_pattern = file_pattern[0:40]"
+    "file_pattern = file_pattern[:2]  # keep only the first two entries"
    ]
   },
   {
@@ -278,6 +280,69 @@
     "dask.compute(tasks)"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Combine .json `Kerchunk` reference files and write a combined `Kerchunk` index\n",
+    "\n",
+    "Next, we combine the individual Kerchunk reference files created above into a single reference file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Gather the per-file reference JSONs; sorting makes the input order deterministic\n",
+    "output_files = sorted(glob.glob(f\"{temp_dir}/*.json\"))\n",
+    "\n",
+    "# Combine the individual references into a single consolidated reference\n",
+    "mzz = MultiZarrToZarr(\n",
+    "    output_files,\n",
+    "    concat_dims=[\"Time\"],  # dimension along which the files are concatenated\n",
+    "    identical_dims=[\"XLAT\", \"XLON\", \"interp_levels\"],  # NOTE(review): confirm XLON is the actual variable name (WRF uses XLONG)\n",
+    ")\n",
+    "# Keep the translated reference in memory for later visualization\n",
+    "multi_kerchunk = mzz.translate()\n",
+    "\n",
+    "# Write the combined Kerchunk reference to disk as JSON\n",
+    "output_fname = \"combined.json\"\n",
+    "with open(output_fname, \"wb\") as f:\n",
+    "    f.write(ujson.dumps(multi_kerchunk).encode())"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load kerchunked dataset\n",
+    "\n",
+    "Once opened through the combined reference file, the dataset is a logical view over all of the files we scanned."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build an fsspec reference filesystem on top of the combined Kerchunk JSON;\n",
+    "# the remote side is configured as S3 with anonymous access (anon=True).\n",
+    "reference_options = dict(\n",
+    "    fo=\"combined.json\",\n",
+    "    remote_protocol=\"s3\",\n",
+    "    remote_options={\"anon\": True},\n",
+    "    skip_instance_cache=True,\n",
+    ")\n",
+    "fs = fsspec.filesystem(\"reference\", **reference_options)\n",
+    "m = fs.get_mapper(\"\")  # mapper rooted at the top of the reference filesystem\n",
+    "ds = xr.open_dataset(m, engine=\"zarr\")"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",
@@ -344,7 +409,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.0"
+   "version": "3.10.12"
   },
   "vscode": {
    "interpreter": {