From b22f036b18eae2e0a485707b9f783d8e8d3bf696 Mon Sep 17 00:00:00 2001
From: Connor Lane <connor.lane858@gmail.com>
Date: Wed, 1 May 2024 16:14:21 -0400
Subject: [PATCH] Make extracting sidecar metadata optional (#33)

* Make extracting sidecar metadata optional.

In datasets with large JSON sidecar metadata, extracting metadata can
take up >90% of run time.

* Add option to extract sidecar metadata after the table is built
---
 bids2table/__main__.py        |  11 +-
 bids2table/_b2t.py            |   9 +-
 bids2table/extractors/bids.py |  13 +-
 bids2table/table.py           |  11 ++
 example/example.ipynb         | 243 ++++++++++++++++++++++++++++++----
 tests/test_bids2table.py      |  17 ++-
 tests/test_table.py           |  18 +++
 7 files changed, 286 insertions(+), 36 deletions(-)

diff --git a/bids2table/__main__.py b/bids2table/__main__.py
index fc6e7bd..d8791b3 100644
--- a/bids2table/__main__.py
+++ b/bids2table/__main__.py
@@ -47,11 +47,18 @@ def main():
         "Incompatible with --overwrite. (default: None)",
         default=None,
     )
-    parser.add_argument("--verbose", "-v", help="Verbose logging.", action="store_true")
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="count",
+        default=0,
+        help="Increase verbosity level.",
+    )
 
     args = parser.parse_args()
 
-    setup_logging(level="INFO" if args.verbose else "WARNING")
+    log_level = ["ERROR", "WARNING", "INFO"][min(args.verbose, 2)]
+    setup_logging(level=log_level)
 
     bids2table(
         root=args.root,
diff --git a/bids2table/_b2t.py b/bids2table/_b2t.py
index f0f2bc1..1749245 100644
--- a/bids2table/_b2t.py
+++ b/bids2table/_b2t.py
@@ -1,4 +1,5 @@
 import logging
+from functools import partial
 from pathlib import Path
 from typing import Optional
 
@@ -15,6 +16,7 @@
 def bids2table(
     root: StrOrPath,
     *,
+    with_meta: bool = True,
     persistent: bool = False,
     index_path: Optional[StrOrPath] = None,
     incremental: bool = False,
@@ -28,6 +30,8 @@ def bids2table(
 
     Args:
         root: path to BIDS dataset
+        with_meta: extract JSON sidecar metadata. Excluding metadata can result in much
+            faster indexing.
         persistent: whether to save index to disk as a Parquet dataset
         index_path: path to BIDS Parquet index to generate or load. Defaults to `root /
             "index.b2t"`. Index generation requires `persistent=True`.
@@ -60,6 +64,7 @@ def bids2table(
         dirs_only=True,
         follow_links=True,
     )
+    extract = partial(extract_bids_subdir, with_meta=with_meta)
 
     if index_path is None:
         index_path = root / "index.b2t"
@@ -80,7 +85,7 @@ def bids2table(
         logger.info("Building index in memory")
         df = build_table(
             source=source,
-            extract=extract_bids_subdir,
+            extract=extract,
             workers=workers,
             worker_id=worker_id,
         )
@@ -90,7 +95,7 @@ def bids2table(
     logger.info("Building persistent Parquet index")
     build_parquet(
         source=source,
-        extract=extract_bids_subdir,
+        extract=extract,
         output=index_path,
         incremental=incremental,
         overwrite=overwrite,
diff --git a/bids2table/extractors/bids.py b/bids2table/extractors/bids.py
index e7b4719..40e0fbb 100644
--- a/bids2table/extractors/bids.py
+++ b/bids2table/extractors/bids.py
@@ -15,7 +15,7 @@
 logger = logging.getLogger(__name__)
 
 
-def extract_bids_file(path: StrOrPath) -> Optional[Record]:
+def extract_bids_file(path: StrOrPath, with_meta: bool = True) -> Optional[Record]:
     """
     Extract BIDS entities and metadata from a data file in a BIDS dataset.
     """
@@ -31,19 +31,24 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]:
         return None
 
     dset_rec = extract_dataset(path)
-    meta_rec = extract_metadata(path)
+    if with_meta:
+        meta_rec = extract_metadata(path)
+    else:
+        meta_rec = Record({"json": None}, types={"json": "json"})
     file_rec = extract_file_meta(path)
 
     rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "finfo": file_rec})
     return rec
 
 
-def extract_bids_subdir(path: StrOrPath) -> Generator[Optional[Record], None, None]:
+def extract_bids_subdir(
+    path: StrOrPath, with_meta: bool = True
+) -> Generator[Optional[Record], None, None]:
     """
     Extract BIDS records recursively for all files in a sub-directory.
     """
     for path in iglob(str(Path(path) / "**"), recursive=True):
-        yield extract_bids_file(path)
+        yield extract_bids_file(path, with_meta=with_meta)
 
 
 def is_bids_file(path: StrOrPath) -> bool:
diff --git a/bids2table/table.py b/bids2table/table.py
index d4066eb..14b4678 100644
--- a/bids2table/table.py
+++ b/bids2table/table.py
@@ -6,6 +6,7 @@
 import pandas as pd
 
 from bids2table.entities import ENTITY_NAMES_TO_KEYS, BIDSEntities
+from bids2table.extractors.metadata import extract_metadata
 
 
 class BIDSTable(pd.DataFrame):
@@ -269,6 +270,16 @@ def add_prefix(k: str):
             return self
         return out
 
+    def with_meta(self, inplace: bool = False) -> "BIDSTable":
+        """
+        Return this table with JSON sidecar metadata added (a copy unless `inplace`).
+        """
+        out = self if inplace else self.copy()
+        file_paths = out.finfo["file_path"]
+        meta_json = file_paths.apply(lambda path: extract_metadata(path)["json"])
+        out.loc[:, "meta__json"] = meta_json
+        return out
+
     @classmethod
     def from_df(cls, df: pd.DataFrame) -> "BIDSTable":
         """
diff --git a/example/example.ipynb b/example/example.ipynb
index d269258..82fac1e 100644
--- a/example/example.ipynb
+++ b/example/example.ipynb
@@ -6,6 +6,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import json\n",
+    "\n",
     "import pandas as pd\n",
     "\n",
     "from bids2table import bids2table"
@@ -42,10 +44,10 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "193it [00:00, 318.09it/s, tot=193, good=193, rec=2386, err=0]\n",
-      "172it [00:00, 288.23it/s, tot=172, good=172, rec=2240, err=0]\n",
-      "202it [00:00, 287.97it/s, tot=202, good=202, rec=2828, err=0]\n",
-      "213it [00:00, 300.22it/s, tot=213, good=213, rec=2812, err=0]\n"
+      "193it [00:00, 308.38it/s, tot=193, good=193, rec=2386, err=0]\n",
+      "172it [00:00, 284.34it/s, tot=172, good=172, rec=2240, err=0]\n",
+      "202it [00:00, 284.34it/s, tot=202, good=202, rec=2828, err=0]\n",
+      "213it [00:00, 295.75it/s, tot=213, good=213, rec=2812, err=0]\n"
      ]
     }
    ],
@@ -82,10 +84,10 @@
      "output_type": "stream",
      "text": [
       "total 1608\n",
-      "-rw-------  1 clane  staff   197K Aug  9 06:17 part-20230809061750-0002-of-0004.parquet\n",
-      "-rw-------  1 clane  staff   240K Aug  9 06:17 part-20230809061750-0003-of-0004.parquet\n",
-      "-rw-------  1 clane  staff   167K Aug  9 06:17 part-20230809061750-0000-of-0004.parquet\n",
-      "-rw-------  1 clane  staff   194K Aug  9 06:17 part-20230809061750-0001-of-0004.parquet\n"
+      "-rw-------@ 1 clane  staff   197K May  1 16:00 part-20240501160029-0002-of-0004.parquet\n",
+      "-rw-------@ 1 clane  staff   240K May  1 16:00 part-20240501160029-0003-of-0004.parquet\n",
+      "-rw-------@ 1 clane  staff   167K May  1 16:00 part-20240501160029-0000-of-0004.parquet\n",
+      "-rw-------@ 1 clane  staff   194K May  1 16:00 part-20240501160029-0001-of-0004.parquet\n"
      ]
     }
    ],
@@ -2635,6 +2637,28 @@
     "filtered"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "synthetic/derivatives/fmriprep/sub-04/ses-01/func/sub-04_ses-01_task-rest_space-MNI152NLin2009cAsym_desc-preproc_bold.nii\n",
+      "synthetic/derivatives/fmriprep/sub-04/ses-01/func/sub-04_ses-01_task-rest_space-T1w_desc-preproc_bold.nii\n",
+      "synthetic/derivatives/fmriprep/sub-04/ses-02/func/sub-04_ses-02_task-rest_space-MNI152NLin2009cAsym_desc-preproc_bold.nii\n",
+      "synthetic/derivatives/fmriprep/sub-04/ses-02/func/sub-04_ses-02_task-rest_space-T1w_desc-preproc_bold.nii\n",
+      "synthetic/sub-04/ses-01/func/sub-04_ses-01_task-rest_bold.nii\n",
+      "synthetic/sub-04/ses-02/func/sub-04_ses-02_task-rest_bold.nii\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"\\n\".join(sorted([str(f.path.relative_to(\"/Users/clane/Projects/B2T/bids2table/bids-examples/\")) for f in filtered.files])))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -2644,7 +2668,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -2676,7 +2700,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
@@ -2695,7 +2719,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -2716,6 +2740,177 @@
     "print(\"File paths:\\n\", \"\\n\".join([str(f.relative_path) for f in files]), sep=\"\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Skipping metadata\n",
+    "\n",
+    "Extracting JSON sidecar metadata can often be the most time-consuming step of the indexing process. By setting `with_meta=False`, `bids2table` can skip this expensive up-front processing. Here we index without metadata and get a small speedup."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "780it [00:02, 319.48it/s, tot=780, good=780, rec=10266, err=0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "tab_no_meta = bids2table(root=\"../bids-examples\", with_meta=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you want to extract metadata for a subset of the files after the fact, you can use the `BIDSTable.with_meta` method."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>TaskName</th>\n",
+       "      <th>Manufacturer</th>\n",
+       "      <th>ManufacturersModelName</th>\n",
+       "      <th>ImageType</th>\n",
+       "      <th>AcquisitionTime</th>\n",
+       "      <th>AcquisitionDate</th>\n",
+       "      <th>MagneticFieldStrength</th>\n",
+       "      <th>FlipAngle</th>\n",
+       "      <th>EchoTime</th>\n",
+       "      <th>RepetitionTime</th>\n",
+       "      <th>EffectiveEchoSpacing</th>\n",
+       "      <th>SliceTiming</th>\n",
+       "      <th>PhaseEncodingDirection</th>\n",
+       "      <th>CogAtlasID</th>\n",
+       "      <th>SliceEncodingDirection</th>\n",
+       "      <th>StartTime</th>\n",
+       "      <th>SamplingFrequency</th>\n",
+       "      <th>Columns</th>\n",
+       "      <th>Sources</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>913</th>\n",
+       "      <td>Rest</td>\n",
+       "      <td>Siemens</td>\n",
+       "      <td>Skyra</td>\n",
+       "      <td>[ORIGINAL, PRIMARY, M, MB, ND, MOSAI]</td>\n",
+       "      <td>192106.68</td>\n",
+       "      <td>20180511.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>51.0</td>\n",
+       "      <td>0.0424</td>\n",
+       "      <td>0.735</td>\n",
+       "      <td>0.00064</td>\n",
+       "      <td>[0, 0.09, 0.18, 0.2675, 0.3575, 0.4475, 0.5375...</td>\n",
+       "      <td>j-</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>993</th>\n",
+       "      <td>Rest</td>\n",
+       "      <td>Siemens</td>\n",
+       "      <td>Skyra</td>\n",
+       "      <td>[ORIGINAL, PRIMARY, M, MB, ND, MOSAI]</td>\n",
+       "      <td>192106.68</td>\n",
+       "      <td>20180511.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>51.0</td>\n",
+       "      <td>0.0424</td>\n",
+       "      <td>0.735</td>\n",
+       "      <td>0.00064</td>\n",
+       "      <td>[0, 0.09, 0.18, 0.2675, 0.3575, 0.4475, 0.5375...</td>\n",
+       "      <td>j-</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    TaskName Manufacturer ManufacturersModelName  \\\n",
+       "913     Rest      Siemens                  Skyra   \n",
+       "993     Rest      Siemens                  Skyra   \n",
+       "\n",
+       "                                 ImageType  AcquisitionTime  AcquisitionDate  \\\n",
+       "913  [ORIGINAL, PRIMARY, M, MB, ND, MOSAI]        192106.68       20180511.0   \n",
+       "993  [ORIGINAL, PRIMARY, M, MB, ND, MOSAI]        192106.68       20180511.0   \n",
+       "\n",
+       "     MagneticFieldStrength  FlipAngle  EchoTime  RepetitionTime  \\\n",
+       "913                    3.0       51.0    0.0424           0.735   \n",
+       "993                    3.0       51.0    0.0424           0.735   \n",
+       "\n",
+       "     EffectiveEchoSpacing                                        SliceTiming  \\\n",
+       "913               0.00064  [0, 0.09, 0.18, 0.2675, 0.3575, 0.4475, 0.5375...   \n",
+       "993               0.00064  [0, 0.09, 0.18, 0.2675, 0.3575, 0.4475, 0.5375...   \n",
+       "\n",
+       "    PhaseEncodingDirection CogAtlasID SliceEncodingDirection  StartTime  \\\n",
+       "913                     j-        NaN                    NaN        NaN   \n",
+       "993                     j-        NaN                    NaN        NaN   \n",
+       "\n",
+       "     SamplingFrequency Columns Sources  \n",
+       "913                NaN     NaN     NaN  \n",
+       "993                NaN     NaN     NaN  "
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filtered_no_meta = (\n",
+    "    tab_no_meta\n",
+    "    .filter(\"task\", contains=\"rest\")\n",
+    "    .filter(\"sub\", items=[\"04\", \"08\"])\n",
+    ")\n",
+    "\n",
+    "filtered_with_meta = filtered_no_meta.with_meta()\n",
+    "filtered_with_meta.flat_meta.head(2)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -2737,7 +2932,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -2778,7 +2973,7 @@
        "dtype: int64"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2798,7 +2993,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -2812,7 +3007,7 @@
        "dtype: int64"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2833,7 +3028,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
@@ -3416,7 +3611,7 @@
        "synthetic/derivatives/fmriprep                150    60"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3439,7 +3634,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -3468,7 +3663,7 @@
       "                        Optional worker ID to use when scheduling parallel\n",
       "                        tasks externally. Incompatible with --overwrite.\n",
       "                        (default: None)\n",
-      "  --verbose, -v         Verbose logging.\n"
+      "  --verbose, -v         Increase verbosity level.\n"
      ]
     }
    ],
@@ -3486,17 +3681,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "172it [00:00, 327.64it/s, tot=172, good=172, rec=2240, err=0]\n",
-      "193it [00:00, 349.64it/s, tot=193, good=193, rec=2386, err=0]\n",
-      "213it [00:00, 333.32it/s, tot=213, good=213, rec=2812, err=0]\n",
-      "202it [00:00, 315.47it/s, tot=202, good=202, rec=2828, err=0]\n"
+      "172it [00:00, 296.08it/s, tot=172, good=172, rec=2240, err=0]\n",
+      "193it [00:00, 314.65it/s, tot=193, good=193, rec=2386, err=0]\n",
+      "202it [00:00, 288.84it/s, tot=202, good=202, rec=2828, err=0]\n",
+      "213it [00:00, 301.57it/s, tot=213, good=213, rec=2812, err=0]\n"
      ]
     }
    ],
diff --git a/tests/test_bids2table.py b/tests/test_bids2table.py
index 8ef1607..4f58642 100644
--- a/tests/test_bids2table.py
+++ b/tests/test_bids2table.py
@@ -25,16 +25,25 @@ def empty_dataset(tmp_path: Path) -> Path:
     return root
 
 
-@pytest.mark.parametrize("persistent", [False, True])
-def test_bids2table(tmp_path: Path, persistent: bool):
+@pytest.mark.parametrize(
+    "persistent,with_meta", [(False, True), (True, True), (False, False)]
+)
+def test_bids2table(tmp_path: Path, persistent: bool, with_meta: bool):
     root = BIDS_EXAMPLES / "ds001"
     index_path = tmp_path / "index.b2t"
 
-    tab = bids2table(root=root, persistent=persistent, index_path=index_path)
+    tab = bids2table(
+        root=root, with_meta=with_meta, persistent=persistent, index_path=index_path
+    )
     assert tab.shape == (128, 40)
 
+    if not with_meta:
+        assert tab.loc[0, "meta__json"] is None
+
     # Reload from cache
-    tab2 = bids2table(root=root, persistent=persistent, index_path=index_path)
+    tab2 = bids2table(
+        root=root, with_meta=with_meta, persistent=persistent, index_path=index_path
+    )
     assert tab.equals(tab2)
 
 
diff --git a/tests/test_table.py b/tests/test_table.py
index 5a665ea..6b9a0a6 100644
--- a/tests/test_table.py
+++ b/tests/test_table.py
@@ -23,6 +23,14 @@ def tab() -> BIDSTable:
     return tab
 
 
+@pytest.fixture(scope="module")
+def tab_no_meta() -> BIDSTable:
+    tab = bids2table(BIDS_EXAMPLES / "ds001", with_meta=False)
+    # sort rows to get deterministic order
+    tab = tab.sort_values("finfo__file_path", ignore_index=True)
+    return tab
+
+
 def test_table(tab: BIDSTable):
     assert tab.shape == (128, 40)
 
@@ -116,6 +124,16 @@ def test_table_sort_entities(tab: BIDSTable, by: Union[str, List[str]], inplace:
     assert sort_tab.subjects == sorted(tab.subjects)
 
 
+def test_table_with_meta(tab_no_meta: BIDSTable):
+    tab_no_meta = tab_no_meta.copy()
+    tab_with_meta = tab_no_meta.with_meta(inplace=False)
+    assert tab_no_meta["meta__json"].isna().all()
+    assert not tab_with_meta["meta__json"].isna().all()
+
+    tab_with_meta = tab_no_meta.with_meta(inplace=True)
+    assert not tab_no_meta["meta__json"].isna().all()
+
+
 @pytest.mark.parametrize("sep", ["__", "."])
 def test_flat_to_multi_columns(sep: str):
     df = pd.DataFrame(