From b22f036b18eae2e0a485707b9f783d8e8d3bf696 Mon Sep 17 00:00:00 2001 From: Connor Lane Date: Wed, 1 May 2024 16:14:21 -0400 Subject: [PATCH] Make extracting sidecar metadata optional (#33) * Make extracting sidecar metadata optional. In datasets with large sidecar json metadata, extracting metadata can take up >90% of run time. * Add option to extract sidecar metadata after table --- bids2table/__main__.py | 11 +- bids2table/_b2t.py | 9 +- bids2table/extractors/bids.py | 13 +- bids2table/table.py | 11 ++ example/example.ipynb | 243 ++++++++++++++++++++++++++++++---- tests/test_bids2table.py | 17 ++- tests/test_table.py | 18 +++ 7 files changed, 286 insertions(+), 36 deletions(-) diff --git a/bids2table/__main__.py b/bids2table/__main__.py index fc6e7bd..d8791b3 100644 --- a/bids2table/__main__.py +++ b/bids2table/__main__.py @@ -47,11 +47,18 @@ def main(): "Incompatible with --overwrite. (default: None)", default=None, ) - parser.add_argument("--verbose", "-v", help="Verbose logging.", action="store_true") + parser.add_argument( + "--verbose", + "-v", + action="count", + default=0, + help="Increase verbosity level.", + ) args = parser.parse_args() - setup_logging(level="INFO" if args.verbose else "WARNING") + log_level = ["ERROR", "WARNING", "INFO"][min(args.verbose, 2)] + setup_logging(level=log_level) bids2table( root=args.root, diff --git a/bids2table/_b2t.py b/bids2table/_b2t.py index f0f2bc1..1749245 100644 --- a/bids2table/_b2t.py +++ b/bids2table/_b2t.py @@ -1,4 +1,5 @@ import logging +from functools import partial from pathlib import Path from typing import Optional @@ -15,6 +16,7 @@ def bids2table( root: StrOrPath, *, + with_meta: bool = True, persistent: bool = False, index_path: Optional[StrOrPath] = None, incremental: bool = False, @@ -28,6 +30,8 @@ def bids2table( Args: root: path to BIDS dataset + with_meta: extract JSON sidecar metadata. Excluding metadata can result in much + faster indexing. 
persistent: whether to save index to disk as a Parquet dataset index_path: path to BIDS Parquet index to generate or load. Defaults to `root / "index.b2t"`. Index generation requires `persistent=True`. @@ -60,6 +64,7 @@ def bids2table( dirs_only=True, follow_links=True, ) + extract = partial(extract_bids_subdir, with_meta=with_meta) if index_path is None: index_path = root / "index.b2t" @@ -80,7 +85,7 @@ def bids2table( logger.info("Building index in memory") df = build_table( source=source, - extract=extract_bids_subdir, + extract=extract, workers=workers, worker_id=worker_id, ) @@ -90,7 +95,7 @@ def bids2table( logger.info("Building persistent Parquet index") build_parquet( source=source, - extract=extract_bids_subdir, + extract=extract, output=index_path, incremental=incremental, overwrite=overwrite, diff --git a/bids2table/extractors/bids.py b/bids2table/extractors/bids.py index e7b4719..40e0fbb 100644 --- a/bids2table/extractors/bids.py +++ b/bids2table/extractors/bids.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) -def extract_bids_file(path: StrOrPath) -> Optional[Record]: +def extract_bids_file(path: StrOrPath, with_meta: bool = True) -> Optional[Record]: """ Extract BIDS entities and metadata from a data file in a BIDS dataset. """ @@ -31,19 +31,24 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]: return None dset_rec = extract_dataset(path) - meta_rec = extract_metadata(path) + if with_meta: + meta_rec = extract_metadata(path) + else: + meta_rec = Record({"json": None}, types={"json": "json"}) file_rec = extract_file_meta(path) rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "finfo": file_rec}) return rec -def extract_bids_subdir(path: StrOrPath) -> Generator[Optional[Record], None, None]: +def extract_bids_subdir( + path: StrOrPath, with_meta: bool = True +) -> Generator[Optional[Record], None, None]: """ Extract BIDS records recursively for all files in a sub-directory. 
""" for path in iglob(str(Path(path) / "**"), recursive=True): - yield extract_bids_file(path) + yield extract_bids_file(path, with_meta=with_meta) def is_bids_file(path: StrOrPath) -> bool: diff --git a/bids2table/table.py b/bids2table/table.py index d4066eb..14b4678 100644 --- a/bids2table/table.py +++ b/bids2table/table.py @@ -6,6 +6,7 @@ import pandas as pd from bids2table.entities import ENTITY_NAMES_TO_KEYS, BIDSEntities +from bids2table.extractors.metadata import extract_metadata class BIDSTable(pd.DataFrame): @@ -269,6 +270,16 @@ def add_prefix(k: str): return self return out + def with_meta(self, inplace: bool = False) -> "BIDSTable": + """ + Returns a new BIDS table complete with JSON sidecar metadata. + """ + out = self if inplace else self.copy() + file_paths = out.finfo["file_path"] + meta_json = file_paths.apply(lambda path: extract_metadata(path)["json"]) + out.loc[:, "meta__json"] = meta_json + return out + @classmethod def from_df(cls, df: pd.DataFrame) -> "BIDSTable": """ diff --git a/example/example.ipynb b/example/example.ipynb index d269258..82fac1e 100644 --- a/example/example.ipynb +++ b/example/example.ipynb @@ -6,6 +6,8 @@ "metadata": {}, "outputs": [], "source": [ + "import json\n", + "\n", "import pandas as pd\n", "\n", "from bids2table import bids2table" @@ -42,10 +44,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "193it [00:00, 318.09it/s, tot=193, good=193, rec=2386, err=0]\n", - "172it [00:00, 288.23it/s, tot=172, good=172, rec=2240, err=0]\n", - "202it [00:00, 287.97it/s, tot=202, good=202, rec=2828, err=0]\n", - "213it [00:00, 300.22it/s, tot=213, good=213, rec=2812, err=0]\n" + "193it [00:00, 308.38it/s, tot=193, good=193, rec=2386, err=0]\n", + "172it [00:00, 284.34it/s, tot=172, good=172, rec=2240, err=0]\n", + "202it [00:00, 284.34it/s, tot=202, good=202, rec=2828, err=0]\n", + "213it [00:00, 295.75it/s, tot=213, good=213, rec=2812, err=0]\n" ] } ], @@ -82,10 +84,10 @@ "output_type": "stream", "text": [ "total 
1608\n", - "-rw------- 1 clane staff 197K Aug 9 06:17 part-20230809061750-0002-of-0004.parquet\n", - "-rw------- 1 clane staff 240K Aug 9 06:17 part-20230809061750-0003-of-0004.parquet\n", - "-rw------- 1 clane staff 167K Aug 9 06:17 part-20230809061750-0000-of-0004.parquet\n", - "-rw------- 1 clane staff 194K Aug 9 06:17 part-20230809061750-0001-of-0004.parquet\n" + "-rw-------@ 1 clane staff 197K May 1 16:00 part-20240501160029-0002-of-0004.parquet\n", + "-rw-------@ 1 clane staff 240K May 1 16:00 part-20240501160029-0003-of-0004.parquet\n", + "-rw-------@ 1 clane staff 167K May 1 16:00 part-20240501160029-0000-of-0004.parquet\n", + "-rw-------@ 1 clane staff 194K May 1 16:00 part-20240501160029-0001-of-0004.parquet\n" ] } ], @@ -2635,6 +2637,28 @@ "filtered" ] }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "synthetic/derivatives/fmriprep/sub-04/ses-01/func/sub-04_ses-01_task-rest_space-MNI152NLin2009cAsym_desc-preproc_bold.nii\n", + "synthetic/derivatives/fmriprep/sub-04/ses-01/func/sub-04_ses-01_task-rest_space-T1w_desc-preproc_bold.nii\n", + "synthetic/derivatives/fmriprep/sub-04/ses-02/func/sub-04_ses-02_task-rest_space-MNI152NLin2009cAsym_desc-preproc_bold.nii\n", + "synthetic/derivatives/fmriprep/sub-04/ses-02/func/sub-04_ses-02_task-rest_space-T1w_desc-preproc_bold.nii\n", + "synthetic/sub-04/ses-01/func/sub-04_ses-01_task-rest_bold.nii\n", + "synthetic/sub-04/ses-02/func/sub-04_ses-02_task-rest_bold.nii\n" + ] + } + ], + "source": [ + "print(\"\\n\".join(sorted([str(f.path.relative_to(\"/Users/clane/Projects/B2T/bids2table/bids-examples/\")) for f in filtered.files])))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2644,7 +2668,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -2676,7 +2700,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, 
"metadata": {}, "outputs": [ { @@ -2695,7 +2719,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -2716,6 +2740,177 @@ "print(\"File paths:\\n\", \"\\n\".join([str(f.relative_path) for f in files]), sep=\"\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Skipping metadata\n", + "\n", + "Extracting JSON sidecar metadata can often be the most time-consuming step of the indexing process. By setting `with_meta=False`, `bids2table` can skip this expensive up-front processing. Here we index without metadata and get a small speedup. " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "780it [00:02, 319.48it/s, tot=780, good=780, rec=10266, err=0]\n" + ] + } + ], + "source": [ + "tab_no_meta = bids2table(root=\"../bids-examples\", with_meta=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to extract metadata for a subset of the files after the fact, you can use the `BIDSTable.with_meta` method." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TaskNameManufacturerManufacturersModelNameImageTypeAcquisitionTimeAcquisitionDateMagneticFieldStrengthFlipAngleEchoTimeRepetitionTimeEffectiveEchoSpacingSliceTimingPhaseEncodingDirectionCogAtlasIDSliceEncodingDirectionStartTimeSamplingFrequencyColumnsSources
913RestSiemensSkyra[ORIGINAL, PRIMARY, M, MB, ND, MOSAI]192106.6820180511.03.051.00.04240.7350.00064[0, 0.09, 0.18, 0.2675, 0.3575, 0.4475, 0.5375...j-NaNNaNNaNNaNNaNNaN
993RestSiemensSkyra[ORIGINAL, PRIMARY, M, MB, ND, MOSAI]192106.6820180511.03.051.00.04240.7350.00064[0, 0.09, 0.18, 0.2675, 0.3575, 0.4475, 0.5375...j-NaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " TaskName Manufacturer ManufacturersModelName \\\n", + "913 Rest Siemens Skyra \n", + "993 Rest Siemens Skyra \n", + "\n", + " ImageType AcquisitionTime AcquisitionDate \\\n", + "913 [ORIGINAL, PRIMARY, M, MB, ND, MOSAI] 192106.68 20180511.0 \n", + "993 [ORIGINAL, PRIMARY, M, MB, ND, MOSAI] 192106.68 20180511.0 \n", + "\n", + " MagneticFieldStrength FlipAngle EchoTime RepetitionTime \\\n", + "913 3.0 51.0 0.0424 0.735 \n", + "993 3.0 51.0 0.0424 0.735 \n", + "\n", + " EffectiveEchoSpacing SliceTiming \\\n", + "913 0.00064 [0, 0.09, 0.18, 0.2675, 0.3575, 0.4475, 0.5375... \n", + "993 0.00064 [0, 0.09, 0.18, 0.2675, 0.3575, 0.4475, 0.5375... \n", + "\n", + " PhaseEncodingDirection CogAtlasID SliceEncodingDirection StartTime \\\n", + "913 j- NaN NaN NaN \n", + "993 j- NaN NaN NaN \n", + "\n", + " SamplingFrequency Columns Sources \n", + "913 NaN NaN NaN \n", + "993 NaN NaN NaN " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_no_meta = (\n", + " tab_no_meta\n", + " .filter(\"task\", contains=\"rest\")\n", + " .filter(\"sub\", items=[\"04\", \"08\"])\n", + ")\n", + "\n", + "filtered_with_meta = filtered_no_meta.with_meta()\n", + "filtered_with_meta.flat_meta.head(2)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2737,7 +2932,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -2778,7 +2973,7 @@ "dtype: int64" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2798,7 +2993,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -2812,7 +3007,7 @@ "dtype: int64" ] }, - "execution_count": 18, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2833,7 +3028,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "metadata": {}, 
"outputs": [ { @@ -3416,7 +3611,7 @@ "synthetic/derivatives/fmriprep 150 60" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -3439,7 +3634,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -3468,7 +3663,7 @@ " Optional worker ID to use when scheduling parallel\n", " tasks externally. Incompatible with --overwrite.\n", " (default: None)\n", - " --verbose, -v Verbose logging.\n" + " --verbose, -v Increase verbosity level.\n" ] } ], @@ -3486,17 +3681,17 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "172it [00:00, 327.64it/s, tot=172, good=172, rec=2240, err=0]\n", - "193it [00:00, 349.64it/s, tot=193, good=193, rec=2386, err=0]\n", - "213it [00:00, 333.32it/s, tot=213, good=213, rec=2812, err=0]\n", - "202it [00:00, 315.47it/s, tot=202, good=202, rec=2828, err=0]\n" + "172it [00:00, 296.08it/s, tot=172, good=172, rec=2240, err=0]\n", + "193it [00:00, 314.65it/s, tot=193, good=193, rec=2386, err=0]\n", + "202it [00:00, 288.84it/s, tot=202, good=202, rec=2828, err=0]\n", + "213it [00:00, 301.57it/s, tot=213, good=213, rec=2812, err=0]\n" ] } ], diff --git a/tests/test_bids2table.py b/tests/test_bids2table.py index 8ef1607..4f58642 100644 --- a/tests/test_bids2table.py +++ b/tests/test_bids2table.py @@ -25,16 +25,25 @@ def empty_dataset(tmp_path: Path) -> Path: return root -@pytest.mark.parametrize("persistent", [False, True]) -def test_bids2table(tmp_path: Path, persistent: bool): +@pytest.mark.parametrize( + "persistent,with_meta", [(False, True), (True, True), (False, False)] +) +def test_bids2table(tmp_path: Path, persistent: bool, with_meta: bool): root = BIDS_EXAMPLES / "ds001" index_path = tmp_path / "index.b2t" - tab = bids2table(root=root, persistent=persistent, index_path=index_path) + tab = bids2table( + root=root, 
with_meta=with_meta, persistent=persistent, index_path=index_path + ) assert tab.shape == (128, 40) + if not with_meta: + assert tab.loc[0, "meta__json"] is None + # Reload from cache - tab2 = bids2table(root=root, persistent=persistent, index_path=index_path) + tab2 = bids2table( + root=root, with_meta=with_meta, persistent=persistent, index_path=index_path + ) assert tab.equals(tab2) diff --git a/tests/test_table.py b/tests/test_table.py index 5a665ea..6b9a0a6 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -23,6 +23,14 @@ def tab() -> BIDSTable: return tab +@pytest.fixture(scope="module") +def tab_no_meta() -> BIDSTable: + tab = bids2table(BIDS_EXAMPLES / "ds001", with_meta=False) + # sort rows to get deterministic order + tab = tab.sort_values("finfo__file_path", ignore_index=True) + return tab + + def test_table(tab: BIDSTable): assert tab.shape == (128, 40) @@ -116,6 +124,16 @@ def test_table_sort_entities(tab: BIDSTable, by: Union[str, List[str]], inplace: assert sort_tab.subjects == sorted(tab.subjects) +def test_table_with_meta(tab_no_meta: BIDSTable): + tab_no_meta = tab_no_meta.copy() + tab_with_meta = tab_no_meta.with_meta(inplace=False) + assert tab_no_meta["meta__json"].isna().all() + assert not tab_with_meta["meta__json"].isna().all() + + tab_with_meta = tab_no_meta.with_meta(inplace=True) + assert not tab_no_meta["meta__json"].isna().all() + + @pytest.mark.parametrize("sep", ["__", "."]) def test_flat_to_multi_columns(sep: str): df = pd.DataFrame(