diff --git a/bids2table/__main__.py b/bids2table/__main__.py index fc6e7bd..d8791b3 100644 --- a/bids2table/__main__.py +++ b/bids2table/__main__.py @@ -47,11 +47,18 @@ def main(): "Incompatible with --overwrite. (default: None)", default=None, ) - parser.add_argument("--verbose", "-v", help="Verbose logging.", action="store_true") + parser.add_argument( + "--verbose", + "-v", + action="count", + default=0, + help="Increase verbosity level.", + ) args = parser.parse_args() - setup_logging(level="INFO" if args.verbose else "WARNING") + log_level = ["ERROR", "WARNING", "INFO"][min(args.verbose, 2)] + setup_logging(level=log_level) bids2table( root=args.root, diff --git a/bids2table/_b2t.py b/bids2table/_b2t.py index f0f2bc1..1749245 100644 --- a/bids2table/_b2t.py +++ b/bids2table/_b2t.py @@ -1,4 +1,5 @@ import logging +from functools import partial from pathlib import Path from typing import Optional @@ -15,6 +16,7 @@ def bids2table( root: StrOrPath, *, + with_meta: bool = True, persistent: bool = False, index_path: Optional[StrOrPath] = None, incremental: bool = False, @@ -28,6 +30,8 @@ def bids2table( Args: root: path to BIDS dataset + with_meta: extract JSON sidecar metadata. Excluding metadata can result in much + faster indexing. persistent: whether to save index to disk as a Parquet dataset index_path: path to BIDS Parquet index to generate or load. Defaults to `root / "index.b2t"`. Index generation requires `persistent=True`. @@ -60,6 +64,7 @@ def bids2table( dirs_only=True, follow_links=True, ) + extract = partial(extract_bids_subdir, with_meta=with_meta) if index_path is None: index_path = root / "index.b2t" @@ -80,7 +85,7 @@ def bids2table( logger.info("Building index in memory") df = build_table( source=source, - extract=extract_bids_subdir, + extract=extract, workers=workers, worker_id=worker_id, ) @@ -90,7 +95,7 @@ def bids2table( logger.info("Building persistent Parquet index") build_parquet( source=source, - extract=extract_bids_subdir, + extract=extract, output=index_path, incremental=incremental, overwrite=overwrite, diff --git a/bids2table/extractors/bids.py b/bids2table/extractors/bids.py index e7b4719..40e0fbb 100644 --- a/bids2table/extractors/bids.py +++ b/bids2table/extractors/bids.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) -def extract_bids_file(path: StrOrPath) -> Optional[Record]: +def extract_bids_file(path: StrOrPath, with_meta: bool = True) -> Optional[Record]: """ Extract BIDS entities and metadata from a data file in a BIDS dataset. """ @@ -31,19 +31,24 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]: return None dset_rec = extract_dataset(path) - meta_rec = extract_metadata(path) + if with_meta: + meta_rec = extract_metadata(path) + else: + meta_rec = Record({"json": None}, types={"json": "json"}) file_rec = extract_file_meta(path) rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "finfo": file_rec}) return rec -def extract_bids_subdir(path: StrOrPath) -> Generator[Optional[Record], None, None]: +def extract_bids_subdir( + path: StrOrPath, with_meta: bool = True +) -> Generator[Optional[Record], None, None]: """ Extract BIDS records recursively for all files in a sub-directory. """ for path in iglob(str(Path(path) / "**"), recursive=True): - yield extract_bids_file(path) + yield extract_bids_file(path, with_meta=with_meta) def is_bids_file(path: StrOrPath) -> bool: diff --git a/bids2table/table.py b/bids2table/table.py index d4066eb..14b4678 100644 --- a/bids2table/table.py +++ b/bids2table/table.py @@ -6,6 +6,7 @@ import pandas as pd from bids2table.entities import ENTITY_NAMES_TO_KEYS, BIDSEntities +from bids2table.extractors.metadata import extract_metadata class BIDSTable(pd.DataFrame): @@ -269,6 +270,16 @@ def add_prefix(k: str): return self return out + def with_meta(self, inplace: bool = False) -> "BIDSTable": + """ + Returns a new BIDS table complete with JSON sidecar metadata. + """ + out = self if inplace else self.copy() + file_paths = out.finfo["file_path"] + meta_json = file_paths.apply(lambda path: extract_metadata(path)["json"]) + out.loc[:, "meta__json"] = meta_json + return out + @classmethod def from_df(cls, df: pd.DataFrame) -> "BIDSTable": """ diff --git a/example/example.ipynb b/example/example.ipynb index d269258..82fac1e 100644 --- a/example/example.ipynb +++ b/example/example.ipynb @@ -6,6 +6,8 @@ "metadata": {}, "outputs": [], "source": [ + "import json\n", + "\n", "import pandas as pd\n", "\n", "from bids2table import bids2table" @@ -42,10 +44,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "193it [00:00, 318.09it/s, tot=193, good=193, rec=2386, err=0]\n", - "172it [00:00, 288.23it/s, tot=172, good=172, rec=2240, err=0]\n", - "202it [00:00, 287.97it/s, tot=202, good=202, rec=2828, err=0]\n", - "213it [00:00, 300.22it/s, tot=213, good=213, rec=2812, err=0]\n" + "193it [00:00, 308.38it/s, tot=193, good=193, rec=2386, err=0]\n", + "172it [00:00, 284.34it/s, tot=172, good=172, rec=2240, err=0]\n", + "202it [00:00, 284.34it/s, tot=202, good=202, rec=2828, err=0]\n", + "213it [00:00, 295.75it/s, tot=213, good=213, rec=2812, err=0]\n" ] } ], @@ -82,10 +84,10 @@ "output_type": "stream", "text": [ "total 1608\n", - "-rw------- 1 clane staff 197K Aug 9 06:17 part-20230809061750-0002-of-0004.parquet\n", - "-rw------- 1 clane staff 240K Aug 9 06:17 part-20230809061750-0003-of-0004.parquet\n", - "-rw------- 1 clane staff 167K Aug 9 06:17 part-20230809061750-0000-of-0004.parquet\n", - "-rw------- 1 clane staff 194K Aug 9 06:17 part-20230809061750-0001-of-0004.parquet\n" + "-rw-------@ 1 clane staff 197K May 1 16:00 part-20240501160029-0002-of-0004.parquet\n", + "-rw-------@ 1 clane staff 240K May 1 16:00 part-20240501160029-0003-of-0004.parquet\n", + "-rw-------@ 1 clane staff 167K May 1 16:00 part-20240501160029-0000-of-0004.parquet\n", + "-rw-------@ 1 clane staff 194K May 1 16:00 part-20240501160029-0001-of-0004.parquet\n" ] } ], @@ -2635,6 +2637,28 @@ "filtered" ] }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "synthetic/derivatives/fmriprep/sub-04/ses-01/func/sub-04_ses-01_task-rest_space-MNI152NLin2009cAsym_desc-preproc_bold.nii\n", + "synthetic/derivatives/fmriprep/sub-04/ses-01/func/sub-04_ses-01_task-rest_space-T1w_desc-preproc_bold.nii\n", + "synthetic/derivatives/fmriprep/sub-04/ses-02/func/sub-04_ses-02_task-rest_space-MNI152NLin2009cAsym_desc-preproc_bold.nii\n", + "synthetic/derivatives/fmriprep/sub-04/ses-02/func/sub-04_ses-02_task-rest_space-T1w_desc-preproc_bold.nii\n", + "synthetic/sub-04/ses-01/func/sub-04_ses-01_task-rest_bold.nii\n", + "synthetic/sub-04/ses-02/func/sub-04_ses-02_task-rest_bold.nii\n" + ] + } + ], + "source": [ + "print(\"\\n\".join(sorted([str(f.path.relative_to(\"/Users/clane/Projects/B2T/bids2table/bids-examples/\")) for f in filtered.files])))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2644,7 +2668,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -2676,7 +2700,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -2695,7 +2719,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -2716,6 +2740,177 @@ "print(\"File paths:\\n\", \"\\n\".join([str(f.relative_path) for f in files]), sep=\"\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Skipping metadata\n", + "\n", + "Extracting JSON sidecar metadata can often be the most time-consuming step of the indexing process. By setting `with_meta=False`, `bidstable` can skip this expensive up-front processing. Here we index without metadata and get a small speedup. " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "780it [00:02, 319.48it/s, tot=780, good=780, rec=10266, err=0]\n" + ] + } + ], + "source": [ + "tab_no_meta = bids2table(root=\"../bids-examples\", with_meta=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to extract metadata for a subset of the files after the fact, you can use the `BIDSTable.with_meta` method." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | TaskName | \n", + "Manufacturer | \n", + "ManufacturersModelName | \n", + "ImageType | \n", + "AcquisitionTime | \n", + "AcquisitionDate | \n", + "MagneticFieldStrength | \n", + "FlipAngle | \n", + "EchoTime | \n", + "RepetitionTime | \n", + "EffectiveEchoSpacing | \n", + "SliceTiming | \n", + "PhaseEncodingDirection | \n", + "CogAtlasID | \n", + "SliceEncodingDirection | \n", + "StartTime | \n", + "SamplingFrequency | \n", + "Columns | \n", + "Sources | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
913 | \n", + "Rest | \n", + "Siemens | \n", + "Skyra | \n", + "[ORIGINAL, PRIMARY, M, MB, ND, MOSAI] | \n", + "192106.68 | \n", + "20180511.0 | \n", + "3.0 | \n", + "51.0 | \n", + "0.0424 | \n", + "0.735 | \n", + "0.00064 | \n", + "[0, 0.09, 0.18, 0.2675, 0.3575, 0.4475, 0.5375... | \n", + "j- | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
993 | \n", + "Rest | \n", + "Siemens | \n", + "Skyra | \n", + "[ORIGINAL, PRIMARY, M, MB, ND, MOSAI] | \n", + "192106.68 | \n", + "20180511.0 | \n", + "3.0 | \n", + "51.0 | \n", + "0.0424 | \n", + "0.735 | \n", + "0.00064 | \n", + "[0, 0.09, 0.18, 0.2675, 0.3575, 0.4475, 0.5375... | \n", + "j- | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "