From 05e2cbd800e119573a5670505c4e3315733509cd Mon Sep 17 00:00:00 2001 From: Connor Lane Date: Fri, 4 Aug 2023 12:36:45 -0400 Subject: [PATCH 1/2] Flatten columns, column renaming The multi-index columns have been clumsy for some folks. Now return a dataframe with flat columns and a string prefix indicating the column group. It is still possible to convert to a hierarchical index, select a particular group, or drop the group level in post-processing. This usage is shown in the example. Also rename sidecar -> json following the suggestion in (#15). --- bids2table/__init__.py | 2 +- bids2table/_bids2table.py | 8 +- bids2table/extractors/_inheritance.py | 38 +- bids2table/extractors/bids.py | 19 +- bids2table/extractors/dataset.py | 2 +- .../extractors/{sidecar.py => metadata.py} | 7 +- bids2table/helpers.py | 13 - example/example.ipynb | 1411 +++++++++++++---- tests/test_extractors/test_dataset.py | 6 +- .../{test_sidecar.py => test_metadata.py} | 14 +- tests/test_main.py | 4 +- 11 files changed, 1121 insertions(+), 403 deletions(-) rename bids2table/extractors/{sidecar.py => metadata.py} (89%) rename tests/test_extractors/{test_sidecar.py => test_metadata.py} (83%) diff --git a/bids2table/__init__.py b/bids2table/__init__.py index 5ee2b96..5953e40 100644 --- a/bids2table/__init__.py +++ b/bids2table/__init__.py @@ -2,5 +2,5 @@ Efficiently index large-scale BIDS datasets and derivatives """ -from ._bids2table import bids2table, load_index # noqa +from ._bids2table import bids2table # noqa from ._version import __version__, __version_tuple__ # noqa diff --git a/bids2table/_bids2table.py b/bids2table/_bids2table.py index be251d9..6c12ff4 100644 --- a/bids2table/_bids2table.py +++ b/bids2table/_bids2table.py @@ -9,7 +9,6 @@ from elbow.utils import setup_logging from bids2table.extractors.bids import extract_bids_subdir -from bids2table.helpers import flat_to_multi_columns, load_index setup_logging() @@ -74,7 +73,7 @@ def bids2table( if output.exists() and not stale: if return_df: logging.info("Loading cached index %s", output) - df = load_index(output) + df = pd.read_parquet(output) else: logging.info("Found cached index %s; nothing to do", output) df = None @@ -83,7 +82,6 @@ def bids2table( if not persistent: logging.info("Building index in memory") df = build_table(source=source, extract=extract_bids_subdir) - df = flat_to_multi_columns(df) return df logging.info("Building persistent Parquet index") @@ -95,6 +93,8 @@ def bids2table( overwrite=overwrite, workers=workers, worker_id=worker_id, + path_column="file__file_path", + mtime_column="file__mod_time", ) - df = load_index(output) if return_df else None + df = pd.read_parquet(output) if return_df else None return df diff --git a/bids2table/extractors/_inheritance.py b/bids2table/extractors/_inheritance.py index 5d5b40c..263d0d8 100644 --- a/bids2table/extractors/_inheritance.py +++ b/bids2table/extractors/_inheritance.py @@ -10,20 +10,18 @@ def find_bids_parents( query: Dict[str, str], - root: StrOrPath, - depth: Optional[int] = 4, + start: StrOrPath, + depth: Optional[int] = None, ) -> Generator[str, None, None]: """ Find all BIDS files satisfying the inheritance principle requirements for the given - ``query`` entities dict. The ``query`` must contain at least one of ``'suffix'`` or - ``'ext'``. Search up the directory hierarchy at most ``depth`` levels, - starting from and including ``root``. Yields matching ``path``s in decreasing - topological order. - - The default depth of 4 is appropriate for directory structures of the form - ``{dataset}/sub-{sub}/ses-{ses}/{datatype}``. If depth is None, search all the way - up the tree. Note also that the search stops once a ``dataset_description.json`` is - found. + `query` entities dict. The `query` must contain at least one of `'suffix'` or + `'ext'`. Search up the directory hierarchy at most `depth` levels, starting from and + including `start`, or until a `dataset_description.json` file is found, indicating + the BIDS dataset root directory. If `depth` is `None`, the search may continue to + the filesystem root. + + Yields matching `path`s in decreasing topological order. """ suffix = query.get("suffix") ext = query.get("ext") @@ -31,15 +29,15 @@ def find_bids_parents( raise ValueError("At least one of 'suffix' or 'ext' are required in `query`.") pattern = f"*{suffix}{ext}" if suffix else f"*{ext}" - root = Path(root).absolute() - if not root.is_dir(): - root = root.parent + start = Path(start).absolute() + if not start.is_dir(): + start = start.parent if depth is None: - depth = len(root.parts) + depth = len(start.parts) for _ in range(depth): - for path in _glob(root, pattern): + for path in _glob(start, pattern): entities = parse_bids_entities(path) if _test_bids_match(query, entities): yield str(path) @@ -47,20 +45,20 @@ def find_bids_parents( # Stop climbing the directory if we find the description json, which should # always be at the top-level dataset directory. # TODO: for nested datasets, can you inherit beyond the first root? I hope not.. - if is_dataset_root(root): + if is_dataset_root(start): break - root = root.parent + start = start.parent def find_first_bids_parent( - query: Dict[str, str], root: StrOrPath, depth: int = 4 + query: Dict[str, str], start: StrOrPath, depth: Optional[int] = None ) -> Optional[str]: """ Find the first BIDS parent file matching the ``query`` entities dict. Returns ``None`` if no parents found. See :func:`find_bids_parents` for more details. """ - return next(find_bids_parents(query, root, depth), None) + return next(find_bids_parents(query, start, depth), None) @lru_cache() diff --git a/bids2table/extractors/bids.py b/bids2table/extractors/bids.py index f126d62..da5363d 100644 --- a/bids2table/extractors/bids.py +++ b/bids2table/extractors/bids.py @@ -7,9 +7,9 @@ from elbow.record import Record, concat from elbow.typing import StrOrPath -from .dataset import extract_dataset_meta +from .dataset import extract_dataset from .entities import BIDSEntities -from .sidecar import extract_sidecar, is_associated_sidecar +from .metadata import extract_metadata, is_associated_sidecar def extract_bids_file(path: StrOrPath) -> Optional[Record]: @@ -27,18 +27,11 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]: ) return None - dset_meta = extract_dataset_meta(path) - bids_meta = extract_sidecar(path) - file_meta = extract_file_meta(path) + dset_rec = extract_dataset(path) + meta_rec = extract_metadata(path) + file_rec = extract_file_meta(path) - rec = concat( - { - "dataset": dset_meta, - "entities": entities, - "metadata": bids_meta, - "file": file_meta, - } - ) + rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "file": file_rec}) return rec diff --git a/bids2table/extractors/dataset.py b/bids2table/extractors/dataset.py index 66d1fb3..450a899 100644 --- a/bids2table/extractors/dataset.py +++ b/bids2table/extractors/dataset.py @@ -8,7 +8,7 @@ from elbow.typing import StrOrPath -def extract_dataset_meta(path: StrOrPath) -> Record: +def extract_dataset(path: StrOrPath) -> Record: """ Get info about the BIDS dataset that ``path`` belongs to. """ diff --git a/bids2table/extractors/sidecar.py b/bids2table/extractors/metadata.py similarity index 89% rename from bids2table/extractors/sidecar.py rename to bids2table/extractors/metadata.py index 1643cc5..a0efc86 100644 --- a/bids2table/extractors/sidecar.py +++ b/bids2table/extractors/metadata.py @@ -10,17 +10,16 @@ from .entities import parse_bids_entities -def extract_sidecar(path: StrOrPath) -> Record: +def extract_metadata(path: StrOrPath) -> Record: """ Load the JSON sidecar metadata associated with ``path``. Supports metadata inheritance by searching up the directory tree for matching JSON files. """ entities = parse_bids_entities(path) query = dict(entities, ext=".json") - root = Path(path).parent metadata = {} - sidecars = reversed(list(find_bids_parents(query, root=root))) + sidecars = reversed(list(find_bids_parents(query, start=Path(path).parent))) for path in sidecars: with open(path) as f: try: @@ -31,7 +30,7 @@ def extract_sidecar(path: StrOrPath) -> Record: ) # TODO: type aliases for json, pickle, etc so we can use a dataclass here. - rec = Record({"sidecar": metadata or None}, types={"sidecar": "json"}) + rec = Record({"json": metadata or None}, types={"json": "json"}) return rec diff --git a/bids2table/helpers.py b/bids2table/helpers.py index fa2b2c5..1b6e2eb 100644 --- a/bids2table/helpers.py +++ b/bids2table/helpers.py @@ -2,7 +2,6 @@ from typing import Any, Dict, Optional, Union import pandas as pd -from elbow.typing import StrOrPath from bids2table.extractors.entities import BIDSEntities @@ -70,15 +69,3 @@ def multi_to_flat_columns(df: pd.DataFrame, sep: str = "__") -> pd.DataFrame: df = df.copy(deep=False) df.columns = pd.Index(join_columns) return df - - -def load_index( - path: StrOrPath, split_columns: bool = True, sep: str = "__" -) -> pd.DataFrame: - """ - Load a bids2table index, optionally splitting columns into a multi index on `sep`. - """ - df = pd.read_parquet(path) - if split_columns: - df = flat_to_multi_columns(df, sep=sep) - return df diff --git a/example/example.ipynb b/example/example.ipynb index 086a147..5544473 100644 --- a/example/example.ipynb +++ b/example/example.ipynb @@ -6,11 +6,12 @@ "metadata": {}, "outputs": [], "source": [ - "# Required to load columns with extension types\n", + "# Required to load columns with extension types, e.g. json type\n", "import elbow.dtypes\n", "import pandas as pd\n", "\n", - "from bids2table import bids2table" + "from bids2table import bids2table\n", + "from bids2table.helpers import flat_to_multi_columns" ] }, { @@ -44,10 +45,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "187it [00:03, 58.67it/s, tot=187, good=187, rec=2430, err=0]\n", - "194it [00:03, 60.82it/s, tot=194, good=194, rec=2493, err=0]\n", - "200it [00:03, 61.01it/s, tot=200, good=200, rec=2586, err=0]\n", - "199it [00:03, 56.17it/s, tot=199, good=199, rec=2757, err=0]\n" + "176it [00:01, 92.58it/s, tot=176, good=176, rec=2245, err=0] \n", + "197it [00:02, 90.73it/s, tot=197, good=197, rec=2663, err=0] \n", + "203it [00:02, 93.22it/s, tot=203, good=203, rec=2630, err=0]\n", + "204it [00:02, 92.01it/s, tot=204, good=204, rec=2728, err=0] \n" ] } ], @@ -76,11 +77,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "total 2096\n", - "-rw------- 1 clane staff 214K Jul 15 17:26 part-20230715172609-0002-of-0004.parquet\n", - "-rw------- 1 clane staff 198K Jul 15 17:26 part-20230715172609-0001-of-0004.parquet\n", - "-rw------- 1 clane staff 203K Jul 15 17:26 part-20230715172609-0000-of-0004.parquet\n", - "-rw------- 1 clane staff 205K Jul 15 17:26 part-20230715172609-0003-of-0004.parquet\n" + "total 1992\n", + "-rw------- 1 clane staff 248K Aug 4 12:34 part-20230804123438-0003-of-0004.parquet\n", + "-rw------- 1 clane staff 247K Aug 4 12:34 part-20230804123438-0002-of-0004.parquet\n", + "-rw------- 1 clane staff 175K Aug 4 12:34 part-20230804123438-0001-of-0004.parquet\n", + "-rw------- 1 clane staff 161K Aug 4 12:34 part-20230804123438-0000-of-0004.parquet\n" ] } ], @@ -99,16 +100,293 @@ "\n", "Each row in the table corresponds to a BIDS data file. The table is organized with several groups of columns:\n", "\n", - "- `dataset`: dataset name, relative dataset path, and the JSON dataset description\n", - "- `entities`: All [valid BIDS entities](https://bids-specification.readthedocs.io/en/stable/appendices/entities.html) plus an `extra_entities` dict containing any extra entities\n", - "- `metadata`: BIDS JSON \"sidecar\" metadata\n", - "- `file`: General file metadata including the full file path and last modified time." + "- dataset (`ds__*`): dataset name, relative dataset path, and the JSON dataset description\n", + "- entities (`ent__*`): All [valid BIDS entities](https://bids-specification.readthedocs.io/en/stable/appendices/entities.html) plus an `extra_entities` dict containing any extra entities\n", + "- metadata (`meta__*`): BIDS JSON metadata\n", + "- file (`file__*`): General file metadata including the full file path and last modified time" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ds__datasetds__dataset_typeds__dataset_pathds__dataset_descriptionent__subent__sesent__sampleent__taskent__acqent__ceent__trcent__stainent__recent__dirent__runent__modent__echoent__flipent__invent__mtent__partent__procent__hemient__spaceent__splitent__recordingent__chunkent__atlasent__resent__denent__labelent__descent__datatypeent__suffixent__extent__extra_entitiesmeta__jsonfile__file_pathfile__link_targetfile__mod_time
0asl002raw/Users/clane/Projects/ScalableQC/code/bids2tab...{'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...Sub103NoneNoneNoneNoneNoneNoneNoneNoneNoneNaNNoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNoneanatT1w.nii.gz{}{'Manufacturer': 'Philips', 'ManufacturersMode.../Users/clane/Projects/ScalableQC/code/bids2tab...None1.687883e+09
1asl002raw/Users/clane/Projects/ScalableQC/code/bids2tab...{'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...Sub103NoneNoneNoneNoneNoneNoneNoneNoneNoneNaNNoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNoneperfm0scan.nii.gz{}{'Manufacturer': 'Philips', 'ManufacturersMode.../Users/clane/Projects/ScalableQC/code/bids2tab...None1.687883e+09
2asl002raw/Users/clane/Projects/ScalableQC/code/bids2tab...{'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...Sub103NoneNoneNoneNoneNoneNoneNoneNoneNoneNaNNoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNoneperfasl.nii.gz{}{'Manufacturer': 'Philips', 'ManufacturersMode.../Users/clane/Projects/ScalableQC/code/bids2tab...None1.687883e+09
\n", + "
" + ], + "text/plain": [ + " ds__dataset ds__dataset_type \n", + "0 asl002 raw \\\n", + "1 asl002 raw \n", + "2 asl002 raw \n", + "\n", + " ds__dataset_path \n", + "0 /Users/clane/Projects/ScalableQC/code/bids2tab... \\\n", + "1 /Users/clane/Projects/ScalableQC/code/bids2tab... \n", + "2 /Users/clane/Projects/ScalableQC/code/bids2tab... \n", + "\n", + " ds__dataset_description ent__sub ent__ses \n", + "0 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None \\\n", + "1 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None \n", + "2 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None \n", + "\n", + " ent__sample ent__task ent__acq ent__ce ent__trc ent__stain ent__rec \n", + "0 None None None None None None None \\\n", + "1 None None None None None None None \n", + "2 None None None None None None None \n", + "\n", + " ent__dir ent__run ent__mod ent__echo ent__flip ent__inv ent__mt \n", + "0 None NaN None NaN NaN NaN None \\\n", + "1 None NaN None NaN NaN NaN None \n", + "2 None NaN None NaN NaN NaN None \n", + "\n", + " ent__part ent__proc ent__hemi ent__space ent__split ent__recording \n", + "0 None None None None NaN None \\\n", + "1 None None None None NaN None \n", + "2 None None None None NaN None \n", + "\n", + " ent__chunk ent__atlas ent__res ent__den ent__label ent__desc ent__datatype \n", + "0 NaN None None None None None anat \\\n", + "1 NaN None None None None None perf \n", + "2 NaN None None None None None perf \n", + "\n", + " ent__suffix ent__ext ent__extra_entities \n", + "0 T1w .nii.gz {} \\\n", + "1 m0scan .nii.gz {} \n", + "2 asl .nii.gz {} \n", + "\n", + " meta__json \n", + "0 {'Manufacturer': 'Philips', 'ManufacturersMode... \\\n", + "1 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", + "2 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", + "\n", + " file__file_path file__link_target \n", + "0 /Users/clane/Projects/ScalableQC/code/bids2tab... None \\\n", + "1 /Users/clane/Projects/ScalableQC/code/bids2tab... None \n", + "2 /Users/clane/Projects/ScalableQC/code/bids2tab... None \n", + "\n", + " file__mod_time \n", + "0 1.687883e+09 \n", + "1 1.687883e+09 \n", + "2 1.687883e+09 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bids2table(\"../bids-examples\")\n", + "\n", + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also split the columns into a pandas [`MultiIndex`](https://pandas.pydata.org/docs/user_guide/advanced.html) using the helper function `flat_to_multi_columns()`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [ { "data": { @@ -129,14 +407,483 @@ "\n", "\n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datasetentitiesmetadatafile
dsentmetafile
datasetdataset_typedataset_pathdataset_descriptionsubsessampletaskacqcetrcstainrecdirrunmodechoflipinvmtpartprochemispacesplitrecordingchunkatlasresdenlabeldescdatatypesuffixextextra_entitiesjsonfile_pathlink_targetmod_time
0asl002raw/Users/clane/Projects/ScalableQC/code/bids2tab...{'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...Sub103NoneNoneNoneNoneNoneNoneNoneNoneNoneNaNNoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNoneanatT1w.nii.gz{}{'Manufacturer': 'Philips', 'ManufacturersMode.../Users/clane/Projects/ScalableQC/code/bids2tab...None1.687883e+09
1asl002raw/Users/clane/Projects/ScalableQC/code/bids2tab...{'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...Sub103NoneNoneNoneNoneNoneNoneNoneNoneNoneNaNNoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNoneperfm0scan.nii.gz{}{'Manufacturer': 'Philips', 'ManufacturersMode.../Users/clane/Projects/ScalableQC/code/bids2tab...None1.687883e+09
2asl002raw/Users/clane/Projects/ScalableQC/code/bids2tab...{'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...Sub103NoneNoneNoneNoneNoneNoneNoneNoneNoneNaNNoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNoneperfasl.nii.gz{}{'Manufacturer': 'Philips', 'ManufacturersMode.../Users/clane/Projects/ScalableQC/code/bids2tab...None1.687883e+09
\n", + "" + ], + "text/plain": [ + " ds \n", + " dataset dataset_type dataset_path \n", + "0 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \\\n", + "1 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \n", + "2 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \n", + "\n", + " ent \n", + " dataset_description sub ses sample \n", + "0 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None None \\\n", + "1 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None None \n", + "2 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None None \n", + "\n", + " \n", + " task acq ce trc stain rec dir run mod echo flip inv mt \n", + "0 None None None None None None None NaN None NaN NaN NaN None \\\n", + "1 None None None None None None None NaN None NaN NaN NaN None \n", + "2 None None None None None None None NaN None NaN NaN NaN None \n", + "\n", + " \n", + " part proc hemi space split recording chunk atlas res den label desc \n", + "0 None None None None NaN None NaN None None None None None \\\n", + "1 None None None None NaN None NaN None None None None None \n", + "2 None None None None NaN None NaN None None None None None \n", + "\n", + " \n", + " datatype suffix ext extra_entities \n", + "0 anat T1w .nii.gz {} \\\n", + "1 perf m0scan .nii.gz {} \n", + "2 perf asl .nii.gz {} \n", + "\n", + " meta \n", + " json \n", + "0 {'Manufacturer': 'Philips', 'ManufacturersMode... \\\n", + "1 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", + "2 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", + "\n", + " file \n", + " file_path link_target mod_time \n", + "0 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 \n", + "1 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 \n", + "2 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_multi = flat_to_multi_columns(df)\n", + "\n", + "df_multi.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This makes it easy to extract just a single group of columns, e.g. the BIDS entities." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subsessampletaskacqcetrcstainrecdirrunmodechoflipinvmtpartprochemispacesplitrecordingchunkatlasresdenlabeldescdatatypesuffixextextra_entities
0Sub103NoneNoneNoneNoneNoneNoneNoneNoneNoneNaNNoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNoneanatT1w.nii.gz{}
1Sub103NoneNoneNoneNoneNoneNoneNoneNoneNoneNaNNoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNoneperfm0scan.nii.gz{}
2Sub103NoneNoneNoneNoneNoneNoneNoneNoneNoneNaNNoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNoneperfasl.nii.gz{}
\n", + "
" + ], + "text/plain": [ + " sub ses sample task acq ce trc stain rec dir run mod \n", + "0 Sub103 None None None None None None None None None NaN None \\\n", + "1 Sub103 None None None None None None None None None NaN None \n", + "2 Sub103 None None None None None None None None None NaN None \n", + "\n", + " echo flip inv mt part proc hemi space split recording chunk \n", + "0 NaN NaN NaN None None None None None NaN None NaN \\\n", + "1 NaN NaN NaN None None None None None NaN None NaN \n", + "2 NaN NaN NaN None None None None None NaN None NaN \n", + "\n", + " atlas res den label desc datatype suffix ext extra_entities \n", + "0 None None None None None anat T1w .nii.gz {} \n", + "1 None None None None None perf m0scan .nii.gz {} \n", + "2 None None None None None perf asl .nii.gz {} " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "entities = df_multi[\"ent\"]\n", + "\n", + "entities.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also drop the first level of the column multi-index for shorter column names." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", " \n", " \n", " \n", @@ -174,7 +921,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -183,11 +930,11 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -219,18 +966,18 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -258,32 +1005,32 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -301,11 +1048,11 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -315,52 +1062,51 @@ "" ], "text/plain": [ - " dataset \n", " dataset dataset_type dataset_path \n", - "0 ds002 None /Users/clane/Projects/ScalableQC/code/bids2tab... \\\n", - "1 ds002 None /Users/clane/Projects/ScalableQC/code/bids2tab... \n", - "2 ds002 None /Users/clane/Projects/ScalableQC/code/bids2tab... \n", + "0 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \\\n", + "1 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \n", + "2 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \n", "\n", - " entities \n", - " dataset_description sub ses sample \n", - "0 {'BIDSVersion': '1.0.0', 'License': 'This data... 14 None None \\\n", - "1 {'BIDSVersion': '1.0.0', 'License': 'This data... 14 None None \n", - "2 {'BIDSVersion': '1.0.0', 'License': 'This data... 14 None None \n", + " dataset_description sub ses sample \n", + "0 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None None \\\n", + "1 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None None \n", + "2 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None None \n", "\n", - " \n", - " task acq ce trc stain rec dir run mod echo \n", - "0 None None None None None None None NaN None NaN \\\n", - "1 None None None None None None None NaN None NaN \n", - "2 mixedeventrelatedprobe None None None None None None 1.0 None NaN \n", + " task acq ce trc stain rec dir run mod echo flip inv mt \n", + "0 None None None None None None None NaN None NaN NaN NaN None \\\n", + "1 None None None None None None None NaN None NaN NaN NaN None \n", + "2 None None None None None None None NaN None NaN NaN NaN None \n", "\n", - " \n", - " flip inv mt part proc hemi space split recording chunk atlas res \n", - "0 NaN NaN None None None None None NaN None NaN None None \\\n", - "1 NaN NaN None None None None None NaN None NaN None None \n", - "2 NaN NaN None None None None None NaN None NaN None None \n", + " part proc hemi space split recording chunk atlas res den label \n", + "0 None None None None NaN None NaN None None None None \\\n", + "1 None None None None NaN None NaN None None None None \n", + "2 None None None None NaN None NaN None None None None \n", "\n", - " metadata \n", - " den label desc datatype suffix ext extra_entities sidecar \n", - "0 None None None anat T1w .nii.gz {} None \\\n", - "1 None None None anat inplaneT2 .nii.gz {} None \n", - "2 None None None func events .tsv {} None \n", + " desc datatype suffix ext extra_entities \n", + "0 None anat T1w .nii.gz {} \\\n", + "1 None perf m0scan .nii.gz {} \n", + "2 None perf asl .nii.gz {} \n", + "\n", + " json \n", + "0 {'Manufacturer': 'Philips', 'ManufacturersMode... \\\n", + "1 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", + "2 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", "\n", - " file \n", " file_path link_target mod_time \n", "0 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 \n", "1 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 \n", "2 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 " ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = bids2table(\"../bids-examples\")\n", + "df_drop = df_multi.droplevel(0, axis=1)\n", "\n", - "df.head(3)" + "df_drop.head(3)" ] }, { @@ -377,7 +1123,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -386,46 +1132,46 @@ "text": [ "Shape: (10266, 40)\n", "Columns:\n", - " ('dataset', 'dataset'): object\n", - " ('dataset', 'dataset_type'): object\n", - " ('dataset', 'dataset_path'): object\n", - " ('dataset', 'dataset_description'): json\n", - " ('entities', 'sub'): object\n", - " ('entities', 'ses'): object\n", - " ('entities', 'sample'): object\n", - " ('entities', 'task'): object\n", - " ('entities', 'acq'): object\n", - " ('entities', 'ce'): object\n", - " ('entities', 'trc'): object\n", - " ('entities', 'stain'): object\n", - " ('entities', 'rec'): object\n", - " ('entities', 'dir'): object\n", - " ('entities', 'run'): float64\n", - " ('entities', 'mod'): object\n", - " ('entities', 'echo'): float64\n", - " ('entities', 'flip'): float64\n", - " ('entities', 'inv'): float64\n", - " ('entities', 'mt'): object\n", - " ('entities', 'part'): object\n", - " ('entities', 'proc'): object\n", - " ('entities', 'hemi'): object\n", - " ('entities', 'space'): object\n", - " ('entities', 'split'): float64\n", - " ('entities', 'recording'): object\n", - " ('entities', 'chunk'): float64\n", - " ('entities', 'atlas'): object\n", - " ('entities', 'res'): object\n", - " ('entities', 'den'): object\n", - " ('entities', 'label'): object\n", - " ('entities', 'desc'): object\n", - " ('entities', 'datatype'): object\n", - " ('entities', 'suffix'): object\n", - " ('entities', 'ext'): object\n", - " ('entities', 'extra_entities'): json\n", - " ('metadata', 'sidecar'): json\n", - " ('file', 'file_path'): object\n", - " ('file', 'link_target'): object\n", - " ('file', 'mod_time'): float64\n" + " ds__dataset: object\n", + " ds__dataset_type: object\n", + " ds__dataset_path: object\n", + " ds__dataset_description: json\n", + " ent__sub: object\n", + " ent__ses: object\n", + " ent__sample: object\n", + " ent__task: object\n", + " ent__acq: object\n", + " ent__ce: object\n", + " ent__trc: object\n", + " ent__stain: object\n", + " ent__rec: object\n", + " ent__dir: object\n", + " ent__run: float64\n", + " ent__mod: object\n", + " ent__echo: float64\n", + " ent__flip: float64\n", + " ent__inv: float64\n", + " ent__mt: object\n", + " ent__part: object\n", + " ent__proc: object\n", + " ent__hemi: object\n", + " ent__space: object\n", + " ent__split: float64\n", + " ent__recording: object\n", + " ent__chunk: float64\n", + " ent__atlas: object\n", + " ent__res: object\n", + " ent__den: object\n", + " ent__label: object\n", + " ent__desc: object\n", + " ent__datatype: object\n", + " ent__suffix: object\n", + " ent__ext: object\n", + " ent__extra_entities: json\n", + " meta__json: json\n", + " file__file_path: object\n", + " file__link_target: object\n", + " file__mod_time: float64\n" ] } ], @@ -437,6 +1183,13 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Most columns are numeric (`float64`) or string (`object`) type. However there are some columns (`ds__dataset_description`, `ent__extra_entities`, `meta__json`) which use the elbow extension `json` type for arbitrary nested dicts." + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -444,24 +1197,12 @@ "source": [ "### Sorting rows\n", "\n", - "By default the rows are in arbitrary order. We can sort the values in place.\n", - "\n", - "If you find the hierarchical index annoying, you can drop the top level with:\n", - "\n", - "```python\n", - "df = df.droplevel(0, axis=1)\n", - "```\n", - "\n", - "You can also select one group of columns with e.g.\n", - "\n", - "```python\n", - "ents = df[\"entities\"]\n", - "```" + "By default the rows are in arbitrary order. We can sort the values in place." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -477,66 +1218,59 @@ " vertical-align: top;\n", " }\n", "\n", - " .dataframe thead tr th {\n", - " text-align: left;\n", + " .dataframe thead th {\n", + " text-align: right;\n", " }\n", "\n", "
datasetdataset_typesuffixextextra_entitiessidecarjsonfile_pathlink_targetmod_time
0ds002Noneasl002raw/Users/clane/Projects/ScalableQC/code/bids2tab...{'BIDSVersion': '1.0.0', 'License': 'This data...14{'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...Sub103NoneNoneNoneT1w.nii.gz{}None{'Manufacturer': 'Philips', 'ManufacturersMode.../Users/clane/Projects/ScalableQC/code/bids2tab...None1.687883e+09
1ds002Noneasl002raw/Users/clane/Projects/ScalableQC/code/bids2tab...{'BIDSVersion': '1.0.0', 'License': 'This data...14{'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...Sub103NoneNoneNoneNoneNoneNoneanatinplaneT2perfm0scan.nii.gz{}None{'Manufacturer': 'Philips', 'ManufacturersMode.../Users/clane/Projects/ScalableQC/code/bids2tab...None1.687883e+09
2ds002Noneasl002raw/Users/clane/Projects/ScalableQC/code/bids2tab...{'BIDSVersion': '1.0.0', 'License': 'This data...14{'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...Sub103NoneNonemixedeventrelatedprobeNoneNoneNoneNoneNoneNone1.0NoneNaNNoneNaNNaNNoneNoneNonefuncevents.tsvperfasl.nii.gz{}None{'Manufacturer': 'Philips', 'ManufacturersMode.../Users/clane/Projects/ScalableQC/code/bids2tab...None1.687883e+09
\n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -579,7 +1313,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -622,7 +1356,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -669,62 +1403,69 @@ "" ], "text/plain": [ - " dataset \n", - " dataset dataset_type dataset_path \n", - "6804 7t_trt None /Users/clane/Projects/ScalableQC/code/bids2tab... \\\n", - "6806 7t_trt None /Users/clane/Projects/ScalableQC/code/bids2tab... \n", - "6802 7t_trt None /Users/clane/Projects/ScalableQC/code/bids2tab... \n", + " ds__dataset ds__dataset_type \n", + "9284 7t_trt None \\\n", + "9286 7t_trt None \n", + "9282 7t_trt None \n", "\n", - " entities \n", - " dataset_description sub ses sample task \n", - "6804 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 None rest \\\n", - "6806 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 None rest \n", - "6802 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 None rest \n", + " ds__dataset_path \n", + "9284 /Users/clane/Projects/ScalableQC/code/bids2tab... \\\n", + "9286 /Users/clane/Projects/ScalableQC/code/bids2tab... \n", + "9282 /Users/clane/Projects/ScalableQC/code/bids2tab... \n", "\n", - " \n", - " acq ce trc stain rec dir run mod echo flip inv mt \n", - "6804 fullbrain None None None None None 1.0 None NaN NaN NaN None \\\n", - "6806 fullbrain None None None None None 1.0 None NaN NaN NaN None \n", - "6802 fullbrain None None None None None 2.0 None NaN NaN NaN None \n", + " ds__dataset_description ent__sub ent__ses \n", + "9284 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 \\\n", + "9286 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 \n", + "9282 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 \n", "\n", - " \n", - " part proc hemi space split recording chunk atlas res den label \n", - "6804 None None None None NaN None NaN None None None None \\\n", - "6806 None None None None NaN None NaN None None None None \n", - "6802 None None None None NaN None NaN None None None None \n", + " ent__sample ent__task ent__acq ent__ce ent__trc ent__stain ent__rec \n", + "9284 None rest fullbrain None None None None \\\n", + "9286 None rest fullbrain None None None None \n", + "9282 None rest fullbrain None None None None \n", "\n", - " \n", - " desc datatype suffix ext extra_entities \n", - "6804 None func bold .nii.gz {} \\\n", - "6806 None func physio .tsv.gz {} \n", - "6802 None func bold .nii.gz {} \n", + " ent__dir ent__run ent__mod ent__echo ent__flip ent__inv ent__mt \n", + "9284 None 1.0 None NaN NaN NaN None \\\n", + "9286 None 1.0 None NaN NaN NaN None \n", + "9282 None 2.0 None NaN NaN NaN None \n", "\n", - " metadata \n", - " sidecar \n", - "6804 {'CogAtlasID': 'https://www.cognitiveatlas.org... \\\n", - "6806 {'StartTime': 0, 'SamplingFrequency': 100, 'Co... \n", - "6802 {'CogAtlasID': 'https://www.cognitiveatlas.org... \n", + " ent__part ent__proc ent__hemi ent__space ent__split ent__recording \n", + "9284 None None None None NaN None \\\n", + "9286 None None None None NaN None \n", + "9282 None None None None NaN None \n", "\n", - " file \n", - " file_path link_target \n", - "6804 /Users/clane/Projects/ScalableQC/code/bids2tab... None \\\n", - "6806 /Users/clane/Projects/ScalableQC/code/bids2tab... None \n", - "6802 /Users/clane/Projects/ScalableQC/code/bids2tab... None \n", + " ent__chunk ent__atlas ent__res ent__den ent__label ent__desc \n", + "9284 NaN None None None None None \\\n", + "9286 NaN None None None None None \n", + "9282 NaN None None None None None \n", "\n", - " \n", - " mod_time \n", - "6804 1.687883e+09 \n", - "6806 1.687883e+09 \n", - "6802 1.687883e+09 " + " ent__datatype ent__suffix ent__ext ent__extra_entities \n", + "9284 func bold .nii.gz {} \\\n", + "9286 func physio .tsv.gz {} \n", + "9282 func bold .nii.gz {} \n", + "\n", + " meta__json \n", + "9284 {'CogAtlasID': 'https://www.cognitiveatlas.org... \\\n", + "9286 {'StartTime': 0, 'SamplingFrequency': 100, 'Co... \n", + "9282 {'CogAtlasID': 'https://www.cognitiveatlas.org... \n", + "\n", + " file__file_path file__link_target \n", + "9284 /Users/clane/Projects/ScalableQC/code/bids2tab... None \\\n", + "9286 /Users/clane/Projects/ScalableQC/code/bids2tab... None \n", + "9282 /Users/clane/Projects/ScalableQC/code/bids2tab... None \n", + "\n", + " file__mod_time \n", + "9284 1.687883e+09 \n", + "9286 1.687883e+09 \n", + "9282 1.687883e+09 " ] }, - "execution_count": 7, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sort_cols = [(\"dataset\", \"dataset\")] + [(\"entities\", k) for k in [\"sub\", \"ses\", \"task\", \"run\"]]\n", + "sort_cols = [\"ds__dataset\"] + [f\"ent__{k}\" for k in [\"sub\", \"ses\", \"task\", \"run\"]]\n", "\n", "df.sort_values(sort_cols, inplace=True)\n", "\n", @@ -743,7 +1484,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -784,13 +1525,13 @@ "dtype: int64" ] }, - "execution_count": 8, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ent_counts = df[\"entities\"].count(axis=0)\n", + "ent_counts = entities.count(axis=0)\n", "ent_counts" ] }, @@ -804,7 +1545,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -818,7 +1559,7 @@ "dtype: int64" ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -834,12 +1575,12 @@ "source": [ "### File counts\n", "\n", - "Count the number of data files per dataset and the number of files with sidecar metadata." + "Count the number of data files per dataset and the number of files with json metadata." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -864,7 +1605,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1096,7 +1837,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1106,7 +1847,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1328,108 +2069,108 @@ "" ], "text/plain": [ - " file_path sidecar\n", - "dataset \n", - "7t_trt 635 350\n", - "asl001 4 2\n", - "asl002 5 3\n", - "asl003 5 3\n", - "asl004 6 4\n", - "asl005 5 3\n", - "ds000001-fmriprep 416 52\n", - "ds000117 1105 657\n", - "ds000246 33 23\n", - "ds000247 105 75\n", - "ds000248 13 6\n", - "ds001 128 48\n", - "ds002 238 102\n", - "ds003 52 13\n", - "ds004332 58 58\n", - "ds005 128 48\n", - "ds006 384 164\n", - "ds007 276 118\n", - "ds008 197 84\n", - "ds009 360 144\n", - "ds011 196 84\n", - "ds051 245 102\n", - "ds052 142 52\n", - "ds101 105 42\n", - "ds102 130 52\n", - "ds105 148 71\n", - "ds107 245 98\n", - "ds108 442 204\n", - "ds109 174 69\n", - "ds110 396 180\n", - "ds113b 312 156\n", - "ds114 160 100\n", - "ds116 238 102\n", - "ds210 300 300\n", - "eeg_cbm 80 20\n", - "eeg_ds000117 336 224\n", - "eeg_ds003645s_hed 18 12\n", - "eeg_ds003645s_hed_inheritance 14 12\n", - "eeg_ds003645s_hed_library 18 12\n", - "eeg_ds003645s_hed_longform 18 12\n", - "eeg_face13 50 20\n", - "eeg_matchingpennies 42 35\n", - "eeg_rest_fmri 45 45\n", - "eeg_rishikesh 120 80\n", - "fnirs_automaticity 672 192\n", - "fnirs_tapping 25 5\n", - "genetics_ukbb 112 42\n", - "hcp_example_bids 5 3\n", - "ieeg_epilepsy 21 10\n", - "ieeg_epilepsy/derivatives/brainvisa 10 0\n", - "ieeg_epilepsyNWB 15 4\n", - "ieeg_epilepsyNWB/derivatives/brainvisa 10 0\n", - "ieeg_epilepsy_ecog 18 4\n", - "ieeg_filtered_speech 56 35\n", - "ieeg_motorMiller2007 121 57\n", - "ieeg_visual 22 13\n", - "ieeg_visual_multimodal 102 88\n", - "micr_SEM 5 5\n", - "micr_SEMzarr 3 3\n", - "micr_SPIM 10 10\n", - "motion_dualtask 355 195\n", - "motion_spotrotation 110 65\n", - "motion_systemvalidation 27 12\n", - "pet001 4 4\n", - "pet002 8 8\n", - "pet003 3 2\n", - "pet004 3 3\n", - "pet005 5 5\n", - "qmri_irt1 4 4\n", - "qmri_irt1/derivatives/qMRLab 2 2\n", - "qmri_megre 8 8\n", - "qmri_mese 32 32\n", - "qmri_mese/derivatives/qMRLab 3 3\n", - "qmri_mp2rage 5 4\n", - "qmri_mp2rage/derivatives/pymp2rage 2 2\n", - "qmri_mp2rageme 11 11\n", - "qmri_mp2rageme/derivatives/pymp2rage 4 4\n", - "qmri_mpm 53 53\n", - "qmri_mpm/derivatives/hmri 8 8\n", - "qmri_mtsat 5 5\n", - "qmri_mtsat/derivatives/qMRLab 5 5\n", - "qmri_qsm 2 2\n", - "qmri_qsm/derivatives/qMRLab 1 1\n", - "qmri_sa2rage 2 2\n", - "qmri_sa2rage/derivatives/sa2rage 1 1\n", - "qmri_tb1tfl 2 2\n", - "qmri_vfa 4 4\n", - "qmri_vfa/derivatives/qMRLab 3 3\n", - "synthetic 110 80\n", - "synthetic/derivatives/fmriprep 150 60" + " file_path json\n", + "dataset \n", + "7t_trt 635 350\n", + "asl001 4 2\n", + "asl002 5 3\n", + "asl003 5 3\n", + "asl004 6 4\n", + "asl005 5 3\n", + "ds000001-fmriprep 416 52\n", + "ds000117 1105 657\n", + "ds000246 33 23\n", + "ds000247 105 75\n", + "ds000248 13 6\n", + "ds001 128 48\n", + "ds002 238 102\n", + "ds003 52 13\n", + "ds004332 58 58\n", + "ds005 128 48\n", + "ds006 384 164\n", + "ds007 276 118\n", + "ds008 197 84\n", + "ds009 360 144\n", + "ds011 196 84\n", + "ds051 245 102\n", + "ds052 142 52\n", + "ds101 105 42\n", + "ds102 130 52\n", + "ds105 148 71\n", + "ds107 245 98\n", + "ds108 442 204\n", + "ds109 174 69\n", + "ds110 396 180\n", + "ds113b 312 156\n", + "ds114 160 100\n", + "ds116 238 102\n", + "ds210 300 300\n", + "eeg_cbm 80 20\n", + "eeg_ds000117 336 224\n", + "eeg_ds003645s_hed 18 12\n", + "eeg_ds003645s_hed_inheritance 14 12\n", + "eeg_ds003645s_hed_library 18 12\n", + "eeg_ds003645s_hed_longform 18 12\n", + "eeg_face13 50 20\n", + "eeg_matchingpennies 42 35\n", + "eeg_rest_fmri 45 45\n", + "eeg_rishikesh 120 80\n", + "fnirs_automaticity 672 384\n", + "fnirs_tapping 25 5\n", + "genetics_ukbb 112 70\n", + "hcp_example_bids 5 3\n", + "ieeg_epilepsy 21 10\n", + "ieeg_epilepsy/derivatives/brainvisa 10 0\n", + "ieeg_epilepsyNWB 15 4\n", + "ieeg_epilepsyNWB/derivatives/brainvisa 10 0\n", + "ieeg_epilepsy_ecog 18 4\n", + "ieeg_filtered_speech 56 35\n", + "ieeg_motorMiller2007 121 57\n", + "ieeg_visual 22 13\n", + "ieeg_visual_multimodal 102 88\n", + "micr_SEM 5 5\n", + "micr_SEMzarr 3 3\n", + "micr_SPIM 10 10\n", + "motion_dualtask 355 195\n", + "motion_spotrotation 110 65\n", + "motion_systemvalidation 27 12\n", + "pet001 4 4\n", + "pet002 8 8\n", + "pet003 3 2\n", + "pet004 3 3\n", + "pet005 5 5\n", + "qmri_irt1 4 4\n", + "qmri_irt1/derivatives/qMRLab 2 2\n", + "qmri_megre 8 8\n", + "qmri_mese 32 32\n", + "qmri_mese/derivatives/qMRLab 3 3\n", + "qmri_mp2rage 5 4\n", + "qmri_mp2rage/derivatives/pymp2rage 2 2\n", + "qmri_mp2rageme 11 11\n", + "qmri_mp2rageme/derivatives/pymp2rage 4 4\n", + "qmri_mpm 53 53\n", + "qmri_mpm/derivatives/hmri 8 8\n", + "qmri_mtsat 5 5\n", + "qmri_mtsat/derivatives/qMRLab 5 5\n", + "qmri_qsm 2 2\n", + "qmri_qsm/derivatives/qMRLab 1 1\n", + "qmri_sa2rage 2 2\n", + "qmri_sa2rage/derivatives/sa2rage 1 1\n", + "qmri_tb1tfl 2 2\n", + "qmri_vfa 4 4\n", + "qmri_vfa/derivatives/qMRLab 3 3\n", + "synthetic 110 80\n", + "synthetic/derivatives/fmriprep 150 60" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.droplevel(0, axis=1).groupby(\"dataset\").agg(\n", - " {\"file_path\": \"count\", \"sidecar\": \"count\"}\n", + "df_drop.groupby(\"dataset\").agg(\n", + " {\"file_path\": \"count\", \"json\": \"count\"}\n", ")" ] }, @@ -1445,7 +2186,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1492,17 +2233,17 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "194it [00:02, 69.81it/s, tot=194, good=194, rec=2493, err=0]\n", - "187it [00:02, 66.42it/s, tot=187, good=187, rec=2430, err=0]\n", - "200it [00:02, 67.12it/s, tot=200, good=200, rec=2586, err=0]\n", - "199it [00:03, 63.59it/s, tot=199, good=199, rec=2757, err=0]\n" + "176it [00:01, 89.47it/s, tot=176, good=176, rec=2245, err=0] \n", + "203it [00:02, 90.18it/s, tot=203, good=203, rec=2630, err=0]]\n", + "197it [00:02, 87.01it/s, tot=197, good=197, rec=2663, err=0]\n", + "204it [00:02, 88.66it/s, tot=204, good=204, rec=2728, err=0] \n" ] } ], diff --git a/tests/test_extractors/test_dataset.py b/tests/test_extractors/test_dataset.py index 2e98f40..e48cced 100644 --- a/tests/test_extractors/test_dataset.py +++ b/tests/test_extractors/test_dataset.py @@ -3,7 +3,7 @@ import pytest -from bids2table.extractors.dataset import extract_dataset_meta +from bids2table.extractors.dataset import extract_dataset @pytest.fixture @@ -22,8 +22,8 @@ def bids_dataset(tmp_path: Path) -> Path: return ds_dir -def test_extract_dataset_meta(bids_dataset: Path): - dataset_meta = extract_dataset_meta(bids_dataset) +def test_extract_dataset(bids_dataset: Path): + dataset_meta = extract_dataset(bids_dataset) assert dataset_meta["dataset"] == "dummy_dataset" assert dataset_meta["dataset_path"] == str(bids_dataset) assert dataset_meta["dataset_description"]["Name"] == "Dummy dataset" diff --git a/tests/test_extractors/test_sidecar.py b/tests/test_extractors/test_metadata.py similarity index 83% rename from tests/test_extractors/test_sidecar.py rename to tests/test_extractors/test_metadata.py index b39c352..043ecde 100644 --- a/tests/test_extractors/test_sidecar.py +++ b/tests/test_extractors/test_metadata.py @@ -3,7 +3,7 @@ import pytest -from bids2table.extractors.sidecar import extract_sidecar, is_associated_sidecar +from bids2table.extractors.metadata import extract_metadata, is_associated_sidecar @pytest.fixture @@ -49,14 +49,14 @@ def bids_dataset(tmp_path: Path): json.dump({"E": True}, f) # Inherited from first json but not second - expected_sidecar = {"A": True, "B": True, "C": True} - return ds_dir, image_path, expected_sidecar + expected_json = {"A": True, "B": True, "C": True} + return ds_dir, image_path, expected_json -def test_extract_sidecar(bids_dataset): - _, image_path, expected_sidecar = bids_dataset - rec = extract_sidecar(image_path) - assert rec["sidecar"] == expected_sidecar +def test_extract_metadata(bids_dataset): + _, image_path, expected_json = bids_dataset + rec = extract_metadata(image_path) + assert rec["json"] == expected_json @pytest.mark.parametrize( diff --git a/tests/test_main.py b/tests/test_main.py index 131925e..b4df3f4 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -3,10 +3,10 @@ from pathlib import Path from typing import List +import pandas as pd import pytest from bids2table import __main__ as cli -from bids2table import load_index BIDS_EXAMPLES = Path(__file__).parent.parent / "bids-examples" @@ -36,7 +36,7 @@ def test_main(tmp_path: Path): with patch_argv(argv): cli.main() - df = load_index(output) + df = pd.read_parquet(output) assert df.shape == (128, 40) From c97e550bb73b5c347dabf2d29d675acef3275e0d Mon Sep 17 00:00:00 2001 From: Connor Lane Date: Fri, 4 Aug 2023 12:55:07 -0400 Subject: [PATCH 2/2] Add test for flat <-> multi column conversion --- tests/test_helpers.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index b30c713..8515650 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -3,7 +3,11 @@ import pandas as pd import pytest -from bids2table.helpers import join_bids_path +from bids2table.helpers import ( + flat_to_multi_columns, + join_bids_path, + multi_to_flat_columns, +) @pytest.mark.parametrize( @@ -89,5 +93,24 @@ def test_join_bids_path( assert str(path) == expected +@pytest.mark.parametrize("sep", ["__", "."]) +def test_flat_to_multi_columns(sep: str): + df = pd.DataFrame( + { + f"A{sep}a": [1, 2, 3], + f"A{sep}b": ["a", "b", "c"], + f"B{sep}a": [4, 5, 6], + f"B{sep}b": ["d", "e", "f"], + } + ) + multi_index = pd.MultiIndex.from_product([["A", "B"], ["a", "b"]]) + + df_multi = flat_to_multi_columns(df, sep=sep) + assert df_multi.columns.equals(multi_index) + + df_flat = multi_to_flat_columns(df_multi, sep=sep) + assert df_flat.equals(df) + + if __name__ == "__main__": pytest.main([__file__])
datasetentitiesmetadatafile
datasetdataset_typedataset_pathdataset_descriptionsubsessampletaskacqcetrcstainrecdirrunmodechoflipinvmtpartprochemispacesplitrecordingchunkatlasresdenlabeldescdatatypesuffixextextra_entitiessidecarfile_pathlink_targetmod_timeds__datasetds__dataset_typeds__dataset_pathds__dataset_descriptionent__subent__sesent__sampleent__taskent__acqent__ceent__trcent__stainent__recent__dirent__runent__modent__echoent__flipent__invent__mtent__partent__procent__hemient__spaceent__splitent__recordingent__chunkent__atlasent__resent__denent__labelent__descent__datatypeent__suffixent__extent__extra_entitiesmeta__jsonfile__file_pathfile__link_targetfile__mod_time
680492847t_trtNone/Users/clane/Projects/ScalableQC/code/bids2tab...1.687883e+09
680692867t_trtNone/Users/clane/Projects/ScalableQC/code/bids2tab...1.687883e+09
680292827t_trtNone/Users/clane/Projects/ScalableQC/code/bids2tab...
file_pathsidecarjson
dataset
fnirs_automaticity672192384
fnirs_tapping
genetics_ukbb1124270
hcp_example_bids