From 918fcf45ffe6a69c5068b71465a01a83ae779010 Mon Sep 17 00:00:00 2001 From: Connor Lane Date: Wed, 9 Aug 2023 06:30:10 -0400 Subject: [PATCH] Higher-level API (#17) Add a higher level `BIDSTable` interface inspired by the [proposed PyBIDS API redesign](https://github.com/bids-standard/pybids/issues/989). * Move entities module one level up * Move `join_bids_path()` helper into `entities` * Add `BIDSTable` subclass of `DataFrame` Add `BIDSTable` subclass of `DataFrame` with convenience methods for accessing subtables and flattened metadata. * Add long names to entities field metadata * Add table `filter()` method Add `BIDSTable.filter()` which filters rows according to a condition applied to a single column. The supported conditions follow `pandas.Series.filter()`. * Add `files` property returning list of `BIDSFile`s Also change `file` column group to `finfo` to try to limit possible confusion. * Update example and bug fixes Bug fixes: - Set the index of `flat_metadata` to the parent table's index. - Treat NA in the row mask as False in `filter()`. * Add properties for subjects, datatypes, etc * Add `sort_entities()` * Upgrade required python to >=3.8 * Add `filter_multi` method and documentation PyBIDS supports querying a layout with multiple filters specified as keyword arguments. This is a nice interface, and is also useful for programmatic filtering. Here we add a `filter_multi()` method to do something similar. * Flatten JSON metadata only to first level * Fix mypy error * Move some things around * Add `func` arg to `filter()` Add a `func` arg option to `filter` for arbitrary lambda function filtering. Also move `join_bids_path()` into the `table` module. * More moving around * Don't use `removeprefix` * Yet more moving around * Update example * Add a comment on the filter api * Change arg name output -> index_path Having the argument be `output` in `bids2table` was confusing when you only want to load a table. 
--- .github/workflows/ci.yaml | 4 +- .gitignore | 3 +- bids2table/__init__.py | 18 +- bids2table/__main__.py | 4 +- bids2table/{_bids2table.py => _b2t.py} | 51 +- bids2table/{extractors => }/entities.py | 109 +- bids2table/extractors/__init__.py | 3 + bids2table/extractors/bids.py | 5 +- bids2table/extractors/image.py | 12 +- .../{_inheritance.py => inheritance.py} | 3 +- bids2table/extractors/metadata.py | 5 +- bids2table/helpers.py | 71 - bids2table/table.py | 434 ++++ example/example.ipynb | 2197 +++++++++++++---- pyproject.toml | 2 +- tests/test_bids2table.py | 18 +- tests/{test_extractors => }/test_entities.py | 2 +- tests/test_helpers.py | 116 - tests/test_table.py | 207 ++ 19 files changed, 2521 insertions(+), 743 deletions(-) rename bids2table/{_bids2table.py => _b2t.py} (67%) rename bids2table/{extractors => }/entities.py (68%) rename bids2table/extractors/{_inheritance.py => inheritance.py} (97%) delete mode 100644 bids2table/helpers.py create mode 100644 bids2table/table.py rename tests/{test_extractors => }/test_entities.py (97%) delete mode 100644 tests/test_helpers.py create mode 100644 tests/test_table.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index d65a6ea..1334db3 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,10 +13,10 @@ jobs: - uses: actions/checkout@v3 with: submodules: 'true' - - name: Set up Python 3.7 + - name: Set up Python 3.8 uses: actions/setup-python@v3 with: - python-version: "3.7" + python-version: "3.8" - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.gitignore b/.gitignore index 02be247..2a92624 100644 --- a/.gitignore +++ b/.gitignore @@ -23,8 +23,9 @@ htmlcov .vscode/ .env -# Local scratch +# Local data and scratch .scratch +example/bids-examples.b2t # Local environment .venv diff --git a/bids2table/__init__.py b/bids2table/__init__.py index 5953e40..41986b1 100644 --- a/bids2table/__init__.py +++ b/bids2table/__init__.py @@ -1,6 +1,20 
@@ """ -Efficiently index large-scale BIDS datasets and derivatives +Efficiently index and query large-scale BIDS datasets and derivatives. """ -from ._bids2table import bids2table # noqa +# Register elbow extension types +import elbow.dtypes # noqa + +from ._b2t import bids2table from ._version import __version__, __version_tuple__ # noqa +from .entities import BIDSEntities, parse_bids_entities +from .table import BIDSFile, BIDSTable, join_bids_path + +__all__ = [ + "bids2table", + "BIDSTable", + "BIDSFile", + "BIDSEntities", + "parse_bids_entities", + "join_bids_path", +] diff --git a/bids2table/__main__.py b/bids2table/__main__.py index ca83aad..d1e3e3e 100644 --- a/bids2table/__main__.py +++ b/bids2table/__main__.py @@ -57,12 +57,12 @@ def main(): bids2table( root=args.root, persistent=True, - output=args.output, + index_path=args.output, incremental=args.incremental, overwrite=args.overwrite, workers=args.workers, worker_id=args.worker_id, - return_df=False, + return_table=False, ) diff --git a/bids2table/_bids2table.py b/bids2table/_b2t.py similarity index 67% rename from bids2table/_bids2table.py rename to bids2table/_b2t.py index 6c12ff4..43be2c4 100644 --- a/bids2table/_bids2table.py +++ b/bids2table/_b2t.py @@ -2,13 +2,13 @@ from pathlib import Path from typing import Optional -import pandas as pd from elbow.builders import build_parquet, build_table from elbow.sources.filesystem import Crawler from elbow.typing import StrOrPath from elbow.utils import setup_logging from bids2table.extractors.bids import extract_bids_subdir +from bids2table.table import BIDSTable setup_logging() @@ -17,21 +17,21 @@ def bids2table( root: StrOrPath, *, persistent: bool = False, - output: Optional[StrOrPath] = None, + index_path: Optional[StrOrPath] = None, incremental: bool = False, overwrite: bool = False, workers: Optional[int] = None, worker_id: Optional[int] = None, - return_df: bool = True, -) -> Optional[pd.DataFrame]: + return_table: bool = True, +) -> 
Optional[BIDSTable]: """ Index a BIDS dataset directory and load as a pandas DataFrame. Args: root: path to BIDS dataset persistent: whether to save index to disk as a Parquet dataset - output: path to output Parquet dataset directory if `persistent` is - `True`. Defaults to `root / "index.b2t". + index_path: path to BIDS Parquet index to generate or load. Defaults to `root / + "index.b2t"`. Index generation requires `persistent=True`. incremental: update index incrementally with only new or changed files. overwrite: overwrite previous index. workers: number of parallel processes. If `None` or 1, run in the main @@ -40,17 +40,19 @@ def bids2table( worker_id: optional worker ID to use when scheduling parallel tasks externally. Specifying the number of workers is required in this case. Incompatible with overwrite. - return_df: whether to return the dataframe or just build the persistent index. + return_table: whether to return the BIDS table or just build the persistent + index. Returns: - A DataFrame containing the BIDS Index. + A `BIDSTable` representing the indexed dataset(s), or `None` if `return_table` + is `False`. 
""" if worker_id is not None and not persistent: raise ValueError( "worker_id is only supported when generating a persistent index" ) - if not (return_df or persistent): - raise ValueError("persistent and return_df should not both be False") + if not (return_table or persistent): + raise ValueError("persistent and return_table should not both be False") root = Path(root).expanduser().resolve() if not root.is_dir(): @@ -64,31 +66,32 @@ def bids2table( follow_links=True, ) - if output is None: - output = root / "index.b2t" + if index_path is None: + index_path = root / "index.b2t" else: - output = Path(output).expanduser().resolve() + index_path = Path(index_path).expanduser().resolve() stale = overwrite or incremental or worker_id is not None - if output.exists() and not stale: - if return_df: - logging.info("Loading cached index %s", output) - df = pd.read_parquet(output) + if index_path.exists() and not stale: + if return_table: + logging.info("Loading cached index %s", index_path) + tab = BIDSTable.from_parquet(index_path) else: - logging.info("Found cached index %s; nothing to do", output) - df = None - return df + logging.info("Found cached index %s; nothing to do", index_path) + tab = None + return tab if not persistent: logging.info("Building index in memory") df = build_table(source=source, extract=extract_bids_subdir) - return df + tab = BIDSTable.from_df(df) + return tab logging.info("Building persistent Parquet index") build_parquet( source=source, extract=extract_bids_subdir, - output=output, + output=index_path, incremental=incremental, overwrite=overwrite, workers=workers, @@ -96,5 +99,5 @@ def bids2table( path_column="file__file_path", mtime_column="file__mod_time", ) - df = pd.read_parquet(output) if return_df else None - return df + tab = BIDSTable.from_parquet(index_path) if return_table else None + return tab diff --git a/bids2table/extractors/entities.py b/bids2table/entities.py similarity index 68% rename from bids2table/extractors/entities.py 
rename to bids2table/entities.py index 24c1903..316fcdc 100644 --- a/bids2table/extractors/entities.py +++ b/bids2table/entities.py @@ -1,9 +1,14 @@ +""" +A structured representation for BIDS entities. +""" + import re import warnings from dataclasses import asdict, dataclass, field, fields from functools import lru_cache from pathlib import Path -from typing import Any, Callable, Dict, Iterable, Optional, Union +from types import MappingProxyType +from typing import Any, Callable, Dict, Iterable, List, Optional, Union import pandas as pd from elbow.typing import StrOrPath @@ -26,6 +31,7 @@ def bids_field( name: str, + display_name: str, required: bool = False, allowed_values: Optional[Iterable] = None, default: Optional[Any] = None, @@ -35,9 +41,13 @@ def bids_field( BIDS entity dataclass field. """ if allowed_values is not None: - allowed_values = set(allowed_values) + allowed_values = list(allowed_values) - metadata = dict(name=name, allowed_values=allowed_values) + metadata = { + "name": name, + "display_name": display_name, + "allowed_values": allowed_values, + } if required: fld = field(metadata=metadata) elif default_factory is not None: @@ -60,48 +70,72 @@ class BIDSEntities: https://bids-specification.readthedocs.io/en/stable/appendices/entities.html """ - sub: str = bids_field(name="Subject", required=True) - ses: Optional[str] = bids_field(name="Session") - sample: Optional[str] = bids_field(name="Sample") - task: Optional[str] = bids_field(name="Task") - acq: Optional[str] = bids_field(name="Acquisition") - ce: Optional[str] = bids_field(name="Contrast Enhancing Agent") - trc: Optional[str] = bids_field(name="Tracer") - stain: Optional[str] = bids_field(name="Stain") - rec: Optional[str] = bids_field(name="Reconstruction") - dir: Optional[str] = bids_field(name="Phase-Encoding Direction") - run: Optional[int] = bids_field(name="Run") - mod: Optional[str] = bids_field(name="Corresponding Modality") - echo: Optional[int] = bids_field(name="Echo") - flip: 
Optional[int] = bids_field(name="Flip Angle") - inv: Optional[int] = bids_field(name="Inversion Time") + sub: str = bids_field(name="subject", display_name="Subject", required=True) + ses: Optional[str] = bids_field(name="session", display_name="Session") + sample: Optional[str] = bids_field(name="sample", display_name="Sample") + task: Optional[str] = bids_field(name="task", display_name="Task") + acq: Optional[str] = bids_field(name="acquisition", display_name="Acquisition") + ce: Optional[str] = bids_field( + name="ceagent", display_name="Contrast Enhancing Agent" + ) + trc: Optional[str] = bids_field(name="tracer", display_name="Tracer") + stain: Optional[str] = bids_field(name="stain", display_name="Stain") + rec: Optional[str] = bids_field( + name="reconstruction", display_name="Reconstruction" + ) + dir: Optional[str] = bids_field( + name="direction", display_name="Phase-Encoding Direction" + ) + run: Optional[int] = bids_field(name="run", display_name="Run") + mod: Optional[str] = bids_field( + name="modality", display_name="Corresponding Modality" + ) + echo: Optional[int] = bids_field(name="echo", display_name="Echo") + flip: Optional[int] = bids_field(name="flip", display_name="Flip Angle") + inv: Optional[int] = bids_field(name="inversion", display_name="Inversion Time") mt: Optional[str] = bids_field( - name="Magnetization Transfer", allowed_values={"on", "off"} + name="mtransfer", + display_name="Magnetization Transfer", + allowed_values={"on", "off"}, ) part: Optional[str] = bids_field( - name="Part", allowed_values={"mag", "phase", "real", "imag"} + name="part", + display_name="Part", + allowed_values={"mag", "phase", "real", "imag"}, + ) + proc: Optional[str] = bids_field( + name="processing", display_name="Processed (on device)" + ) + hemi: Optional[str] = bids_field( + name="hemisphere", display_name="Hemisphere", allowed_values={"L", "R"} ) - proc: Optional[str] = bids_field(name="Processed (on device)") - hemi: Optional[str] = 
bids_field(name="Hemisphere", allowed_values={"L", "R"}) - space: Optional[str] = bids_field(name="Space") - split: Optional[int] = bids_field(name="Split") - recording: Optional[str] = bids_field(name="Recording") - chunk: Optional[int] = bids_field(name="Chunk") - atlas: Optional[str] = bids_field(name="Atlas") - res: Optional[str] = bids_field(name="Resolution") - den: Optional[str] = bids_field(name="Density") - label: Optional[str] = bids_field(name="Label") - desc: Optional[str] = bids_field(name="Description") + space: Optional[str] = bids_field(name="space", display_name="Space") + split: Optional[int] = bids_field(name="split", display_name="Split") + recording: Optional[str] = bids_field(name="recording", display_name="Recording") + chunk: Optional[int] = bids_field(name="chunk", display_name="Chunk") + atlas: Optional[str] = bids_field(name="atlas", display_name="Atlas") + res: Optional[str] = bids_field(name="resolution", display_name="Resolution") + den: Optional[str] = bids_field(name="density", display_name="Density") + label: Optional[str] = bids_field(name="label", display_name="Label") + desc: Optional[str] = bids_field(name="description", display_name="Description") datatype: Optional[str] = bids_field( - name="Data type", allowed_values=BIDS_DATATYPES + name="datatype", display_name="Data type", allowed_values=BIDS_DATATYPES ) - suffix: Optional[str] = bids_field(name="Suffix") - ext: Optional[str] = bids_field(name="Extension") + suffix: Optional[str] = bids_field(name="suffix", display_name="Suffix") + ext: Optional[str] = bids_field(name="extension", display_name="Extension") extra_entities: Optional[Dict[str, Union[str, int]]] = bids_field( - name="Extra entities", + name="extra_entities", + display_name="Extra entities", default_factory=dict, ) + @staticmethod + def special() -> List[str]: + """ + Get list of field keys which are not standard entities. 
+ """ + return ["datatype", "suffix", "ext", "extra_entities"] + @classmethod def from_dict(cls, entities: Dict[str, Any], valid_only: bool = False): """ @@ -309,3 +343,8 @@ def parse_bids_entities(path: StrOrPath) -> Dict[str, str]: if v is not None: entities[k] = v return entities + + +ENTITY_NAMES_TO_KEYS = MappingProxyType( + {f.metadata["name"]: f.name for f in fields(BIDSEntities)} +) diff --git a/bids2table/extractors/__init__.py b/bids2table/extractors/__init__.py index e69de29..05c15ea 100644 --- a/bids2table/extractors/__init__.py +++ b/bids2table/extractors/__init__.py @@ -0,0 +1,3 @@ +""" +[Elbow](https://github.com/cmi-dair/elbow) extract functions for BIDS datasets. +""" diff --git a/bids2table/extractors/bids.py b/bids2table/extractors/bids.py index da5363d..b2260da 100644 --- a/bids2table/extractors/bids.py +++ b/bids2table/extractors/bids.py @@ -7,8 +7,9 @@ from elbow.record import Record, concat from elbow.typing import StrOrPath +from bids2table.entities import BIDSEntities + from .dataset import extract_dataset -from .entities import BIDSEntities from .metadata import extract_metadata, is_associated_sidecar @@ -31,7 +32,7 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]: meta_rec = extract_metadata(path) file_rec = extract_file_meta(path) - rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "file": file_rec}) + rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "finfo": file_rec}) return rec diff --git a/bids2table/extractors/image.py b/bids2table/extractors/image.py index d1155f7..38aed1e 100644 --- a/bids2table/extractors/image.py +++ b/bids2table/extractors/image.py @@ -7,7 +7,7 @@ from elbow.typing import StrOrPath from nibabel.filebasedimages import ImageFileError -from .entities import parse_bids_entities +from bids2table.entities import parse_bids_entities try: import nifti @@ -41,6 +41,9 @@ def extract_image_meta(path: StrOrPath, *, backend: str = "nibabel") -> Record: def _read_image_meta( path: 
str, backend: str = "nibabel" ) -> Tuple[Dict[str, Any], np.ndarray]: + header: Dict[str, Any] + affine: np.ndarray + if backend == "nifti": if not has_nifti: raise ModuleNotFoundError("nifti image backend not installed") @@ -51,7 +54,12 @@ def _read_image_meta( affine = None else: img = nib.load(path) - header = dict(img.header) + if not isinstance(img, nib.Nifti1Image): + raise TypeError( + f"Found image type {type(img).__name__}; only Nifti1Image supported" + ) + + header = {k: v for k, v in img.header.items()} affine = np.asarray(img.affine) header = {k: _cast_header_value(v) for k, v in header.items()} diff --git a/bids2table/extractors/_inheritance.py b/bids2table/extractors/inheritance.py similarity index 97% rename from bids2table/extractors/_inheritance.py rename to bids2table/extractors/inheritance.py index 263d0d8..49a06bd 100644 --- a/bids2table/extractors/_inheritance.py +++ b/bids2table/extractors/inheritance.py @@ -4,8 +4,9 @@ from elbow.typing import StrOrPath +from bids2table.entities import parse_bids_entities + from .dataset import is_dataset_root -from .entities import parse_bids_entities def find_bids_parents( diff --git a/bids2table/extractors/metadata.py b/bids2table/extractors/metadata.py index a0efc86..e7df27d 100644 --- a/bids2table/extractors/metadata.py +++ b/bids2table/extractors/metadata.py @@ -6,8 +6,9 @@ from elbow.record import Record from elbow.typing import StrOrPath -from ._inheritance import _glob, find_bids_parents -from .entities import parse_bids_entities +from bids2table.entities import parse_bids_entities + +from .inheritance import _glob, find_bids_parents def extract_metadata(path: StrOrPath) -> Record: diff --git a/bids2table/helpers.py b/bids2table/helpers.py deleted file mode 100644 index 1b6e2eb..0000000 --- a/bids2table/helpers.py +++ /dev/null @@ -1,71 +0,0 @@ -from pathlib import Path -from typing import Any, Dict, Optional, Union - -import pandas as pd - -from bids2table.extractors.entities import BIDSEntities - -
-def join_bids_path( - row: Union[pd.Series, Dict[str, Any]], - prefix: Optional[Union[str, Path]] = None, - valid_only: bool = True, -) -> Path: - """ - Reconstruct a BIDS path from a table row/record or entities dict. - - Example:: - - df = pd.read_parquet("dataset.parquet") - paths = df.apply(join_bids_path, axis=1) - """ - if "entities" in row: - row = row["entities"] - - if isinstance(row, pd.Series): - row = row.to_dict() - - entities = BIDSEntities.from_dict(row, valid_only=valid_only) - path = entities.to_path(prefix=prefix, valid_only=valid_only) - return path - - -def flat_to_multi_columns(df: pd.DataFrame, sep: str = "__") -> pd.DataFrame: - """ - Convert a flat column index to a MultiIndex by splitting on `sep`. - """ - # Do nothing if already a MultiIndex - if isinstance(df.columns, pd.MultiIndex): - return df - - # Do nothing for empty df - # TODO: It would probably be better if the header was initialized even if there are - # no records. - if len(df.columns) == 0: - return df - - split_columns = [col.split(sep) for col in df.columns] - num_levels = max(map(len, split_columns)) - - def _pad_col(col): - return tuple((num_levels - len(col)) * [None] + col) - - df = df.copy(deep=False) - df.columns = pd.MultiIndex.from_tuples(map(_pad_col, split_columns)) - return df - - -def multi_to_flat_columns(df: pd.DataFrame, sep: str = "__") -> pd.DataFrame: - """ - Convert a column MultiIndex to a flat index by joining on `sep`. 
- """ - # Do nothing if already flat - if not isinstance(df.columns, pd.MultiIndex): - return df - - columns = df.columns.to_flat_index() - join_columns = [sep.join(col) for col in columns] - - df = df.copy(deep=False) - df.columns = pd.Index(join_columns) - return df diff --git a/bids2table/table.py b/bids2table/table.py new file mode 100644 index 0000000..be7cc8b --- /dev/null +++ b/bids2table/table.py @@ -0,0 +1,434 @@ +from dataclasses import dataclass, field +from functools import cached_property +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Optional, Union + +import pandas as pd + +from bids2table.entities import ENTITY_NAMES_TO_KEYS, BIDSEntities + + +class BIDSTable(pd.DataFrame): + """ + A table representing one or more BIDS datasets. + + Each row in the table corresponds to a BIDS data file. The table is organized with + several groups of columns: + + - **dataset** (`ds`): dataset name, relative dataset path, and the JSON dataset description + - **entities** (`ent`): All [valid BIDS entities](https://bids-specification.readthedocs.io/en/stable/appendices/entities.html) plus an `extra_entities` dict containing any extra entities + - **metadata** (`meta`): BIDS JSON metadata + - **file info** (`finfo`): General file info including the full file path and last modified time + + It's recommended to create a `BIDSTable` using the main `bids2table.bids2table` + function or use one of the constructor methods: + + - `BIDSTable.from_df` + - `BIDSTable.from_parquet` + + ### Example + + ```python + tab = BIDSTable.from_parquet("dataset/index.b2t") + tab = tab.sort_entities(["dataset", "sub", "ses", "task", "run"]) + tab = ( + tab + .filter("dataset", "ds001") + .filter("sub", items=["04", "06"]) + .filter("RepetitionTime", 2.0) + ) + # Get list of BIDSFiles + files = tab.files + ``` + """ + + @cached_property + def nested(self) -> pd.DataFrame: + """ + A copy of the table with column labels organized in a nested + 
[`MultiIndex`](https://pandas.pydata.org/docs/user_guide/advanced.html#hierarchical-indexing-multiindex). + """ + # Cast back to the base class since we no longer have the full BIDS table + # structure. + return pd.DataFrame(flat_to_multi_columns(self)) + + @cached_property + def ds(self) -> pd.DataFrame: + """ + The dataset (`ds`) subtable. + """ + return self.nested["ds"] + + @cached_property + def ent(self) -> pd.DataFrame: + """ + The entities (`ent`) subtable. + """ + return self.nested["ent"] + + @cached_property + def meta(self) -> pd.DataFrame: + """ + The metadata (`meta`) subtable. + """ + return self.nested["meta"] + + @cached_property + def finfo(self) -> pd.DataFrame: + """ + The file info (`finfo`) subtable. + """ + return self.nested["finfo"] + + @cached_property + def flat(self) -> pd.DataFrame: + """ + A copy of the table with subtable prefixes e.g. `ds__`, `ent__` removed. + """ + return self.nested.droplevel(0, axis=1) + + @cached_property + def flat_meta(self) -> pd.DataFrame: + """ + A table of flattened JSON metadata where each metadata field is converted to its + own column, with nested levels separated by `'.'`. + + See also: + + - [`pd.json_normalize`](https://pandas.pydata.org/docs/reference/api/pandas.json_normalize.html): + more general function in pandas. + """ + # Need to replace None with empty dict for max_level=0 to work. + metadata = pd.json_normalize( + self["meta__json"].map(lambda v: v or {}), max_level=0 + ) + metadata.index = self.index + return metadata + + @cached_property + def files(self) -> List["BIDSFile"]: + """ + Convert the table to a list of structured `BIDSFile`s. 
+ """ + + def to_dict(val): + if pd.isna(val): + return {} + return dict(val) + + return [ + BIDSFile( + dataset=row["ds"]["dataset"], + root=Path(row["ds"]["dataset_path"]), + path=Path(row["finfo"]["file_path"]), + entities=BIDSEntities.from_dict(row["ent"]), + metadata=to_dict(row["meta"]["json"]), + ) + for _, row in self.nested.iterrows() + ] + + @cached_property + def datatypes(self) -> List[str]: + """ + Get all datatypes present in the table. + """ + return self.ent["datatype"].unique().tolist() + + @cached_property + def modalities(self) -> List[str]: + """ + Get all modalities present in the table. + """ + # TODO: Is this the right way to get the modality + return self.ent["mod"].unique().tolist() + + @cached_property + def subjects(self) -> List[str]: + """ + Get all unique subjects in the table. + """ + return self.ent["sub"].unique().tolist() + + @cached_property + def entities(self) -> List[str]: + """ + Get all entity keys with at least one non-NA entry in the table. + """ + entities = self.ent.dropna(axis=1, how="all").columns.tolist() + special = set(BIDSEntities.special()) + return [key for key in entities if key not in special] + + def filter( + self, + key: str, + value: Optional[Any] = None, + *, + items: Optional[Iterable[Any]] = None, + contains: Optional[str] = None, + regex: Optional[str] = None, + func: Optional[Callable[[Any], bool]] = None, + ) -> "BIDSTable": + """ + Filter the rows of the table. + + Args: + key: Column to filter. Can be a metadata field, BIDS entity name, or any + unprefixed column label in the `flat` table. + value: Keep rows with this exact value. + items: Keep rows whose value is in `items`. + contains: Keep rows whose value contains `contains` (string only). + regex: Keep rows whose value matches `regex` (string only). + func: Apply an arbitrary function and keep values that evaluate to `True`. + + Returns: + A filtered BIDS table. 
+ + Example:: + filtered = ( + tab + .filter("dataset", "ds001") + .filter("sub", items=["04", "06"]) + .filter("RepetitionTime", 2.0) + ) + """ + # NOTE: Should be careful about reinventing a new style of query API. There are + # some obvious things this can't do: + # - comparison operators <, >, <=, >= + # - negation + # - combining filters with 'or' instead of 'and' + # At the bottom of this rabbit hole are more general query interfaces like those + # already implemented in pandas, duckdb, polars. The goal should be not to + # create a new one, but to make the 95% of use cases as easy as possible, and + # empower users to interact with the underlying table using their more powerful + # tool of choice if necessary. + if sum(k is not None for k in [value, items, contains, regex, func]) != 1: + raise ValueError( + "Exactly one of value, items, contains, regex, or func must not be None" + ) + + try: + # JSON metadata field + # NOTE: Assuming all JSON metadata fields are uppercase. + if key[:1].isupper(): + col = self.flat_meta[key] + # Long name entity + elif key in ENTITY_NAMES_TO_KEYS: + col = self.ent[ENTITY_NAMES_TO_KEYS[key]] + # Any other unprefixed column + else: + col = self.flat[key] + except KeyError as exc: + raise KeyError( + f"Invalid key {key}; expected a valid BIDS entity or metadata field " + "present in the dataset" + ) from exc + + if value is not None: + mask = col == value + elif items is not None: + mask = col.isin(items) + elif contains is not None: + mask = col.str.contains(contains) + elif regex is not None: + mask = col.str.match(regex) + else: + mask = col.apply(func) + mask = mask.fillna(False).astype(bool) + + return self.loc[mask] + + def filter_multi(self, **filters) -> "BIDSTable": + """ + Apply multiple filters to the table sequentially. + + Args: + filters: A mapping of column labels to queries. Each query can either be + a single value for an exact equality check or a `dict` for a more + complex query, e.g. 
`{"items": [1, 2, 3]}`, that's passed through to + `filter`. + + Returns: + A filtered BIDS table. + + Example:: + filtered = tab.filter_multi( + dataset="ds001", + sub={"items": ["04", "06"]}, + RepetitionTime=2.5, + ) + """ + tab = self.copy(deep=False) + + for k, query in filters.items(): + if not isinstance(query, dict): + query = {"value": query} + tab = tab.filter(k, **query) + return tab + + def sort_entities( + self, by: Union[str, List[str]], inplace: bool = False + ) -> "BIDSTable": + """ + Sort the values of the table by entities. + + Args: + by: label or list of labels. Can be `"dataset"` or a short or long entity + name. + inplace: sort the table in place + + Returns: + A sorted BIDS table. + """ + if isinstance(by, str): + by = [by] + + # TODO: what about sorting by other columns, e.g. file_path? + def add_prefix(k: str): + if k == "dataset": + k = f"ds__{k}" + elif k in ENTITY_NAMES_TO_KEYS: + k = f"ent__{ENTITY_NAMES_TO_KEYS[k]}" + else: + k = f"ent__{k}" + return k + + by = [add_prefix(k) for k in by] + out = self.sort_values(by, inplace=inplace) + if inplace: + return self + return out + + @classmethod + def from_df(cls, df: pd.DataFrame) -> "BIDSTable": + """ + Create a BIDS table from a pandas `DataFrame` generated by `bids2table`. + """ + return cls(df) + + @classmethod + def from_parquet(cls, path: Path) -> "BIDSTable": + """ + Read a BIDS table from a Parquet file or dataset directory generated by + `bids2table`. + """ + df = pd.read_parquet(path) + return cls.from_df(df) + + @property + def _constructor(self): + # Makes sure that dataframe slices return a subclass instance + # https://pandas.pydata.org/docs/development/extending.html#override-constructor-properties + return BIDSTable + + +@dataclass +class BIDSFile: + """ + A structured BIDS file.
+ """ + + dataset: str + """Parent BIDS dataset.""" + root: Path + """Path to parent dataset.""" + path: Path + """File path.""" + entities: BIDSEntities + """BIDS entities.""" + metadata: Dict[str, Any] = field(default_factory=dict) + """BIDS JSON metadata.""" + + @property + def relative_path(self) -> Path: + """ + The file path relative to the dataset root. + """ + return self.path.relative_to(self.root) + + +def flat_to_multi_columns(df: pd.DataFrame, sep: str = "__") -> pd.DataFrame: + """ + Convert a flat column index to a MultiIndex by splitting on `sep`. + """ + # Do nothing if already a MultiIndex + if isinstance(df.columns, pd.MultiIndex): + return df + + # Do nothing for empty df + # TODO: It would probably be better if the header was initialized even if there are + # no records. + if len(df.columns) == 0: + return df + + split_columns = [col.split(sep) for col in df.columns] + num_levels = max(map(len, split_columns)) + + def _pad_col(col): + return tuple((num_levels - len(col)) * [None] + col) + + df = df.copy(deep=False) + df.columns = pd.MultiIndex.from_tuples(map(_pad_col, split_columns)) + return df + + +def multi_to_flat_columns(df: pd.DataFrame, sep: str = "__") -> pd.DataFrame: + """ + Convert a column MultiIndex to a flat index by joining on `sep`. + """ + # Do nothing if already flat + if not isinstance(df.columns, pd.MultiIndex): + return df + + columns = df.columns.to_flat_index() + join_columns = [sep.join(col) for col in columns] + + df = df.copy(deep=False) + df.columns = pd.Index(join_columns) + return df + + +def join_bids_path( + row: Union[pd.Series, Dict[str, Any]], + prefix: Optional[Union[str, Path]] = None, + valid_only: bool = True, +) -> Path: + """ + Reconstruct a BIDS path from a table row or entities dict. + + Args: + row: row from a `BIDSTable` or `BIDSTable.ent` subtable. + prefix: output file prefix path. + valid_only: only include valid BIDS entities. 
+ + Example:: + + tab = BIDSTable.from_parquet("dataset/index.b2t") + paths = tab.apply(join_bids_path, axis=1) + """ + # Filter in case input is a row from the raw dataframe and not the entities group. + row = _filter_row(row, group="ent") + entities = BIDSEntities.from_dict(row, valid_only=valid_only) + path = entities.to_path(prefix=prefix, valid_only=valid_only) + return path + + +def _filter_row( + row: Union[pd.Series, Dict[str, Any]], group: str, sep: str = "__" +) -> Dict[str, Any]: + """ + Filter a table row for fields from a particular group. Keeps all fields without a + group prefix. + """ + prefix = f"{group}{sep}" + return { + _removeprefix(k, prefix): v + for k, v in row.items() + if k.startswith(prefix) or sep not in k + } + + +def _removeprefix(s: str, prefix: str) -> str: + # same as str.removeprefix(), which was introduced in 3.9 + if s.startswith(prefix): + s = s[len(prefix) :] + return s diff --git a/example/example.ipynb b/example/example.ipynb index 5544473..d269258 100644 --- a/example/example.ipynb +++ b/example/example.ipynb @@ -6,12 +6,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Required to load columns with extension types, e.g. 
json type\n", - "import elbow.dtypes\n", "import pandas as pd\n", "\n", - "from bids2table import bids2table\n", - "from bids2table.helpers import flat_to_multi_columns" + "from bids2table import bids2table" ] }, { @@ -45,15 +42,22 @@ "name": "stderr", "output_type": "stream", "text": [ - "176it [00:01, 92.58it/s, tot=176, good=176, rec=2245, err=0] \n", - "197it [00:02, 90.73it/s, tot=197, good=197, rec=2663, err=0] \n", - "203it [00:02, 93.22it/s, tot=203, good=203, rec=2630, err=0]\n", - "204it [00:02, 92.01it/s, tot=204, good=204, rec=2728, err=0] \n" + "193it [00:00, 318.09it/s, tot=193, good=193, rec=2386, err=0]\n", + "172it [00:00, 288.23it/s, tot=172, good=172, rec=2240, err=0]\n", + "202it [00:00, 287.97it/s, tot=202, good=202, rec=2828, err=0]\n", + "213it [00:00, 300.22it/s, tot=213, good=213, rec=2812, err=0]\n" ] } ], "source": [ - "df = bids2table(root=\"../bids-examples\", persistent=True, overwrite=True, workers=4)" + "bids2table(\n", + " root=\"../bids-examples\",\n", + " index_path=\"bids-examples.b2t\",\n", + " persistent=True,\n", + " overwrite=True,\n", + " workers=4,\n", + " return_table=False,\n", + ")" ] }, { @@ -77,16 +81,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "total 1992\n", - "-rw------- 1 clane staff 248K Aug 4 12:34 part-20230804123438-0003-of-0004.parquet\n", - "-rw------- 1 clane staff 247K Aug 4 12:34 part-20230804123438-0002-of-0004.parquet\n", - "-rw------- 1 clane staff 175K Aug 4 12:34 part-20230804123438-0001-of-0004.parquet\n", - "-rw------- 1 clane staff 161K Aug 4 12:34 part-20230804123438-0000-of-0004.parquet\n" + "total 1608\n", + "-rw------- 1 clane staff 197K Aug 9 06:17 part-20230809061750-0002-of-0004.parquet\n", + "-rw------- 1 clane staff 240K Aug 9 06:17 part-20230809061750-0003-of-0004.parquet\n", + "-rw------- 1 clane staff 167K Aug 9 06:17 part-20230809061750-0000-of-0004.parquet\n", + "-rw------- 1 clane staff 194K Aug 9 06:17 part-20230809061750-0001-of-0004.parquet\n" ] } ], "source": [ - 
"! ls -lht ../bids-examples/index.b2t" + "! ls -lht bids-examples.b2t/" ] }, { @@ -103,7 +107,7 @@ "- dataset (`ds__*`): dataset name, relative dataset path, and the JSON dataset description\n", "- entities (`ent__*`): All [valid BIDS entities](https://bids-specification.readthedocs.io/en/stable/appendices/entities.html) plus an `extra_entities` dict containing any extra entities\n", "- metadata (`meta__*`): BIDS JSON metadata\n", - "- file (`file__*`): General file metadata including the full file path and last modified time" + "- file info (`finfo__*`): General file info including the full file path and last modified time" ] }, { @@ -111,6 +115,13 @@ "execution_count": 5, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape: (10266, 40)\n" + ] + }, { "data": { "text/html": [ @@ -169,19 +180,19 @@ " ent__ext\n", " ent__extra_entities\n", " meta__json\n", - " file__file_path\n", - " file__link_target\n", - " file__mod_time\n", + " finfo__file_path\n", + " finfo__link_target\n", + " finfo__mod_time\n", " \n", " \n", " \n", " \n", " 0\n", - " asl002\n", - " raw\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...\n", - " Sub103\n", + " ds002\n", + " None\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'BIDSVersion': '1.0.0', 'License': 'This data...\n", + " 15\n", " None\n", " None\n", " None\n", @@ -213,18 +224,18 @@ " T1w\n", " .nii.gz\n", " {}\n", - " {'Manufacturer': 'Philips', 'ManufacturersMode...\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", " None\n", - " 1.687883e+09\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " None\n", + " 1.691420e+09\n", " \n", " \n", " 1\n", - " asl002\n", - " raw\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...\n", - " Sub103\n", + " ds002\n", + " None\n", + " 
/Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'BIDSVersion': '1.0.0', 'License': 'This data...\n", + " 15\n", " None\n", " None\n", " None\n", @@ -252,32 +263,32 @@ " None\n", " None\n", " None\n", - " perf\n", - " m0scan\n", + " anat\n", + " inplaneT2\n", " .nii.gz\n", " {}\n", - " {'Manufacturer': 'Philips', 'ManufacturersMode...\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", " None\n", - " 1.687883e+09\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " None\n", + " 1.691420e+09\n", " \n", " \n", " 2\n", - " asl002\n", - " raw\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...\n", - " Sub103\n", + " ds002\n", " None\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'BIDSVersion': '1.0.0', 'License': 'This data...\n", + " 15\n", " None\n", " None\n", + " probabilisticclassification\n", " None\n", " None\n", " None\n", " None\n", " None\n", " None\n", - " NaN\n", + " 1.0\n", " None\n", " NaN\n", " NaN\n", @@ -295,74 +306,74 @@ " None\n", " None\n", " None\n", - " perf\n", - " asl\n", + " func\n", + " bold\n", " .nii.gz\n", " {}\n", - " {'Manufacturer': 'Philips', 'ManufacturersMode...\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", + " {'RepetitionTime': 2.0, 'TaskName': 'probabili...\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", " None\n", - " 1.687883e+09\n", + " 1.691420e+09\n", " \n", " \n", "\n", "" ], "text/plain": [ - " ds__dataset ds__dataset_type \n", - "0 asl002 raw \\\n", - "1 asl002 raw \n", - "2 asl002 raw \n", - "\n", - " ds__dataset_path \n", - "0 /Users/clane/Projects/ScalableQC/code/bids2tab... \\\n", - "1 /Users/clane/Projects/ScalableQC/code/bids2tab... \n", - "2 /Users/clane/Projects/ScalableQC/code/bids2tab... \n", - "\n", - " ds__dataset_description ent__sub ent__ses \n", - "0 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... 
Sub103 None \\\n", - "1 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None \n", - "2 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None \n", - "\n", - " ent__sample ent__task ent__acq ent__ce ent__trc ent__stain ent__rec \n", - "0 None None None None None None None \\\n", - "1 None None None None None None None \n", - "2 None None None None None None None \n", - "\n", - " ent__dir ent__run ent__mod ent__echo ent__flip ent__inv ent__mt \n", - "0 None NaN None NaN NaN NaN None \\\n", - "1 None NaN None NaN NaN NaN None \n", - "2 None NaN None NaN NaN NaN None \n", - "\n", - " ent__part ent__proc ent__hemi ent__space ent__split ent__recording \n", - "0 None None None None NaN None \\\n", - "1 None None None None NaN None \n", - "2 None None None None NaN None \n", - "\n", - " ent__chunk ent__atlas ent__res ent__den ent__label ent__desc ent__datatype \n", - "0 NaN None None None None None anat \\\n", - "1 NaN None None None None None perf \n", - "2 NaN None None None None None perf \n", - "\n", - " ent__suffix ent__ext ent__extra_entities \n", - "0 T1w .nii.gz {} \\\n", - "1 m0scan .nii.gz {} \n", - "2 asl .nii.gz {} \n", - "\n", - " meta__json \n", - "0 {'Manufacturer': 'Philips', 'ManufacturersMode... \\\n", - "1 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", - "2 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", - "\n", - " file__file_path file__link_target \n", - "0 /Users/clane/Projects/ScalableQC/code/bids2tab... None \\\n", - "1 /Users/clane/Projects/ScalableQC/code/bids2tab... None \n", - "2 /Users/clane/Projects/ScalableQC/code/bids2tab... None \n", - "\n", - " file__mod_time \n", - "0 1.687883e+09 \n", - "1 1.687883e+09 \n", - "2 1.687883e+09 " + " ds__dataset ds__dataset_type \\\n", + "0 ds002 None \n", + "1 ds002 None \n", + "2 ds002 None \n", + "\n", + " ds__dataset_path \\\n", + "0 /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "1 /Users/clane/Projects/B2T/bids2table/bids-exam... 
\n", + "2 /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "\n", + " ds__dataset_description ent__sub ent__ses \\\n", + "0 {'BIDSVersion': '1.0.0', 'License': 'This data... 15 None \n", + "1 {'BIDSVersion': '1.0.0', 'License': 'This data... 15 None \n", + "2 {'BIDSVersion': '1.0.0', 'License': 'This data... 15 None \n", + "\n", + " ent__sample ent__task ent__acq ent__ce ent__trc \\\n", + "0 None None None None None \n", + "1 None None None None None \n", + "2 None probabilisticclassification None None None \n", + "\n", + " ent__stain ent__rec ent__dir ent__run ent__mod ent__echo ent__flip \\\n", + "0 None None None NaN None NaN NaN \n", + "1 None None None NaN None NaN NaN \n", + "2 None None None 1.0 None NaN NaN \n", + "\n", + " ent__inv ent__mt ent__part ent__proc ent__hemi ent__space ent__split \\\n", + "0 NaN None None None None None NaN \n", + "1 NaN None None None None None NaN \n", + "2 NaN None None None None None NaN \n", + "\n", + " ent__recording ent__chunk ent__atlas ent__res ent__den ent__label \\\n", + "0 None NaN None None None None \n", + "1 None NaN None None None None \n", + "2 None NaN None None None None \n", + "\n", + " ent__desc ent__datatype ent__suffix ent__ext ent__extra_entities \\\n", + "0 None anat T1w .nii.gz {} \n", + "1 None anat inplaneT2 .nii.gz {} \n", + "2 None func bold .nii.gz {} \n", + "\n", + " meta__json \\\n", + "0 None \n", + "1 None \n", + "2 {'RepetitionTime': 2.0, 'TaskName': 'probabili... \n", + "\n", + " finfo__file_path finfo__link_target \\\n", + "0 /Users/clane/Projects/B2T/bids2table/bids-exam... None \n", + "1 /Users/clane/Projects/B2T/bids2table/bids-exam... None \n", + "2 /Users/clane/Projects/B2T/bids2table/bids-exam... 
None \n", + "\n", + " finfo__mod_time \n", + "0 1.691420e+09 \n", + "1 1.691420e+09 \n", + "2 1.691420e+09 " ] }, "execution_count": 5, @@ -371,22 +382,182 @@ } ], "source": [ - "df = bids2table(\"../bids-examples\")\n", - "\n", - "df.head(3)" + "tab = bids2table(\"../bids-examples\", index_path=\"bids-examples.b2t\")\n", + "print(\"Shape:\", tab.shape)\n", + "tab.head(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can also split the columns into a pandas [`MultiIndex`](https://pandas.pydata.org/docs/user_guide/advanced.html) using the helper function `flat_to_multi_columns()`." + "Now let's look at the column types.\n", + "\n", + "> TODO: not all types are preserved when converting parquet to pandas. In particular, strings are mapped to objects and ints with None to float with NaN." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ds__datasetds__dataset_typeds__dataset_pathds__dataset_descriptionent__subent__sesent__sampleent__taskent__acqent__ceent__trcent__stainent__recent__dirent__runent__modent__echoent__flipent__invent__mtent__partent__procent__hemient__spaceent__splitent__recordingent__chunkent__atlasent__resent__denent__labelent__descent__datatypeent__suffixent__extent__extra_entitiesmeta__jsonfinfo__file_pathfinfo__link_targetfinfo__mod_time
0objectobjectobjectjsonobjectobjectobjectobjectobjectobjectobjectobjectobjectobjectfloat64objectfloat64float64float64objectobjectobjectobjectobjectfloat64objectfloat64objectobjectobjectobjectobjectobjectobjectobjectjsonjsonobjectobjectfloat64
\n", + "
" + ], + "text/plain": [ + " ds__dataset ds__dataset_type ds__dataset_path ds__dataset_description \\\n", + "0 object object object json \n", + "\n", + " ent__sub ent__ses ent__sample ent__task ent__acq ent__ce ent__trc \\\n", + "0 object object object object object object object \n", + "\n", + " ent__stain ent__rec ent__dir ent__run ent__mod ent__echo ent__flip ent__inv \\\n", + "0 object object object float64 object float64 float64 float64 \n", + "\n", + " ent__mt ent__part ent__proc ent__hemi ent__space ent__split ent__recording \\\n", + "0 object object object object object float64 object \n", + "\n", + " ent__chunk ent__atlas ent__res ent__den ent__label ent__desc ent__datatype \\\n", + "0 float64 object object object object object object \n", + "\n", + " ent__suffix ent__ext ent__extra_entities meta__json finfo__file_path \\\n", + "0 object object json json object \n", + "\n", + " finfo__link_target finfo__mod_time \n", + "0 object float64 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "schema = pd.DataFrame.from_records([tab.dtypes.to_dict()])\n", + "schema" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataframe returned by `bids2table` is in fact a special `BIDSTable` subclass of `pandas.DataFrame` with a few extra helper methods.\n", + "\n", + "- You can view the table with [nested columns](https://pandas.pydata.org/docs/user_guide/advanced.html#hierarchical-indexing-multiindex)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, "outputs": [ { "data": { @@ -412,7 +583,7 @@ " ds\n", " ent\n", " meta\n", - " file\n", + " finfo\n", " \n", " \n", " \n", @@ -461,11 +632,11 @@ " \n", " \n", " 0\n", - " asl002\n", - " raw\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...\n", - " Sub103\n", + " ds002\n", + " None\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", 
+ " {'BIDSVersion': '1.0.0', 'License': 'This data...\n", + " 15\n", " None\n", " None\n", " None\n", @@ -497,18 +668,18 @@ " T1w\n", " .nii.gz\n", " {}\n", - " {'Manufacturer': 'Philips', 'ManufacturersMode...\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", " None\n", - " 1.687883e+09\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " None\n", + " 1.691420e+09\n", " \n", " \n", " 1\n", - " asl002\n", - " raw\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...\n", - " Sub103\n", + " ds002\n", + " None\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'BIDSVersion': '1.0.0', 'License': 'This data...\n", + " 15\n", " None\n", " None\n", " None\n", @@ -536,32 +707,32 @@ " None\n", " None\n", " None\n", - " perf\n", - " m0scan\n", + " anat\n", + " inplaneT2\n", " .nii.gz\n", " {}\n", - " {'Manufacturer': 'Philips', 'ManufacturersMode...\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", " None\n", - " 1.687883e+09\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " None\n", + " 1.691420e+09\n", " \n", " \n", " 2\n", - " asl002\n", - " raw\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...\n", - " Sub103\n", + " ds002\n", " None\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'BIDSVersion': '1.0.0', 'License': 'This data...\n", + " 15\n", " None\n", " None\n", + " probabilisticclassification\n", " None\n", " None\n", " None\n", " None\n", " None\n", " None\n", - " NaN\n", + " 1.0\n", " None\n", " NaN\n", " NaN\n", @@ -579,84 +750,82 @@ " None\n", " None\n", " None\n", - " perf\n", - " asl\n", + " func\n", + " bold\n", " .nii.gz\n", " {}\n", - " {'Manufacturer': 'Philips', 'ManufacturersMode...\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", + " {'RepetitionTime': 2.0, 'TaskName': 'probabili...\n", + " 
/Users/clane/Projects/B2T/bids2table/bids-exam...\n", " None\n", - " 1.687883e+09\n", + " 1.691420e+09\n", " \n", " \n", "\n", "" ], "text/plain": [ - " ds \n", + " ds \\\n", " dataset dataset_type dataset_path \n", - "0 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \\\n", - "1 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \n", - "2 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \n", - "\n", - " ent \n", - " dataset_description sub ses sample \n", - "0 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None None \\\n", - "1 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None None \n", - "2 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None None \n", - "\n", - " \n", - " task acq ce trc stain rec dir run mod echo flip inv mt \n", - "0 None None None None None None None NaN None NaN NaN NaN None \\\n", - "1 None None None None None None None NaN None NaN NaN NaN None \n", - "2 None None None None None None None NaN None NaN NaN NaN None \n", - "\n", - " \n", - " part proc hemi space split recording chunk atlas res den label desc \n", - "0 None None None None NaN None NaN None None None None None \\\n", - "1 None None None None NaN None NaN None None None None None \n", - "2 None None None None NaN None NaN None None None None None \n", - "\n", - " \n", - " datatype suffix ext extra_entities \n", - "0 anat T1w .nii.gz {} \\\n", - "1 perf m0scan .nii.gz {} \n", - "2 perf asl .nii.gz {} \n", - "\n", - " meta \n", + "0 ds002 None /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "1 ds002 None /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "2 ds002 None /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "\n", + " ent \\\n", + " dataset_description sub ses sample \n", + "0 {'BIDSVersion': '1.0.0', 'License': 'This data... 15 None None \n", + "1 {'BIDSVersion': '1.0.0', 'License': 'This data... 15 None None \n", + "2 {'BIDSVersion': '1.0.0', 'License': 'This data... 
15 None None \n", + "\n", + " \\\n", + " task acq ce trc stain rec dir run mod \n", + "0 None None None None None None None NaN None \n", + "1 None None None None None None None NaN None \n", + "2 probabilisticclassification None None None None None None 1.0 None \n", + "\n", + " \\\n", + " echo flip inv mt part proc hemi space split recording chunk atlas \n", + "0 NaN NaN NaN None None None None None NaN None NaN None \n", + "1 NaN NaN NaN None None None None None NaN None NaN None \n", + "2 NaN NaN NaN None None None None None NaN None NaN None \n", + "\n", + " \\\n", + " res den label desc datatype suffix ext extra_entities \n", + "0 None None None None anat T1w .nii.gz {} \n", + "1 None None None None anat inplaneT2 .nii.gz {} \n", + "2 None None None None func bold .nii.gz {} \n", + "\n", + " meta \\\n", " json \n", - "0 {'Manufacturer': 'Philips', 'ManufacturersMode... \\\n", - "1 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", - "2 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", + "0 None \n", + "1 None \n", + "2 {'RepetitionTime': 2.0, 'TaskName': 'probabili... \n", "\n", - " file \n", + " finfo \n", " file_path link_target mod_time \n", - "0 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 \n", - "1 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 \n", - "2 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 " + "0 /Users/clane/Projects/B2T/bids2table/bids-exam... None 1.691420e+09 \n", + "1 /Users/clane/Projects/B2T/bids2table/bids-exam... None 1.691420e+09 \n", + "2 /Users/clane/Projects/B2T/bids2table/bids-exam... None 1.691420e+09 " ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_multi = flat_to_multi_columns(df)\n", - "\n", - "df_multi.head(3)" + "tab.nested.head(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This makes it easy to extract just a single group of columns, e.g. 
the BIDS entities." + "- You can easily access the dataset (`ds`), entities (`ent`), metadata (`meta`), or file info (`finfo`) subtables." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -717,7 +886,7 @@ " \n", " \n", " 0\n", - " Sub103\n", + " 15\n", " None\n", " None\n", " None\n", @@ -752,7 +921,7 @@ " \n", " \n", " 1\n", - " Sub103\n", + " 15\n", " None\n", " None\n", " None\n", @@ -780,24 +949,24 @@ " None\n", " None\n", " None\n", - " perf\n", - " m0scan\n", + " anat\n", + " inplaneT2\n", " .nii.gz\n", " {}\n", " \n", " \n", " 2\n", - " Sub103\n", - " None\n", + " 15\n", " None\n", " None\n", + " probabilisticclassification\n", " None\n", " None\n", " None\n", " None\n", " None\n", " None\n", - " NaN\n", + " 1.0\n", " None\n", " NaN\n", " NaN\n", @@ -815,8 +984,8 @@ " None\n", " None\n", " None\n", - " perf\n", - " asl\n", + " func\n", + " bold\n", " .nii.gz\n", " {}\n", " \n", @@ -825,43 +994,46 @@ "" ], "text/plain": [ - " sub ses sample task acq ce trc stain rec dir run mod \n", - "0 Sub103 None None None None None None None None None NaN None \\\n", - "1 Sub103 None None None None None None None None None NaN None \n", - "2 Sub103 None None None None None None None None None NaN None \n", + " sub ses sample task acq ce trc stain rec \\\n", + "0 15 None None None None None None None None \n", + "1 15 None None None None None None None None \n", + "2 15 None None probabilisticclassification None None None None None \n", "\n", - " echo flip inv mt part proc hemi space split recording chunk \n", - "0 NaN NaN NaN None None None None None NaN None NaN \\\n", - "1 NaN NaN NaN None None None None None NaN None NaN \n", - "2 NaN NaN NaN None None None None None NaN None NaN \n", + " dir run mod echo flip inv mt part proc hemi space split \\\n", + "0 None NaN None NaN NaN NaN None None None None None NaN \n", + "1 None NaN None NaN NaN NaN None None None None None NaN \n", + "2 None 1.0 None NaN NaN NaN 
None None None None None NaN \n", "\n", - " atlas res den label desc datatype suffix ext extra_entities \n", - "0 None None None None None anat T1w .nii.gz {} \n", - "1 None None None None None perf m0scan .nii.gz {} \n", - "2 None None None None None perf asl .nii.gz {} " + " recording chunk atlas res den label desc datatype suffix ext \\\n", + "0 None NaN None None None None None anat T1w .nii.gz \n", + "1 None NaN None None None None None anat inplaneT2 .nii.gz \n", + "2 None NaN None None None None None func bold .nii.gz \n", + "\n", + " extra_entities \n", + "0 {} \n", + "1 {} \n", + "2 {} " ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "entities = df_multi[\"ent\"]\n", - "\n", - "entities.head(3)" + "tab.ent.head(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can also drop the first level of the column multi-index for shorter column names." + "- You can view the full table without the group prefixes" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -930,11 +1102,11 @@ " \n", " \n", " 0\n", - " asl002\n", - " raw\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...\n", - " Sub103\n", + " ds002\n", + " None\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'BIDSVersion': '1.0.0', 'License': 'This data...\n", + " 15\n", " None\n", " None\n", " None\n", @@ -966,18 +1138,18 @@ " T1w\n", " .nii.gz\n", " {}\n", - " {'Manufacturer': 'Philips', 'ManufacturersMode...\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", " None\n", - " 1.687883e+09\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " None\n", + " 1.691420e+09\n", " \n", " \n", " 1\n", - " asl002\n", - " raw\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...\n", - " Sub103\n", + " 
ds002\n", + " None\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'BIDSVersion': '1.0.0', 'License': 'This data...\n", + " 15\n", " None\n", " None\n", " None\n", @@ -1005,32 +1177,32 @@ " None\n", " None\n", " None\n", - " perf\n", - " m0scan\n", + " anat\n", + " inplaneT2\n", " .nii.gz\n", " {}\n", - " {'Manufacturer': 'Philips', 'ManufacturersMode...\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", " None\n", - " 1.687883e+09\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " None\n", + " 1.691420e+09\n", " \n", " \n", " 2\n", - " asl002\n", - " raw\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi...\n", - " Sub103\n", + " ds002\n", " None\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'BIDSVersion': '1.0.0', 'License': 'This data...\n", + " 15\n", " None\n", " None\n", + " probabilisticclassification\n", " None\n", " None\n", " None\n", " None\n", " None\n", " None\n", - " NaN\n", + " 1.0\n", " None\n", " NaN\n", " NaN\n", @@ -1048,161 +1220,987 @@ " None\n", " None\n", " None\n", - " perf\n", - " asl\n", + " func\n", + " bold\n", " .nii.gz\n", " {}\n", - " {'Manufacturer': 'Philips', 'ManufacturersMode...\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", + " {'RepetitionTime': 2.0, 'TaskName': 'probabili...\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", " None\n", - " 1.687883e+09\n", + " 1.691420e+09\n", " \n", " \n", "\n", "" ], "text/plain": [ - " dataset dataset_type dataset_path \n", - "0 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \\\n", - "1 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \n", - "2 asl002 raw /Users/clane/Projects/ScalableQC/code/bids2tab... \n", - "\n", - " dataset_description sub ses sample \n", - "0 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None None \\\n", - "1 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... 
Sub103 None None \n", - "2 {'Name': 'ASL_Philips_PCASL_2DEPI', 'BIDSVersi... Sub103 None None \n", - "\n", - " task acq ce trc stain rec dir run mod echo flip inv mt \n", - "0 None None None None None None None NaN None NaN NaN NaN None \\\n", - "1 None None None None None None None NaN None NaN NaN NaN None \n", - "2 None None None None None None None NaN None NaN NaN NaN None \n", - "\n", - " part proc hemi space split recording chunk atlas res den label \n", - "0 None None None None NaN None NaN None None None None \\\n", - "1 None None None None NaN None NaN None None None None \n", - "2 None None None None NaN None NaN None None None None \n", - "\n", - " desc datatype suffix ext extra_entities \n", - "0 None anat T1w .nii.gz {} \\\n", - "1 None perf m0scan .nii.gz {} \n", - "2 None perf asl .nii.gz {} \n", + " dataset dataset_type dataset_path \\\n", + "0 ds002 None /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "1 ds002 None /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "2 ds002 None /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "\n", + " dataset_description sub ses sample \\\n", + "0 {'BIDSVersion': '1.0.0', 'License': 'This data... 15 None None \n", + "1 {'BIDSVersion': '1.0.0', 'License': 'This data... 15 None None \n", + "2 {'BIDSVersion': '1.0.0', 'License': 'This data... 
15 None None \n", + "\n", + " task acq ce trc stain rec dir run mod \\\n", + "0 None None None None None None None NaN None \n", + "1 None None None None None None None NaN None \n", + "2 probabilisticclassification None None None None None None 1.0 None \n", + "\n", + " echo flip inv mt part proc hemi space split recording chunk \\\n", + "0 NaN NaN NaN None None None None None NaN None NaN \n", + "1 NaN NaN NaN None None None None None NaN None NaN \n", + "2 NaN NaN NaN None None None None None NaN None NaN \n", + "\n", + " atlas res den label desc datatype suffix ext extra_entities \\\n", + "0 None None None None None anat T1w .nii.gz {} \n", + "1 None None None None None anat inplaneT2 .nii.gz {} \n", + "2 None None None None None func bold .nii.gz {} \n", + "\n", + " json \\\n", + "0 None \n", + "1 None \n", + "2 {'RepetitionTime': 2.0, 'TaskName': 'probabili... \n", + "\n", + " file_path link_target mod_time \n", + "0 /Users/clane/Projects/B2T/bids2table/bids-exam... None 1.691420e+09 \n", + "1 /Users/clane/Projects/B2T/bids2table/bids-exam... None 1.691420e+09 \n", + "2 /Users/clane/Projects/B2T/bids2table/bids-exam... None 1.691420e+09 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tab.flat.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- You can access flattened JSON metadata." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RepetitionTimeTaskNameInstitutionAddressInstitutionNameInstitutionalDepartmentNamePowerLineFrequencyManufacturersModelNameEEGReferenceManufacturerEEGChannelCountMiscChannelCountRecordingTypeRecordingDurationSamplingFrequencyEOGChannelCountECGChannelCountEMGChannelCountSoftwareFiltersonsetdurationtrial_typeresponse_timesamplevalueSoftwareVersionsMagneticFieldStrengthReceiveCoilNameReceiveCoilActiveElementsScanningSequenceSequenceVariantScanOptionsSequenceNamePulseSequenceDetailsParallelReductionFactorInPlanePartialFourierEchoTimeInversionTimeDwellTimeFlipAngleMRAcquisitionTypePulseSequenceTypePhaseEncodingDirectionEffectiveEchoSpacingTotalReadoutTimeRepetitionTimePreparationIntendedForAcquisitionVoxelsizeNumberShotsArterialSpinLabelingTypePostLabelingDelay...SpoilingRFPhaseIncrementMagneticFliedStrengthPulseSequenceSpoilingStateSpoilingTypeSpoilingGradientMomentSpoilingGradientDurationa_comp_cor_179a_comp_cor_180a_comp_cor_181a_comp_cor_182a_comp_cor_183t_comp_cor_06a_comp_cor_184a_comp_cor_185a_comp_cor_186a_comp_cor_187a_comp_cor_188aroma_motion_45aroma_motion_46aroma_motion_47aroma_motion_48aroma_motion_49aroma_motion_50aroma_motion_51aroma_motion_52aroma_motion_53dropped_568dropped_569dropped_570dropped_571PharmaceuticalNamePharmaceuticalDoseAmountPharmaceuticalDoseAmountUnitsPharmaceuticalDoseRegimenPharmaceuticalDoseTimeInfusionRadioactivityInfusionStartInfusionSpeedInfusionSpeedUnitsInjectedVolumeTracerInjectionTypeInjectionEndAttenuationCorrectionMethodReferenceNonLinearGradientCorrectionPhaseOversamplingPercentSamplingInjectedMassPerWeightInjectedMassPerWeightUnitsElectricalStimulationParameters
0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22.0probabilistic classificationNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

3 rows × 1177 columns

\n", + "
" + ], + "text/plain": [ + " RepetitionTime TaskName InstitutionAddress \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 2.0 probabilistic classification NaN \n", + "\n", + " InstitutionName InstitutionalDepartmentName PowerLineFrequency \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "\n", + " ManufacturersModelName EEGReference Manufacturer EEGChannelCount \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "\n", + " MiscChannelCount RecordingType RecordingDuration SamplingFrequency \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "\n", + " EOGChannelCount ECGChannelCount EMGChannelCount SoftwareFilters onset \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "\n", + " duration trial_type response_time sample value SoftwareVersions \\\n", + "0 NaN NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN NaN \n", + "\n", + " MagneticFieldStrength ReceiveCoilName ReceiveCoilActiveElements \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "\n", + " ScanningSequence SequenceVariant ScanOptions SequenceName \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "\n", + " PulseSequenceDetails ParallelReductionFactorInPlane PartialFourier \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "\n", + " EchoTime InversionTime DwellTime FlipAngle MRAcquisitionType \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "\n", + " PulseSequenceType PhaseEncodingDirection EffectiveEchoSpacing \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "\n", + " TotalReadoutTime RepetitionTimePreparation IntendedFor \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "\n", + " AcquisitionVoxelsize NumberShots ArterialSpinLabelingType \\\n", + 
"0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "\n", + " PostLabelingDelay ... SpoilingRFPhaseIncrement MagneticFliedStrength \\\n", + "0 NaN ... NaN NaN \n", + "1 NaN ... NaN NaN \n", + "2 NaN ... NaN NaN \n", + "\n", + " PulseSequence SpoilingState SpoilingType SpoilingGradientMoment \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "\n", + " SpoilingGradientDuration a_comp_cor_179 a_comp_cor_180 a_comp_cor_181 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "\n", + " a_comp_cor_182 a_comp_cor_183 t_comp_cor_06 a_comp_cor_184 a_comp_cor_185 \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "\n", + " a_comp_cor_186 a_comp_cor_187 a_comp_cor_188 aroma_motion_45 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "\n", + " aroma_motion_46 aroma_motion_47 aroma_motion_48 aroma_motion_49 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "\n", + " aroma_motion_50 aroma_motion_51 aroma_motion_52 aroma_motion_53 dropped_568 \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "\n", + " dropped_569 dropped_570 dropped_571 PharmaceuticalName \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "\n", + " PharmaceuticalDoseAmount PharmaceuticalDoseAmountUnits \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "\n", + " PharmaceuticalDoseRegimen PharmaceuticalDoseTime InfusionRadioactivity \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "\n", + " InfusionStart InfusionSpeed InfusionSpeedUnits InjectedVolume \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "\n", + " TracerInjectionType InjectionEnd AttenuationCorrectionMethodReference \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + 
"\n", + " NonLinearGradientCorrection PhaseOversampling PercentSampling \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "\n", + " InjectedMassPerWeight InjectedMassPerWeightUnits \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "\n", + " ElectricalStimulationParameters \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "\n", + "[3 rows x 1177 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tab.flat_meta.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- You can still slice the table and get back a `BIDSTable`" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (500, 40)\n" + ] + } + ], + "source": [ + "subtab = tab.iloc[:500]\n", + "print(type(subtab), subtab.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sorting rows\n", + "\n", + "By default the rows are in arbitrary order. We can sort the rows by dataset, subject, session, task, and run." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ds__datasetds__dataset_typeds__dataset_pathds__dataset_descriptionent__subent__sesent__sampleent__taskent__acqent__ceent__trcent__stainent__recent__dirent__runent__modent__echoent__flipent__invent__mtent__partent__procent__hemient__spaceent__splitent__recordingent__chunkent__atlasent__resent__denent__labelent__descent__datatypeent__suffixent__extent__extra_entitiesmeta__jsonfinfo__file_pathfinfo__link_targetfinfo__mod_time
37887t_trtNone/Users/clane/Projects/B2T/bids2table/bids-exam...{'BIDSVersion': '1.8.0', 'Name': '7t_trt'}011NonerestfullbrainNoneNoneNoneNoneNone1.0NoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNonefuncbold.nii.gz{}{'CogAtlasID': 'https://www.cognitiveatlas.org.../Users/clane/Projects/B2T/bids2table/bids-exam...None1.691420e+09
37907t_trtNone/Users/clane/Projects/B2T/bids2table/bids-exam...{'BIDSVersion': '1.8.0', 'Name': '7t_trt'}011NonerestfullbrainNoneNoneNoneNoneNone1.0NoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNonefuncphysio.tsv.gz{}{'StartTime': 0, 'SamplingFrequency': 100, 'Co.../Users/clane/Projects/B2T/bids2table/bids-exam...None1.691420e+09
37867t_trtNone/Users/clane/Projects/B2T/bids2table/bids-exam...{'BIDSVersion': '1.8.0', 'Name': '7t_trt'}011NonerestfullbrainNoneNoneNoneNoneNone2.0NoneNaNNaNNaNNoneNoneNoneNoneNoneNaNNoneNaNNoneNoneNoneNoneNonefuncbold.nii.gz{}{'CogAtlasID': 'https://www.cognitiveatlas.org.../Users/clane/Projects/B2T/bids2table/bids-exam...None1.691420e+09
\n", + "
" + ], + "text/plain": [ + " ds__dataset ds__dataset_type \\\n", + "3788 7t_trt None \n", + "3790 7t_trt None \n", + "3786 7t_trt None \n", "\n", - " json \n", - "0 {'Manufacturer': 'Philips', 'ManufacturersMode... \\\n", - "1 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", - "2 {'Manufacturer': 'Philips', 'ManufacturersMode... \n", + " ds__dataset_path \\\n", + "3788 /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "3790 /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "3786 /Users/clane/Projects/B2T/bids2table/bids-exam... \n", "\n", - " file_path link_target mod_time \n", - "0 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 \n", - "1 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 \n", - "2 /Users/clane/Projects/ScalableQC/code/bids2tab... None 1.687883e+09 " + " ds__dataset_description ent__sub ent__ses \\\n", + "3788 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 \n", + "3790 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 \n", + "3786 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 \n", + "\n", + " ent__sample ent__task ent__acq ent__ce ent__trc ent__stain ent__rec \\\n", + "3788 None rest fullbrain None None None None \n", + "3790 None rest fullbrain None None None None \n", + "3786 None rest fullbrain None None None None \n", + "\n", + " ent__dir ent__run ent__mod ent__echo ent__flip ent__inv ent__mt \\\n", + "3788 None 1.0 None NaN NaN NaN None \n", + "3790 None 1.0 None NaN NaN NaN None \n", + "3786 None 2.0 None NaN NaN NaN None \n", + "\n", + " ent__part ent__proc ent__hemi ent__space ent__split ent__recording \\\n", + "3788 None None None None NaN None \n", + "3790 None None None None NaN None \n", + "3786 None None None None NaN None \n", + "\n", + " ent__chunk ent__atlas ent__res ent__den ent__label ent__desc \\\n", + "3788 NaN None None None None None \n", + "3790 NaN None None None None None \n", + "3786 NaN None None None None None \n", + "\n", + " ent__datatype ent__suffix ent__ext 
ent__extra_entities \\\n", + "3788 func bold .nii.gz {} \n", + "3790 func physio .tsv.gz {} \n", + "3786 func bold .nii.gz {} \n", + "\n", + " meta__json \\\n", + "3788 {'CogAtlasID': 'https://www.cognitiveatlas.org... \n", + "3790 {'StartTime': 0, 'SamplingFrequency': 100, 'Co... \n", + "3786 {'CogAtlasID': 'https://www.cognitiveatlas.org... \n", + "\n", + " finfo__file_path finfo__link_target \\\n", + "3788 /Users/clane/Projects/B2T/bids2table/bids-exam... None \n", + "3790 /Users/clane/Projects/B2T/bids2table/bids-exam... None \n", + "3786 /Users/clane/Projects/B2T/bids2table/bids-exam... None \n", + "\n", + " finfo__mod_time \n", + "3788 1.691420e+09 \n", + "3790 1.691420e+09 \n", + "3786 1.691420e+09 " ] }, - "execution_count": 8, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_drop = df_multi.droplevel(0, axis=1)\n", - "\n", - "df_drop.head(3)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Columns and types\n", - "\n", - "Now let's look at the column names and pandas types.\n", - "\n", - "> TODO: not all types are preserved when converting parquet to pandas. In particular, strings are mapped to objects and ints with `None` to float with `NaN`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Shape: (10266, 40)\n", - "Columns:\n", - " ds__dataset: object\n", - " ds__dataset_type: object\n", - " ds__dataset_path: object\n", - " ds__dataset_description: json\n", - " ent__sub: object\n", - " ent__ses: object\n", - " ent__sample: object\n", - " ent__task: object\n", - " ent__acq: object\n", - " ent__ce: object\n", - " ent__trc: object\n", - " ent__stain: object\n", - " ent__rec: object\n", - " ent__dir: object\n", - " ent__run: float64\n", - " ent__mod: object\n", - " ent__echo: float64\n", - " ent__flip: float64\n", - " ent__inv: float64\n", - " ent__mt: object\n", - " ent__part: object\n", - " ent__proc: object\n", - " ent__hemi: object\n", - " ent__space: object\n", - " ent__split: float64\n", - " ent__recording: object\n", - " ent__chunk: float64\n", - " ent__atlas: object\n", - " ent__res: object\n", - " ent__den: object\n", - " ent__label: object\n", - " ent__desc: object\n", - " ent__datatype: object\n", - " ent__suffix: object\n", - " ent__ext: object\n", - " ent__extra_entities: json\n", - " meta__json: json\n", - " file__file_path: object\n", - " file__link_target: object\n", - " file__mod_time: float64\n" - ] - } - ], - "source": [ - "print(f\"Shape: \", df.shape)\n", - "print(\n", - " \"Columns:\\n\"\n", - " + \"\\n\".join(f\" {name}: {typ}\" for name, typ in df.dtypes.to_dict().items())\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Most columns are numeric (`float64`) or string (`object`) type. However there are some columns (`ds__dataset_description`, `ent__extra_entities`, `meta__json`) which use the elbow extension `json` type for arbitrary nested dicts." 
+ "sort_tab = tab.sort_entities([\"dataset\", \"sub\", \"ses\", \"task\", \"run\"])\n", + "sort_tab.head(3)" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "### Sorting rows\n", + "### Filtering\n", "\n", - "By default the rows are in arbitrary order. We can sort the values in place." + "In addition to all the usual pandas slicing operations, `BIDSTable`s also support higher-level filtering operations inspired by the PyBIDS `BIDSLayout.get` method and the pandas `Series.filter` method." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -1263,72 +2261,115 @@ " ent__ext\n", " ent__extra_entities\n", " meta__json\n", - " file__file_path\n", - " file__link_target\n", - " file__mod_time\n", + " finfo__file_path\n", + " finfo__link_target\n", + " finfo__mod_time\n", " \n", " \n", " \n", " \n", - " 9284\n", - " 7t_trt\n", - " None\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'BIDSVersion': '1.8.0', 'Name': '7t_trt'}\n", - " 01\n", - " 1\n", + " 1554\n", + " synthetic/derivatives/fmriprep\n", + " derivative\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'Name': 'fMRIPrep - fMRI PREProcessing workfl...\n", + " 04\n", + " 02\n", " None\n", " rest\n", - " fullbrain\n", " None\n", " None\n", " None\n", " None\n", " None\n", - " 1.0\n", " None\n", " NaN\n", + " None\n", + " NaN\n", + " NaN\n", + " NaN\n", + " None\n", + " None\n", + " None\n", + " None\n", + " T1w\n", " NaN\n", + " None\n", " NaN\n", " None\n", " None\n", " None\n", " None\n", + " preproc\n", + " func\n", + " bold\n", + " .nii\n", + " {}\n", + " {'Sources': ['bids:raw:sub-04/ses-02/sub-04_se...\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " None\n", + " 1.691420e+09\n", + " \n", + " \n", + " 1567\n", + " synthetic/derivatives/fmriprep\n", + " derivative\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'Name': 'fMRIPrep - fMRI 
PREProcessing workfl...\n", + " 04\n", + " 02\n", + " None\n", + " rest\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " NaN\n", " None\n", " NaN\n", + " NaN\n", + " NaN\n", + " None\n", + " None\n", " None\n", + " None\n", + " MNI152NLin2009cAsym\n", " NaN\n", " None\n", + " NaN\n", " None\n", " None\n", " None\n", " None\n", + " preproc\n", " func\n", " bold\n", - " .nii.gz\n", + " .nii\n", " {}\n", - " {'CogAtlasID': 'https://www.cognitiveatlas.org...\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", + " {'Sources': ['bids:raw:sub-04/ses-02/sub-04_se...\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", " None\n", - " 1.687883e+09\n", + " 1.691420e+09\n", " \n", " \n", - " 9286\n", - " 7t_trt\n", - " None\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'BIDSVersion': '1.8.0', 'Name': '7t_trt'}\n", + " 1576\n", + " synthetic/derivatives/fmriprep\n", + " derivative\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'Name': 'fMRIPrep - fMRI PREProcessing workfl...\n", + " 04\n", " 01\n", - " 1\n", " None\n", " rest\n", - " fullbrain\n", " None\n", " None\n", " None\n", " None\n", " None\n", - " 1.0\n", + " None\n", + " NaN\n", " None\n", " NaN\n", " NaN\n", @@ -1337,41 +2378,127 @@ " None\n", " None\n", " None\n", + " T1w\n", + " NaN\n", + " None\n", + " NaN\n", + " None\n", + " None\n", + " None\n", + " None\n", + " preproc\n", + " func\n", + " bold\n", + " .nii\n", + " {}\n", + " {'Sources': ['bids:raw:sub-04/ses-01/sub-04_se...\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " None\n", + " 1.691420e+09\n", + " \n", + " \n", + " 1579\n", + " synthetic/derivatives/fmriprep\n", + " derivative\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'Name': 'fMRIPrep - fMRI PREProcessing workfl...\n", + " 04\n", + " 01\n", + " None\n", + " rest\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " NaN\n", " None\n", 
" NaN\n", + " NaN\n", + " NaN\n", + " None\n", + " None\n", " None\n", + " None\n", + " MNI152NLin2009cAsym\n", " NaN\n", " None\n", + " NaN\n", " None\n", " None\n", " None\n", " None\n", + " preproc\n", " func\n", - " physio\n", - " .tsv.gz\n", + " bold\n", + " .nii\n", " {}\n", - " {'StartTime': 0, 'SamplingFrequency': 100, 'Co...\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", + " {'Sources': ['bids:raw:sub-04/ses-01/sub-04_se...\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", " None\n", - " 1.687883e+09\n", + " 1.691420e+09\n", " \n", " \n", - " 9282\n", - " 7t_trt\n", + " 4222\n", + " synthetic\n", + " raw\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'Name': 'Synthetic dataset for inclusion in B...\n", + " 04\n", + " 02\n", " None\n", - " /Users/clane/Projects/ScalableQC/code/bids2tab...\n", - " {'BIDSVersion': '1.8.0', 'Name': '7t_trt'}\n", + " rest\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " NaN\n", + " None\n", + " NaN\n", + " NaN\n", + " NaN\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " NaN\n", + " None\n", + " NaN\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " func\n", + " bold\n", + " .nii\n", + " {}\n", + " {'TaskName': 'Rest', 'RepetitionTime': 2.5}\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " None\n", + " 1.691420e+09\n", + " \n", + " \n", + " 4235\n", + " synthetic\n", + " raw\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", + " {'Name': 'Synthetic dataset for inclusion in B...\n", + " 04\n", " 01\n", - " 1\n", " None\n", " rest\n", - " fullbrain\n", " None\n", " None\n", " None\n", " None\n", " None\n", - " 2.0\n", + " None\n", + " NaN\n", " None\n", " NaN\n", " NaN\n", @@ -1391,85 +2518,211 @@ " None\n", " func\n", " bold\n", - " .nii.gz\n", + " .nii\n", " {}\n", - " {'CogAtlasID': 'https://www.cognitiveatlas.org...\n", - " 
/Users/clane/Projects/ScalableQC/code/bids2tab...\n", + " {'TaskName': 'Rest', 'RepetitionTime': 2.5}\n", + " /Users/clane/Projects/B2T/bids2table/bids-exam...\n", " None\n", - " 1.687883e+09\n", + " 1.691420e+09\n", " \n", " \n", "\n", "" ], "text/plain": [ - " ds__dataset ds__dataset_type \n", - "9284 7t_trt None \\\n", - "9286 7t_trt None \n", - "9282 7t_trt None \n", - "\n", - " ds__dataset_path \n", - "9284 /Users/clane/Projects/ScalableQC/code/bids2tab... \\\n", - "9286 /Users/clane/Projects/ScalableQC/code/bids2tab... \n", - "9282 /Users/clane/Projects/ScalableQC/code/bids2tab... \n", - "\n", - " ds__dataset_description ent__sub ent__ses \n", - "9284 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 \\\n", - "9286 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 \n", - "9282 {'BIDSVersion': '1.8.0', 'Name': '7t_trt'} 01 1 \n", - "\n", - " ent__sample ent__task ent__acq ent__ce ent__trc ent__stain ent__rec \n", - "9284 None rest fullbrain None None None None \\\n", - "9286 None rest fullbrain None None None None \n", - "9282 None rest fullbrain None None None None \n", - "\n", - " ent__dir ent__run ent__mod ent__echo ent__flip ent__inv ent__mt \n", - "9284 None 1.0 None NaN NaN NaN None \\\n", - "9286 None 1.0 None NaN NaN NaN None \n", - "9282 None 2.0 None NaN NaN NaN None \n", - "\n", - " ent__part ent__proc ent__hemi ent__space ent__split ent__recording \n", - "9284 None None None None NaN None \\\n", - "9286 None None None None NaN None \n", - "9282 None None None None NaN None \n", - "\n", - " ent__chunk ent__atlas ent__res ent__den ent__label ent__desc \n", - "9284 NaN None None None None None \\\n", - "9286 NaN None None None None None \n", - "9282 NaN None None None None None \n", - "\n", - " ent__datatype ent__suffix ent__ext ent__extra_entities \n", - "9284 func bold .nii.gz {} \\\n", - "9286 func physio .tsv.gz {} \n", - "9282 func bold .nii.gz {} \n", - "\n", - " meta__json \n", - "9284 {'CogAtlasID': 'https://www.cognitiveatlas.org... 
\\\n", - "9286 {'StartTime': 0, 'SamplingFrequency': 100, 'Co... \n", - "9282 {'CogAtlasID': 'https://www.cognitiveatlas.org... \n", - "\n", - " file__file_path file__link_target \n", - "9284 /Users/clane/Projects/ScalableQC/code/bids2tab... None \\\n", - "9286 /Users/clane/Projects/ScalableQC/code/bids2tab... None \n", - "9282 /Users/clane/Projects/ScalableQC/code/bids2tab... None \n", - "\n", - " file__mod_time \n", - "9284 1.687883e+09 \n", - "9286 1.687883e+09 \n", - "9282 1.687883e+09 " + " ds__dataset ds__dataset_type \\\n", + "1554 synthetic/derivatives/fmriprep derivative \n", + "1567 synthetic/derivatives/fmriprep derivative \n", + "1576 synthetic/derivatives/fmriprep derivative \n", + "1579 synthetic/derivatives/fmriprep derivative \n", + "4222 synthetic raw \n", + "4235 synthetic raw \n", + "\n", + " ds__dataset_path \\\n", + "1554 /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "1567 /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "1576 /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "1579 /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "4222 /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "4235 /Users/clane/Projects/B2T/bids2table/bids-exam... \n", + "\n", + " ds__dataset_description ent__sub ent__ses \\\n", + "1554 {'Name': 'fMRIPrep - fMRI PREProcessing workfl... 04 02 \n", + "1567 {'Name': 'fMRIPrep - fMRI PREProcessing workfl... 04 02 \n", + "1576 {'Name': 'fMRIPrep - fMRI PREProcessing workfl... 04 01 \n", + "1579 {'Name': 'fMRIPrep - fMRI PREProcessing workfl... 04 01 \n", + "4222 {'Name': 'Synthetic dataset for inclusion in B... 04 02 \n", + "4235 {'Name': 'Synthetic dataset for inclusion in B... 
04 01 \n", + "\n", + " ent__sample ent__task ent__acq ent__ce ent__trc ent__stain ent__rec \\\n", + "1554 None rest None None None None None \n", + "1567 None rest None None None None None \n", + "1576 None rest None None None None None \n", + "1579 None rest None None None None None \n", + "4222 None rest None None None None None \n", + "4235 None rest None None None None None \n", + "\n", + " ent__dir ent__run ent__mod ent__echo ent__flip ent__inv ent__mt \\\n", + "1554 None NaN None NaN NaN NaN None \n", + "1567 None NaN None NaN NaN NaN None \n", + "1576 None NaN None NaN NaN NaN None \n", + "1579 None NaN None NaN NaN NaN None \n", + "4222 None NaN None NaN NaN NaN None \n", + "4235 None NaN None NaN NaN NaN None \n", + "\n", + " ent__part ent__proc ent__hemi ent__space ent__split \\\n", + "1554 None None None T1w NaN \n", + "1567 None None None MNI152NLin2009cAsym NaN \n", + "1576 None None None T1w NaN \n", + "1579 None None None MNI152NLin2009cAsym NaN \n", + "4222 None None None None NaN \n", + "4235 None None None None NaN \n", + "\n", + " ent__recording ent__chunk ent__atlas ent__res ent__den ent__label \\\n", + "1554 None NaN None None None None \n", + "1567 None NaN None None None None \n", + "1576 None NaN None None None None \n", + "1579 None NaN None None None None \n", + "4222 None NaN None None None None \n", + "4235 None NaN None None None None \n", + "\n", + " ent__desc ent__datatype ent__suffix ent__ext ent__extra_entities \\\n", + "1554 preproc func bold .nii {} \n", + "1567 preproc func bold .nii {} \n", + "1576 preproc func bold .nii {} \n", + "1579 preproc func bold .nii {} \n", + "4222 None func bold .nii {} \n", + "4235 None func bold .nii {} \n", + "\n", + " meta__json \\\n", + "1554 {'Sources': ['bids:raw:sub-04/ses-02/sub-04_se... \n", + "1567 {'Sources': ['bids:raw:sub-04/ses-02/sub-04_se... \n", + "1576 {'Sources': ['bids:raw:sub-04/ses-01/sub-04_se... \n", + "1579 {'Sources': ['bids:raw:sub-04/ses-01/sub-04_se... 
\n", + "4222 {'TaskName': 'Rest', 'RepetitionTime': 2.5} \n", + "4235 {'TaskName': 'Rest', 'RepetitionTime': 2.5} \n", + "\n", + " finfo__file_path finfo__link_target \\\n", + "1554 /Users/clane/Projects/B2T/bids2table/bids-exam... None \n", + "1567 /Users/clane/Projects/B2T/bids2table/bids-exam... None \n", + "1576 /Users/clane/Projects/B2T/bids2table/bids-exam... None \n", + "1579 /Users/clane/Projects/B2T/bids2table/bids-exam... None \n", + "4222 /Users/clane/Projects/B2T/bids2table/bids-exam... None \n", + "4235 /Users/clane/Projects/B2T/bids2table/bids-exam... None \n", + "\n", + " finfo__mod_time \n", + "1554 1.691420e+09 \n", + "1567 1.691420e+09 \n", + "1576 1.691420e+09 \n", + "1579 1.691420e+09 \n", + "4222 1.691420e+09 \n", + "4235 1.691420e+09 " ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sort_cols = [\"ds__dataset\"] + [f\"ent__{k}\" for k in [\"sub\", \"ses\", \"task\", \"run\"]]\n", + "filtered = (\n", + " tab\n", + " .filter(\"task\", contains=\"rest\")\n", + " .filter(\"sub\", items=[\"04\", \"08\"])\n", + " .filter(\"RepetitionTime\", 2.5)\n", + ")\n", + "\n", + "filtered" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also apply multiple filters at the same time with `filter_multi`." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filters equal: True\n" + ] + } + ], + "source": [ + "filtered2 = tab.filter_multi(\n", + " task={\"contains\": \"rest\"},\n", + " sub={\"items\": [\"04\", \"08\"]},\n", + " RepetitionTime=2.5,\n", + ")\n", + "\n", + "print(\"Filters equal:\", filtered.equals(filtered2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Getting files\n", + "\n", + "The rows of the table can also be converted to a list of structured `BIDSFile`s." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First file: BIDSFile(dataset='synthetic/derivatives/fmriprep', root=PosixPath('/Users/clane/Projects/B2T/bids2table/bids-examples/synthetic/derivatives/fmriprep'), path=PosixPath('/Users/clane/Projects/B2T/bids2table/bids-examples/synthetic/derivatives/fmriprep/sub-04/ses-02/func/sub-04_ses-02_task-rest_space-T1w_desc-preproc_bold.nii'), entities=BIDSEntities(sub='04', ses='02', sample=None, task='rest', acq=None, ce=None, trc=None, stain=None, rec=None, dir=None, run=None, mod=None, echo=None, flip=None, inv=None, mt=None, part=None, proc=None, hemi=None, space='T1w', split=None, recording=None, chunk=None, atlas=None, res=None, den=None, label=None, desc='preproc', datatype='func', suffix='bold', ext='.nii', extra_entities={}), metadata={'Sources': ['bids:raw:sub-04/ses-02/sub-04_ses-02_task-rest_bold.nii'], 'TaskName': 'Rest', 'RepetitionTime': 2.5})\n" + ] + } + ], + "source": [ + "files = filtered.files\n", "\n", - "df.sort_values(sort_cols, inplace=True)\n", + "print(\"First file:\", files[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File paths:\n", + "sub-04/ses-02/func/sub-04_ses-02_task-rest_space-T1w_desc-preproc_bold.nii\n", + "sub-04/ses-02/func/sub-04_ses-02_task-rest_space-MNI152NLin2009cAsym_desc-preproc_bold.nii\n", + "sub-04/ses-01/func/sub-04_ses-01_task-rest_space-T1w_desc-preproc_bold.nii\n", + "sub-04/ses-01/func/sub-04_ses-01_task-rest_space-MNI152NLin2009cAsym_desc-preproc_bold.nii\n", + "sub-04/ses-02/func/sub-04_ses-02_task-rest_bold.nii\n", + "sub-04/ses-01/func/sub-04_ses-01_task-rest_bold.nii\n" + ] + } + ], + "source": [ + "print(\"File paths:\\n\", \"\\n\".join([str(f.relative_path) for f in files]), sep=\"\")" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## Analyze the table\n", "\n", - "df.head(3)" + "Next we'll do some more detailed analysis of the table to demonstrate some of the more advanced manipulation that's possible." ] }, { @@ -1484,7 +2737,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1525,13 +2778,13 @@ "dtype: int64" ] }, - "execution_count": 11, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ent_counts = entities.count(axis=0)\n", + "ent_counts = tab.ent.count(axis=0)\n", "ent_counts" ] }, @@ -1545,7 +2798,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1559,7 +2812,7 @@ "dtype: int64" ] }, - "execution_count": 12, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1580,7 +2833,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -2163,13 +3416,13 @@ "synthetic/derivatives/fmriprep 150 60" ] }, - "execution_count": 13, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_drop.groupby(\"dataset\").agg(\n", + "tab.flat.groupby(\"dataset\").agg(\n", " {\"file_path\": \"count\", \"json\": \"count\"}\n", ")" ] @@ -2186,7 +3439,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -2233,22 +3486,22 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "176it [00:01, 89.47it/s, tot=176, good=176, rec=2245, err=0] \n", - "203it [00:02, 90.18it/s, tot=203, good=203, rec=2630, err=0]]\n", - "197it [00:02, 87.01it/s, tot=197, good=197, rec=2663, err=0]\n", - "204it [00:02, 88.66it/s, tot=204, good=204, rec=2728, err=0] \n" + "172it [00:00, 327.64it/s, tot=172, good=172, rec=2240, err=0]\n", + "193it 
[00:00, 349.64it/s, tot=193, good=193, rec=2386, err=0]\n", + "213it [00:00, 333.32it/s, tot=213, good=213, rec=2812, err=0]\n", + "202it [00:00, 315.47it/s, tot=202, good=202, rec=2828, err=0]\n" ] } ], "source": [ - "! bids2table -x -w 4 ../bids-examples/" + "! bids2table -o bids-examples.b2t -x -w 4 ../bids-examples/" ] }, { @@ -2262,10 +3515,10 @@ "```bash\n", "# Can't use --overwrite together with --worker_id\n", "# Remove in advance\n", - "rm -r ../bids-examples/index.b2t\n", + "rm -r bids-examples.b2t\n", "\n", "for worker_id in {0..3}; do\n", - " bids2table --worker_id $worker_id --workers 4 ../bids-examples/ &\n", + " bids2table -o bids-examples.b2t --worker_id $worker_id --workers 4 ../bids-examples/ &\n", "done\n", "```" ] @@ -2287,7 +3540,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.8.17" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index bbe40ae..5cf3eb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ authors = [ {name = "Connor Lane", email = "connor.lane858@gmail.com"}, ] readme = "README.md" -requires-python = ">=3.7" +requires-python = ">=3.8" license = {text = "MIT License"} classifiers = [ "Development Status :: 3 - Alpha", diff --git a/tests/test_bids2table.py b/tests/test_bids2table.py index 2f18536..8ef1607 100644 --- a/tests/test_bids2table.py +++ b/tests/test_bids2table.py @@ -28,23 +28,23 @@ def empty_dataset(tmp_path: Path) -> Path: @pytest.mark.parametrize("persistent", [False, True]) def test_bids2table(tmp_path: Path, persistent: bool): root = BIDS_EXAMPLES / "ds001" - output = tmp_path / "index.b2t" + index_path = tmp_path / "index.b2t" - df = bids2table(root=root, persistent=persistent, output=output) - assert df.shape == (128, 40) + tab = bids2table(root=root, persistent=persistent, index_path=index_path) + assert tab.shape == (128, 40) # Reload from cache - df2 = bids2table(root=root, persistent=persistent, 
output=output) - assert df.equals(df2) + tab2 = bids2table(root=root, persistent=persistent, index_path=index_path) + assert tab.equals(tab2) def test_bids2table_empty(empty_dataset: Path): - df = bids2table(root=empty_dataset, persistent=True) - assert df.shape == (0, 0) + tab = bids2table(root=empty_dataset, persistent=True) + assert tab.shape == (0, 0) # Reload from cache - df2 = bids2table(root=empty_dataset) - assert df.equals(df2) + tab2 = bids2table(root=empty_dataset) + assert tab.equals(tab2) def test_bids2table_nonexist(tmp_path: Path): diff --git a/tests/test_extractors/test_entities.py b/tests/test_entities.py similarity index 97% rename from tests/test_extractors/test_entities.py rename to tests/test_entities.py index 9105fa4..5bd94c0 100644 --- a/tests/test_extractors/test_entities.py +++ b/tests/test_entities.py @@ -6,7 +6,7 @@ import pytest from pytest import FixtureRequest -from bids2table.extractors.entities import BIDSEntities, parse_bids_entities +from bids2table.entities import BIDSEntities, parse_bids_entities EXAMPLES = ( ( diff --git a/tests/test_helpers.py b/tests/test_helpers.py deleted file mode 100644 index 8515650..0000000 --- a/tests/test_helpers.py +++ /dev/null @@ -1,116 +0,0 @@ -from typing import Any, Dict, Optional - -import pandas as pd -import pytest - -from bids2table.helpers import ( - flat_to_multi_columns, - join_bids_path, - multi_to_flat_columns, -) - - -@pytest.mark.parametrize( - "row,prefix,valid_only,expected", - [ - ( - {"sub": "A01", "ses": "b", "run": 2, "suffix": "bold", "ext": ".json"}, - None, - False, - "sub-A01/ses-b/sub-A01_ses-b_run-2_bold.json", - ), - ( - {"sub": "A01", "ses": "b", "run": 2, "suffix": "bold", "ext": ".json"}, - "dataset", - False, - "dataset/sub-A01/ses-b/sub-A01_ses-b_run-2_bold.json", - ), - ( - { - "sub": "A01", - "ses": "b", - "run": 2, - "extraKey": 1, - "suffix": "bold", - "ext": ".json", - }, - None, - False, - "sub-A01/ses-b/sub-A01_ses-b_run-2_extraKey-1_bold.json", - ), - ( - { - 
"sub": "A01", - "ses": "b", - "run": 2, - "extraKey": 1, - "suffix": "bold", - "ext": ".json", - }, - None, - True, - "sub-A01/ses-b/sub-A01_ses-b_run-2_bold.json", - ), - ( - { - "entities": { - "sub": "A01", - "ses": "b", - "run": 2, - "suffix": "bold", - "ext": ".json", - } - }, - None, - False, - "sub-A01/ses-b/sub-A01_ses-b_run-2_bold.json", - ), - ( - pd.concat( - [ - pd.Series( - { - "sub": "A01", - "ses": "b", - "run": 2, - "suffix": "bold", - "ext": ".json", - } - ) - ], - keys=["entities"], - ), - None, - False, - "sub-A01/ses-b/sub-A01_ses-b_run-2_bold.json", - ), - ], -) -def test_join_bids_path( - row: Dict[str, Any], prefix: Optional[str], valid_only: bool, expected: str -): - path = join_bids_path(row, prefix=prefix, valid_only=valid_only) - assert str(path) == expected - - -@pytest.mark.parametrize("sep", ["__", "."]) -def test_flat_to_multi_columns(sep: str): - df = pd.DataFrame( - { - f"A{sep}a": [1, 2, 3], - f"A{sep}b": ["a", "b", "c"], - f"B{sep}a": [4, 5, 6], - f"B{sep}b": ["d", "e", "f"], - } - ) - multi_index = pd.MultiIndex.from_product([["A", "B"], ["a", "b"]]) - - df_multi = flat_to_multi_columns(df, sep=sep) - assert df_multi.columns.equals(multi_index) - - df_flat = multi_to_flat_columns(df_multi, sep=sep) - assert df_flat.equals(df) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/tests/test_table.py b/tests/test_table.py new file mode 100644 index 0000000..df2d60d --- /dev/null +++ b/tests/test_table.py @@ -0,0 +1,207 @@ +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import pandas as pd +import pytest + +from bids2table import bids2table +from bids2table.table import ( + BIDSTable, + flat_to_multi_columns, + join_bids_path, + multi_to_flat_columns, +) + +BIDS_EXAMPLES = Path(__file__).parent.parent / "bids-examples" + + +@pytest.fixture(scope="module") +def tab() -> BIDSTable: + return bids2table(BIDS_EXAMPLES / "ds001") + + +def test_table(tab: BIDSTable): + assert tab.shape == 
(128, 40) + + groups = tab.nested.columns.unique(0).tolist() + assert groups == ["ds", "ent", "meta", "finfo"] + + assert tab.ds.shape == (128, 4) + assert tab.ent.shape == (128, 32) + assert tab.meta.shape == (128, 1) + assert tab.flat_meta.shape == (128, 2) + assert tab.finfo.shape == (128, 3) + + subtab: BIDSTable = tab.iloc[:10] + assert subtab.ds.shape == (10, 4) + + assert len(tab.datatypes) == 2 + assert len(tab.modalities) == 1 + assert len(tab.subjects) == 16 + assert len(tab.entities) == 3 + + +def test_table_files(tab: BIDSTable): + files = tab.files + assert len(files) == 128 + + file = files[0] + assert file.path.exists() + assert (file.root / file.relative_path).exists() + assert file.metadata == {} + + +@pytest.mark.parametrize( + "key,filter,expected_count", + [ + ("sub", {"value": "04"}, 8), + ("subject", {"value": "04"}, 8), + ("RepetitionTime", {"value": 2.0}, 48), + ("subject", {"value": "04"}, 8), + ("sub", {"items": ["04", "06"]}, 16), + ("sub", {"contains": "4"}, 16), + ("sub", {"regex": "0[456]"}, 24), + ("RepetitionTime", {"func": lambda v: v <= 2.0}, 48), + ], +) +def test_table_filter( + tab: BIDSTable, key: str, filter: Dict[str, Any], expected_count: int +): + subtab = tab.filter(key, **filter) + assert isinstance(subtab, BIDSTable) + assert len(subtab) == expected_count + + +@pytest.mark.parametrize( + "filters,expected_count", + [ + ( + { + "dataset": "ds001", + "sub": {"items": ["04", "06"]}, + "RepetitionTime": {"value": 2.0}, + }, + 6, + ) + ], +) +def test_table_filter_multi( + tab: BIDSTable, filters: Dict[str, Any], expected_count: int +): + subtab = tab.filter_multi(**filters) + assert isinstance(subtab, BIDSTable) + assert len(subtab) == expected_count + + +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize("by", ["sub", ["subject"], ["dataset", "sub"]]) +def test_table_sort_entities(tab: BIDSTable, by: Union[str, List[str]], inplace: bool): + tab = tab.copy() + sort_tab = tab.sort_entities(by, 
inplace=inplace) + assert isinstance(sort_tab, BIDSTable) + assert len(sort_tab) == len(tab) + assert sort_tab.subjects == sorted(tab.subjects) + + +@pytest.mark.parametrize("sep", ["__", "."]) +def test_flat_to_multi_columns(sep: str): + df = pd.DataFrame( + { + f"A{sep}a": [1, 2, 3], + f"A{sep}b": ["a", "b", "c"], + f"B{sep}a": [4, 5, 6], + f"B{sep}b": ["d", "e", "f"], + } + ) + multi_index = pd.MultiIndex.from_product([["A", "B"], ["a", "b"]]) + + df_multi = flat_to_multi_columns(df, sep=sep) + assert df_multi.columns.equals(multi_index) + + df_flat = multi_to_flat_columns(df_multi, sep=sep) + assert df_flat.equals(df) + + +@pytest.mark.parametrize( + "entities,prefix,valid_only,expected", + [ + ( + {"sub": "A01", "ses": "b", "run": 2, "suffix": "bold", "ext": ".json"}, + None, + False, + "sub-A01/ses-b/sub-A01_ses-b_run-2_bold.json", + ), + ( + {"sub": "A01", "ses": "b", "run": 2, "suffix": "bold", "ext": ".json"}, + "dataset", + False, + "dataset/sub-A01/ses-b/sub-A01_ses-b_run-2_bold.json", + ), + ( + { + "sub": "A01", + "ses": "b", + "run": 2, + "extraKey": 1, + "suffix": "bold", + "ext": ".json", + }, + None, + False, + "sub-A01/ses-b/sub-A01_ses-b_run-2_extraKey-1_bold.json", + ), + ( + { + "sub": "A01", + "ses": "b", + "run": 2, + "extraKey": 1, + "suffix": "bold", + "ext": ".json", + }, + None, + True, + "sub-A01/ses-b/sub-A01_ses-b_run-2_bold.json", + ), + ( + pd.Series( + { + "sub": "A01", + "ses": "b", + "run": 2, + "suffix": "bold", + "ext": ".json", + } + ), + None, + False, + "sub-A01/ses-b/sub-A01_ses-b_run-2_bold.json", + ), + # Make sure it still works if applied to the raw df + ( + { + "ds__dataset": "ds001", + "ent__sub": "A01", + "ent__ses": "b", + "ent__run": 2, + "suffix": "bold", + "ext": ".json", + }, + None, + False, + "sub-A01/ses-b/sub-A01_ses-b_run-2_bold.json", + ), + ], +) +def test_join_bids_path( + entities: Union[Dict[str, Any], pd.Series], + prefix: Optional[str], + valid_only: bool, + expected: str, +): + path = 
join_bids_path(entities, prefix=prefix, valid_only=valid_only) + assert str(path) == expected + + +if __name__ == "__main__": + pytest.main([__file__])