Skip to content

Commit

Permalink
Higher-level API (#17)
Browse files Browse the repository at this point in the history
Add a higher level `BIDSTable` interface inspired by the [proposed PyBIDS API redesign](bids-standard/pybids#989).

* Move entities module one level up

* Move `join_bids_path()` helper into `entities`

* Add `BIDSTable` subclass of `DataFrame`

Add `BIDSTable` subclass of `DataFrame` with convenience methods for
accessing subtables and flattened metadata.

* Add long names to entities field metadata

* Add table `filter()` method

Add `BIDSTable.filter()` which filters rows according to a condition
applied to a single column. The supported conditions follow
`pandas.Series.filter()`.

* Add `files` property returning list of `BIDSFile`s

Also change `file` column group to `finfo` to try to limit possible confusion.

* Update example and bug fixes

Bug fixes:

- Set the index of `flat_metadata` to the parent table's index.
- Treat NA in the row mask as False in `filter()`.

* Add properties for subjects, datatypes, etc

* Add `sort_entities()`

* Upgrade required python to >=3.8

* Add `filter_multi` method and documentation

PyBIDS supports querying a layout with multiple filters specified as
keyword arguments. This is a nice interface, and is also useful for
programmatic filtering. Here we add a `filter_multi()` method to do
something similar.

* Flatten JSON metadata only to first level

* Fix mypy error

* Move some things around

* Add `func` arg to `filter()`

Add a `func` arg option to `filter` for arbitrary lambda function
filtering.

Also move `join_bids_path()` into the `table` module.

* More moving around

* Don't use `removeprefix`

* Yet more moving around

* Update example

* Add a comment on the filter api

* Change arg name output -> index_path

Having the argument be `output` in `bids2table` was confusing when you
only want to load a table.
  • Loading branch information
clane9 authored Aug 9, 2023
1 parent 0659536 commit 918fcf4
Show file tree
Hide file tree
Showing 19 changed files with 2,521 additions and 743 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ jobs:
- uses: actions/checkout@v3
with:
submodules: 'true'
- name: Set up Python 3.7
- name: Set up Python 3.8
uses: actions/setup-python@v3
with:
python-version: "3.7"
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ htmlcov
.vscode/
.env

# Local scratch
# Local data and scratch
.scratch
example/bids-examples.b2t

# Local environment
.venv
Expand Down
18 changes: 16 additions & 2 deletions bids2table/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
"""
Efficiently index large-scale BIDS datasets and derivatives
Efficiently index and query large-scale BIDS datasets and derivatives.
"""

from ._bids2table import bids2table # noqa
# Register elbow extension types
import elbow.dtypes # noqa

from ._b2t import bids2table
from ._version import __version__, __version_tuple__ # noqa
from .entities import BIDSEntities, parse_bids_entities
from .table import BIDSFile, BIDSTable, join_bids_path

__all__ = [
"bids2table",
"BIDSTable",
"BIDSFile",
"BIDSEntities",
"parse_bids_entities",
"join_bids_path",
]
4 changes: 2 additions & 2 deletions bids2table/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,12 @@ def main():
bids2table(
root=args.root,
persistent=True,
output=args.output,
index_path=args.output,
incremental=args.incremental,
overwrite=args.overwrite,
workers=args.workers,
worker_id=args.worker_id,
return_df=False,
return_table=False,
)


Expand Down
51 changes: 27 additions & 24 deletions bids2table/_bids2table.py → bids2table/_b2t.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
from pathlib import Path
from typing import Optional

import pandas as pd
from elbow.builders import build_parquet, build_table
from elbow.sources.filesystem import Crawler
from elbow.typing import StrOrPath
from elbow.utils import setup_logging

from bids2table.extractors.bids import extract_bids_subdir
from bids2table.table import BIDSTable

setup_logging()

Expand All @@ -17,21 +17,21 @@ def bids2table(
root: StrOrPath,
*,
persistent: bool = False,
output: Optional[StrOrPath] = None,
index_path: Optional[StrOrPath] = None,
incremental: bool = False,
overwrite: bool = False,
workers: Optional[int] = None,
worker_id: Optional[int] = None,
return_df: bool = True,
) -> Optional[pd.DataFrame]:
return_table: bool = True,
) -> Optional[BIDSTable]:
"""
Index a BIDS dataset directory and load as a pandas DataFrame.
Args:
root: path to BIDS dataset
persistent: whether to save index to disk as a Parquet dataset
output: path to output Parquet dataset directory if `persistent` is
`True`. Defaults to `root / "index.b2t".
index_path: path to BIDS Parquet index to generate or load. Defaults to `root /
"index.b2t"`. Index generation requires `persistent=True`.
incremental: update index incrementally with only new or changed files.
overwrite: overwrite previous index.
workers: number of parallel processes. If `None` or 1, run in the main
Expand All @@ -40,17 +40,19 @@ def bids2table(
worker_id: optional worker ID to use when scheduling parallel tasks externally.
Specifying the number of workers is required in this case. Incompatible with
overwrite.
return_df: whether to return the dataframe or just build the persistent index.
return_table: whether to return the BIDS table or just build the persistent
index.
Returns:
A DataFrame containing the BIDS Index.
A `BIDSTable` representing the indexed dataset(s), or `None` if `return_table`
is `False`.
"""
if worker_id is not None and not persistent:
raise ValueError(
"worker_id is only supported when generating a persistent index"
)
if not (return_df or persistent):
raise ValueError("persistent and return_df should not both be False")
if not (return_table or persistent):
raise ValueError("persistent and return_table should not both be False")

root = Path(root).expanduser().resolve()
if not root.is_dir():
Expand All @@ -64,37 +66,38 @@ def bids2table(
follow_links=True,
)

if output is None:
output = root / "index.b2t"
if index_path is None:
index_path = root / "index.b2t"
else:
output = Path(output).expanduser().resolve()
index_path = Path(index_path).expanduser().resolve()

stale = overwrite or incremental or worker_id is not None
if output.exists() and not stale:
if return_df:
logging.info("Loading cached index %s", output)
df = pd.read_parquet(output)
if index_path.exists() and not stale:
if return_table:
logging.info("Loading cached index %s", index_path)
tab = BIDSTable.from_parquet(index_path)
else:
logging.info("Found cached index %s; nothing to do", output)
df = None
return df
logging.info("Found cached index %s; nothing to do", index_path)
tab = None
return tab

if not persistent:
logging.info("Building index in memory")
df = build_table(source=source, extract=extract_bids_subdir)
return df
tab = BIDSTable.from_df(df)
return tab

logging.info("Building persistent Parquet index")
build_parquet(
source=source,
extract=extract_bids_subdir,
output=output,
output=index_path,
incremental=incremental,
overwrite=overwrite,
workers=workers,
worker_id=worker_id,
path_column="file__file_path",
mtime_column="file__mod_time",
)
df = pd.read_parquet(output) if return_df else None
return df
tab = BIDSTable.from_parquet(index_path) if return_table else None
return tab
109 changes: 74 additions & 35 deletions bids2table/extractors/entities.py → bids2table/entities.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
"""
A structured representation for BIDS entities.
"""

import re
import warnings
from dataclasses import asdict, dataclass, field, fields
from functools import lru_cache
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, Optional, Union
from types import MappingProxyType
from typing import Any, Callable, Dict, Iterable, List, Optional, Union

import pandas as pd
from elbow.typing import StrOrPath
Expand All @@ -26,6 +31,7 @@

def bids_field(
name: str,
display_name: str,
required: bool = False,
allowed_values: Optional[Iterable] = None,
default: Optional[Any] = None,
Expand All @@ -35,9 +41,13 @@ def bids_field(
BIDS entity dataclass field.
"""
if allowed_values is not None:
allowed_values = set(allowed_values)
allowed_values = list(allowed_values)

metadata = dict(name=name, allowed_values=allowed_values)
metadata = {
"name": name,
"display_name": display_name,
"allowed_values": allowed_values,
}
if required:
fld = field(metadata=metadata)
elif default_factory is not None:
Expand All @@ -60,48 +70,72 @@ class BIDSEntities:
https://bids-specification.readthedocs.io/en/stable/appendices/entities.html
"""

sub: str = bids_field(name="Subject", required=True)
ses: Optional[str] = bids_field(name="Session")
sample: Optional[str] = bids_field(name="Sample")
task: Optional[str] = bids_field(name="Task")
acq: Optional[str] = bids_field(name="Acquisition")
ce: Optional[str] = bids_field(name="Contrast Enhancing Agent")
trc: Optional[str] = bids_field(name="Tracer")
stain: Optional[str] = bids_field(name="Stain")
rec: Optional[str] = bids_field(name="Reconstruction")
dir: Optional[str] = bids_field(name="Phase-Encoding Direction")
run: Optional[int] = bids_field(name="Run")
mod: Optional[str] = bids_field(name="Corresponding Modality")
echo: Optional[int] = bids_field(name="Echo")
flip: Optional[int] = bids_field(name="Flip Angle")
inv: Optional[int] = bids_field(name="Inversion Time")
sub: str = bids_field(name="subject", display_name="Subject", required=True)
ses: Optional[str] = bids_field(name="session", display_name="Session")
sample: Optional[str] = bids_field(name="sample", display_name="Sample")
task: Optional[str] = bids_field(name="task", display_name="Task")
acq: Optional[str] = bids_field(name="acquisition", display_name="Acquisition")
ce: Optional[str] = bids_field(
name="ceagent", display_name="Contrast Enhancing Agent"
)
trc: Optional[str] = bids_field(name="tracer", display_name="Tracer")
stain: Optional[str] = bids_field(name="stain", display_name="Stain")
rec: Optional[str] = bids_field(
name="reconstruction", display_name="Reconstruction"
)
dir: Optional[str] = bids_field(
name="direction", display_name="Phase-Encoding Direction"
)
run: Optional[int] = bids_field(name="run", display_name="Run")
mod: Optional[str] = bids_field(
name="modality", display_name="Corresponding Modality"
)
echo: Optional[int] = bids_field(name="echo", display_name="Echo")
flip: Optional[int] = bids_field(name="flip", display_name="Flip Angle")
inv: Optional[int] = bids_field(name="inversion", display_name="Inversion Time")
mt: Optional[str] = bids_field(
name="Magnetization Transfer", allowed_values={"on", "off"}
name="mtransfer",
display_name="Magnetization Transfer",
allowed_values={"on", "off"},
)
part: Optional[str] = bids_field(
name="Part", allowed_values={"mag", "phase", "real", "imag"}
name="part",
display_name="Part",
allowed_values={"mag", "phase", "real", "imag"},
)
proc: Optional[str] = bids_field(
name="processing", display_name="Processed (on device)"
)
hemi: Optional[str] = bids_field(
name="hemisphere", display_name="Hemisphere", allowed_values={"L", "R"}
)
proc: Optional[str] = bids_field(name="Processed (on device)")
hemi: Optional[str] = bids_field(name="Hemisphere", allowed_values={"L", "R"})
space: Optional[str] = bids_field(name="Space")
split: Optional[int] = bids_field(name="Split")
recording: Optional[str] = bids_field(name="Recording")
chunk: Optional[int] = bids_field(name="Chunk")
atlas: Optional[str] = bids_field(name="Atlas")
res: Optional[str] = bids_field(name="Resolution")
den: Optional[str] = bids_field(name="Density")
label: Optional[str] = bids_field(name="Label")
desc: Optional[str] = bids_field(name="Description")
space: Optional[str] = bids_field(name="space", display_name="Space")
split: Optional[int] = bids_field(name="split", display_name="Split")
recording: Optional[str] = bids_field(name="recording", display_name="Recording")
chunk: Optional[int] = bids_field(name="chunk", display_name="Chunk")
atlas: Optional[str] = bids_field(name="atlas", display_name="Atlas")
res: Optional[str] = bids_field(name="resolution", display_name="Resolution")
den: Optional[str] = bids_field(name="density", display_name="Density")
label: Optional[str] = bids_field(name="label", display_name="Label")
desc: Optional[str] = bids_field(name="description", display_name="Description")
datatype: Optional[str] = bids_field(
name="Data type", allowed_values=BIDS_DATATYPES
name="datatype", display_name="Data type", allowed_values=BIDS_DATATYPES
)
suffix: Optional[str] = bids_field(name="Suffix")
ext: Optional[str] = bids_field(name="Extension")
suffix: Optional[str] = bids_field(name="suffix", display_name="Suffix")
ext: Optional[str] = bids_field(name="extension", display_name="Extension")
extra_entities: Optional[Dict[str, Union[str, int]]] = bids_field(
name="Extra entities",
name="extra_entities",
display_name="Extra entities",
default_factory=dict,
)

@staticmethod
def special() -> List[str]:
"""
Get list of field keys which are not standard entities.
"""
return ["datatype", "suffix", "ext", "extra_entities"]

@classmethod
def from_dict(cls, entities: Dict[str, Any], valid_only: bool = False):
"""
Expand Down Expand Up @@ -309,3 +343,8 @@ def parse_bids_entities(path: StrOrPath) -> Dict[str, str]:
if v is not None:
entities[k] = v
return entities


ENTITY_NAMES_TO_KEYS = MappingProxyType(
{f.metadata["name"]: f.name for f in fields(BIDSEntities)}
)
3 changes: 3 additions & 0 deletions bids2table/extractors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
[Elbow](https://github.com/cmi-dair/elbow) extract functions for BIDS datasets.
"""
5 changes: 3 additions & 2 deletions bids2table/extractors/bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
from elbow.record import Record, concat
from elbow.typing import StrOrPath

from bids2table.entities import BIDSEntities

from .dataset import extract_dataset
from .entities import BIDSEntities
from .metadata import extract_metadata, is_associated_sidecar


Expand All @@ -31,7 +32,7 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]:
meta_rec = extract_metadata(path)
file_rec = extract_file_meta(path)

rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "file": file_rec})
rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "finfo": file_rec})
return rec


Expand Down
12 changes: 10 additions & 2 deletions bids2table/extractors/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from elbow.typing import StrOrPath
from nibabel.filebasedimages import ImageFileError

from .entities import parse_bids_entities
from bids2table.entities import parse_bids_entities

try:
import nifti
Expand Down Expand Up @@ -41,6 +41,9 @@ def extract_image_meta(path: StrOrPath, *, backend: str = "nibabel") -> Record:
def _read_image_meta(
path: str, backend: str = "nibabel"
) -> Tuple[Dict[str, Any], np.ndarray]:
header: Dict[str, Any]
affine: np.ndarray

if backend == "nifti":
if not has_nifti:
raise ModuleNotFoundError("nifti image backend not installed")
Expand All @@ -51,7 +54,12 @@ def _read_image_meta(
affine = None
else:
img = nib.load(path)
header = dict(img.header)
if not isinstance(img, nib.Nifti1Image):
raise TypeError(
f"Found image type {type(img).__name__}; only Nifti1Image supported"
)

header = {k: v for k, v in img.header.items()}
affine = np.asarray(img.affine)

header = {k: _cast_header_value(v) for k, v in header.items()}
Expand Down
Loading

0 comments on commit 918fcf4

Please sign in to comment.