Skip to content

Commit

Permalink
Higher-level API (#17)
Browse files Browse the repository at this point in the history
Add a higher level `BIDSTable` interface inspired by the [proposed PyBIDS API redesign](bids-standard/pybids#989).

* Move entities module one level up

* Move `join_bids_path()` helper into `entities`

* Add `BIDSTable` subclass of `DataFrame`

Add `BIDSTable` subclass of `DataFrame` with convenience methods for
accessing subtables and flattened metadata.

* Add long names to entities field metadata

* Add table `filter()` method

Add `BIDSTable.filter()` which filters rows according to a condition
applied to a single column. The supported conditions follow
`pandas.Series.filter()`.

* Add `files` property returning list of `BIDSFile`s

Also change `file` column group to `finfo` to try to limit possible confusion.

* Update example and bug fixes

Bug fixes:

- Set the index of `flat_metadata` to the parent table's index.
- Treat NA in the row mask as False in `filter()`.

* Add properties for subjects, datatypes, etc

* Add `sort_entities()`

* Upgrade required python to >=3.8

* Add `filter_multi` method and documentation

PyBIDS supports querying a layout with multiple filters specified as
keyword arguments. This is a nice interface, and is also useful for
programmatic filtering. Here we add a `filter_multi()` method to do
something similar.

* Flatten JSON metadata only to first level

* Fix mypy error

* Move some things around

* Add `func` arg to `filter()`

Add a `func` arg option to `filter` for arbitrary lambda function
filtering.

Also move `join_bids_path()` into the `table` module.

* More moving around

* Don't use `removeprefix`

* Yet more moving around

* Update example

* Add a comment on the filter api

* Change arg name output -> index_path

Having the argument be `output` in `bids2table` was confusing when you
only want to load a table.
  • Loading branch information
clane9 authored Aug 9, 2023
1 parent 0659536 commit 918fcf4
Show file tree
Hide file tree
Showing 19 changed files with 2,521 additions and 743 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ jobs:
- uses: actions/checkout@v3
with:
submodules: 'true'
- name: Set up Python 3.7
- name: Set up Python 3.8
uses: actions/setup-python@v3
with:
python-version: "3.7"
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ htmlcov
.vscode/
.env

# Local scratch
# Local data and scratch
.scratch
example/bids-examples.b2t

# Local environment
.venv
Expand Down
18 changes: 16 additions & 2 deletions bids2table/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
"""
Efficiently index large-scale BIDS datasets and derivatives
Efficiently index and query large-scale BIDS datasets and derivatives.
"""

from ._bids2table import bids2table # noqa
# Register elbow extension types
import elbow.dtypes # noqa

from ._b2t import bids2table
from ._version import __version__, __version_tuple__ # noqa
from .entities import BIDSEntities, parse_bids_entities
from .table import BIDSFile, BIDSTable, join_bids_path

__all__ = [
"bids2table",
"BIDSTable",
"BIDSFile",
"BIDSEntities",
"parse_bids_entities",
"join_bids_path",
]
4 changes: 2 additions & 2 deletions bids2table/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,12 @@ def main():
bids2table(
root=args.root,
persistent=True,
output=args.output,
index_path=args.output,
incremental=args.incremental,
overwrite=args.overwrite,
workers=args.workers,
worker_id=args.worker_id,
return_df=False,
return_table=False,
)


Expand Down
51 changes: 27 additions & 24 deletions bids2table/_bids2table.py → bids2table/_b2t.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
from pathlib import Path
from typing import Optional

import pandas as pd
from elbow.builders import build_parquet, build_table
from elbow.sources.filesystem import Crawler
from elbow.typing import StrOrPath
from elbow.utils import setup_logging

from bids2table.extractors.bids import extract_bids_subdir
from bids2table.table import BIDSTable

setup_logging()

Expand All @@ -17,21 +17,21 @@ def bids2table(
root: StrOrPath,
*,
persistent: bool = False,
output: Optional[StrOrPath] = None,
index_path: Optional[StrOrPath] = None,
incremental: bool = False,
overwrite: bool = False,
workers: Optional[int] = None,
worker_id: Optional[int] = None,
return_df: bool = True,
) -> Optional[pd.DataFrame]:
return_table: bool = True,
) -> Optional[BIDSTable]:
"""
Index a BIDS dataset directory and load as a pandas DataFrame.
Args:
root: path to BIDS dataset
persistent: whether to save index to disk as a Parquet dataset
output: path to output Parquet dataset directory if `persistent` is
`True`. Defaults to `root / "index.b2t".
index_path: path to BIDS Parquet index to generate or load. Defaults to `root /
"index.b2t"`. Index generation requires `persistent=True`.
incremental: update index incrementally with only new or changed files.
overwrite: overwrite previous index.
workers: number of parallel processes. If `None` or 1, run in the main
Expand All @@ -40,17 +40,19 @@ def bids2table(
worker_id: optional worker ID to use when scheduling parallel tasks externally.
Specifying the number of workers is required in this case. Incompatible with
overwrite.
return_df: whether to return the dataframe or just build the persistent index.
return_table: whether to return the BIDS table or just build the persistent
index.
Returns:
A DataFrame containing the BIDS Index.
A `BIDSTable` representing the indexed dataset(s), or `None` if `return_table`
is `False`.
"""
if worker_id is not None and not persistent:
raise ValueError(
"worker_id is only supported when generating a persistent index"
)
if not (return_df or persistent):
raise ValueError("persistent and return_df should not both be False")
if not (return_table or persistent):
raise ValueError("persistent and return_table should not both be False")

root = Path(root).expanduser().resolve()
if not root.is_dir():
Expand All @@ -64,37 +66,38 @@ def bids2table(
follow_links=True,
)

if output is None:
output = root / "index.b2t"
if index_path is None:
index_path = root / "index.b2t"
else:
output = Path(output).expanduser().resolve()
index_path = Path(index_path).expanduser().resolve()

stale = overwrite or incremental or worker_id is not None
if output.exists() and not stale:
if return_df:
logging.info("Loading cached index %s", output)
df = pd.read_parquet(output)
if index_path.exists() and not stale:
if return_table:
logging.info("Loading cached index %s", index_path)
tab = BIDSTable.from_parquet(index_path)
else:
logging.info("Found cached index %s; nothing to do", output)
df = None
return df
logging.info("Found cached index %s; nothing to do", index_path)
tab = None
return tab

if not persistent:
logging.info("Building index in memory")
df = build_table(source=source, extract=extract_bids_subdir)
return df
tab = BIDSTable.from_df(df)
return tab

logging.info("Building persistent Parquet index")
build_parquet(
source=source,
extract=extract_bids_subdir,
output=output,
output=index_path,
incremental=incremental,
overwrite=overwrite,
workers=workers,
worker_id=worker_id,
path_column="file__file_path",
mtime_column="file__mod_time",
)
df = pd.read_parquet(output) if return_df else None
return df
tab = BIDSTable.from_parquet(index_path) if return_table else None
return tab
109 changes: 74 additions & 35 deletions bids2table/extractors/entities.py → bids2table/entities.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
"""
A structured representation for BIDS entities.
"""

import re
import warnings
from dataclasses import asdict, dataclass, field, fields
from functools import lru_cache
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, Optional, Union
from types import MappingProxyType
from typing import Any, Callable, Dict, Iterable, List, Optional, Union

import pandas as pd
from elbow.typing import StrOrPath
Expand All @@ -26,6 +31,7 @@

def bids_field(
name: str,
display_name: str,
required: bool = False,
allowed_values: Optional[Iterable] = None,
default: Optional[Any] = None,
Expand All @@ -35,9 +41,13 @@ def bids_field(
BIDS entity dataclass field.
"""
if allowed_values is not None:
allowed_values = set(allowed_values)
allowed_values = list(allowed_values)

metadata = dict(name=name, allowed_values=allowed_values)
metadata = {
"name": name,
"display_name": display_name,
"allowed_values": allowed_values,
}
if required:
fld = field(metadata=metadata)
elif default_factory is not None:
Expand All @@ -60,48 +70,72 @@ class BIDSEntities:
https://bids-specification.readthedocs.io/en/stable/appendices/entities.html
"""

sub: str = bids_field(name="Subject", required=True)
ses: Optional[str] = bids_field(name="Session")
sample: Optional[str] = bids_field(name="Sample")
task: Optional[str] = bids_field(name="Task")
acq: Optional[str] = bids_field(name="Acquisition")
ce: Optional[str] = bids_field(name="Contrast Enhancing Agent")
trc: Optional[str] = bids_field(name="Tracer")
stain: Optional[str] = bids_field(name="Stain")
rec: Optional[str] = bids_field(name="Reconstruction")
dir: Optional[str] = bids_field(name="Phase-Encoding Direction")
run: Optional[int] = bids_field(name="Run")
mod: Optional[str] = bids_field(name="Corresponding Modality")
echo: Optional[int] = bids_field(name="Echo")
flip: Optional[int] = bids_field(name="Flip Angle")
inv: Optional[int] = bids_field(name="Inversion Time")
sub: str = bids_field(name="subject", display_name="Subject", required=True)
ses: Optional[str] = bids_field(name="session", display_name="Session")
sample: Optional[str] = bids_field(name="sample", display_name="Sample")
task: Optional[str] = bids_field(name="task", display_name="Task")
acq: Optional[str] = bids_field(name="acquisition", display_name="Acquisition")
ce: Optional[str] = bids_field(
name="ceagent", display_name="Contrast Enhancing Agent"
)
trc: Optional[str] = bids_field(name="tracer", display_name="Tracer")
stain: Optional[str] = bids_field(name="stain", display_name="Stain")
rec: Optional[str] = bids_field(
name="reconstruction", display_name="Reconstruction"
)
dir: Optional[str] = bids_field(
name="direction", display_name="Phase-Encoding Direction"
)
run: Optional[int] = bids_field(name="run", display_name="Run")
mod: Optional[str] = bids_field(
name="modality", display_name="Corresponding Modality"
)
echo: Optional[int] = bids_field(name="echo", display_name="Echo")
flip: Optional[int] = bids_field(name="flip", display_name="Flip Angle")
inv: Optional[int] = bids_field(name="inversion", display_name="Inversion Time")
mt: Optional[str] = bids_field(
name="Magnetization Transfer", allowed_values={"on", "off"}
name="mtransfer",
display_name="Magnetization Transfer",
allowed_values={"on", "off"},
)
part: Optional[str] = bids_field(
name="Part", allowed_values={"mag", "phase", "real", "imag"}
name="part",
display_name="Part",
allowed_values={"mag", "phase", "real", "imag"},
)
proc: Optional[str] = bids_field(
name="processing", display_name="Processed (on device)"
)
hemi: Optional[str] = bids_field(
name="hemisphere", display_name="Hemisphere", allowed_values={"L", "R"}
)
proc: Optional[str] = bids_field(name="Processed (on device)")
hemi: Optional[str] = bids_field(name="Hemisphere", allowed_values={"L", "R"})
space: Optional[str] = bids_field(name="Space")
split: Optional[int] = bids_field(name="Split")
recording: Optional[str] = bids_field(name="Recording")
chunk: Optional[int] = bids_field(name="Chunk")
atlas: Optional[str] = bids_field(name="Atlas")
res: Optional[str] = bids_field(name="Resolution")
den: Optional[str] = bids_field(name="Density")
label: Optional[str] = bids_field(name="Label")
desc: Optional[str] = bids_field(name="Description")
space: Optional[str] = bids_field(name="space", display_name="Space")
split: Optional[int] = bids_field(name="split", display_name="Split")
recording: Optional[str] = bids_field(name="recording", display_name="Recording")
chunk: Optional[int] = bids_field(name="chunk", display_name="Chunk")
atlas: Optional[str] = bids_field(name="atlas", display_name="Atlas")
res: Optional[str] = bids_field(name="resolution", display_name="Resolution")
den: Optional[str] = bids_field(name="density", display_name="Density")
label: Optional[str] = bids_field(name="label", display_name="Label")
desc: Optional[str] = bids_field(name="description", display_name="Description")
datatype: Optional[str] = bids_field(
name="Data type", allowed_values=BIDS_DATATYPES
name="datatype", display_name="Data type", allowed_values=BIDS_DATATYPES
)
suffix: Optional[str] = bids_field(name="Suffix")
ext: Optional[str] = bids_field(name="Extension")
suffix: Optional[str] = bids_field(name="suffix", display_name="Suffix")
ext: Optional[str] = bids_field(name="extension", display_name="Extension")
extra_entities: Optional[Dict[str, Union[str, int]]] = bids_field(
name="Extra entities",
name="extra_entities",
display_name="Extra entities",
default_factory=dict,
)

@staticmethod
def special() -> List[str]:
"""
Get list of field keys which are not standard entities.
"""
return ["datatype", "suffix", "ext", "extra_entities"]

@classmethod
def from_dict(cls, entities: Dict[str, Any], valid_only: bool = False):
"""
Expand Down Expand Up @@ -309,3 +343,8 @@ def parse_bids_entities(path: StrOrPath) -> Dict[str, str]:
if v is not None:
entities[k] = v
return entities


ENTITY_NAMES_TO_KEYS = MappingProxyType(
{f.metadata["name"]: f.name for f in fields(BIDSEntities)}
)
3 changes: 3 additions & 0 deletions bids2table/extractors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
[Elbow](https://github.com/cmi-dair/elbow) extract functions for BIDS datasets.
"""
5 changes: 3 additions & 2 deletions bids2table/extractors/bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
from elbow.record import Record, concat
from elbow.typing import StrOrPath

from bids2table.entities import BIDSEntities

from .dataset import extract_dataset
from .entities import BIDSEntities
from .metadata import extract_metadata, is_associated_sidecar


Expand All @@ -31,7 +32,7 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]:
meta_rec = extract_metadata(path)
file_rec = extract_file_meta(path)

rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "file": file_rec})
rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "finfo": file_rec})
return rec


Expand Down
12 changes: 10 additions & 2 deletions bids2table/extractors/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from elbow.typing import StrOrPath
from nibabel.filebasedimages import ImageFileError

from .entities import parse_bids_entities
from bids2table.entities import parse_bids_entities

try:
import nifti
Expand Down Expand Up @@ -41,6 +41,9 @@ def extract_image_meta(path: StrOrPath, *, backend: str = "nibabel") -> Record:
def _read_image_meta(
path: str, backend: str = "nibabel"
) -> Tuple[Dict[str, Any], np.ndarray]:
header: Dict[str, Any]
affine: np.ndarray

if backend == "nifti":
if not has_nifti:
raise ModuleNotFoundError("nifti image backend not installed")
Expand All @@ -51,7 +54,12 @@ def _read_image_meta(
affine = None
else:
img = nib.load(path)
header = dict(img.header)
if not isinstance(img, nib.Nifti1Image):
raise TypeError(
f"Found image type {type(img).__name__}; only Nifti1Image supported"
)

header = {k: v for k, v in img.header.items()}
affine = np.asarray(img.affine)

header = {k: _cast_header_value(v) for k, v in header.items()}
Expand Down
Loading

0 comments on commit 918fcf4

Please sign in to comment.