Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Higher-level API #17

Merged
merged 22 commits into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ jobs:
- uses: actions/checkout@v3
with:
submodules: 'true'
- name: Set up Python 3.7
- name: Set up Python 3.8
uses: actions/setup-python@v3
with:
python-version: "3.7"
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ htmlcov
.vscode/
.env

# Local scratch
# Local data and scratch
.scratch
example/bids-examples.b2t

# Local environment
.venv
Expand Down
18 changes: 16 additions & 2 deletions bids2table/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
"""
Efficiently index large-scale BIDS datasets and derivatives
Efficiently index and query large-scale BIDS datasets and derivatives.
"""

from ._bids2table import bids2table # noqa
# Register elbow extension types
import elbow.dtypes # noqa

from ._b2t import bids2table
from ._version import __version__, __version_tuple__ # noqa
from .entities import BIDSEntities, parse_bids_entities
from .table import BIDSFile, BIDSTable, join_bids_path

__all__ = [
"bids2table",
"BIDSTable",
"BIDSFile",
"BIDSEntities",
"parse_bids_entities",
"join_bids_path",
]
4 changes: 2 additions & 2 deletions bids2table/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,12 @@ def main():
bids2table(
root=args.root,
persistent=True,
output=args.output,
index_path=args.output,
incremental=args.incremental,
overwrite=args.overwrite,
workers=args.workers,
worker_id=args.worker_id,
return_df=False,
return_table=False,
)


Expand Down
51 changes: 27 additions & 24 deletions bids2table/_bids2table.py → bids2table/_b2t.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
from pathlib import Path
from typing import Optional

import pandas as pd
from elbow.builders import build_parquet, build_table
from elbow.sources.filesystem import Crawler
from elbow.typing import StrOrPath
from elbow.utils import setup_logging

from bids2table.extractors.bids import extract_bids_subdir
from bids2table.table import BIDSTable

setup_logging()

Expand All @@ -17,21 +17,21 @@ def bids2table(
root: StrOrPath,
*,
persistent: bool = False,
output: Optional[StrOrPath] = None,
index_path: Optional[StrOrPath] = None,
incremental: bool = False,
overwrite: bool = False,
workers: Optional[int] = None,
worker_id: Optional[int] = None,
return_df: bool = True,
) -> Optional[pd.DataFrame]:
return_table: bool = True,
) -> Optional[BIDSTable]:
"""
Index a BIDS dataset directory and load as a pandas DataFrame.

Args:
root: path to BIDS dataset
persistent: whether to save index to disk as a Parquet dataset
output: path to output Parquet dataset directory if `persistent` is
`True`. Defaults to `root / "index.b2t".
index_path: path to BIDS Parquet index to generate or load. Defaults to `root /
"index.b2t"`. Index generation requires `persistent=True`.
incremental: update index incrementally with only new or changed files.
overwrite: overwrite previous index.
workers: number of parallel processes. If `None` or 1, run in the main
Expand All @@ -40,17 +40,19 @@ def bids2table(
worker_id: optional worker ID to use when scheduling parallel tasks externally.
Specifying the number of workers is required in this case. Incompatible with
overwrite.
return_df: whether to return the dataframe or just build the persistent index.
return_table: whether to return the BIDS table or just build the persistent
index.

Returns:
A DataFrame containing the BIDS Index.
A `BIDSTable` representing the indexed dataset(s), or `None` if `return_table`
is `False`.
"""
if worker_id is not None and not persistent:
raise ValueError(
"worker_id is only supported when generating a persistent index"
)
if not (return_df or persistent):
raise ValueError("persistent and return_df should not both be False")
if not (return_table or persistent):
raise ValueError("persistent and return_table should not both be False")

root = Path(root).expanduser().resolve()
if not root.is_dir():
Expand All @@ -64,37 +66,38 @@ def bids2table(
follow_links=True,
)

if output is None:
output = root / "index.b2t"
if index_path is None:
index_path = root / "index.b2t"
else:
output = Path(output).expanduser().resolve()
index_path = Path(index_path).expanduser().resolve()

stale = overwrite or incremental or worker_id is not None
if output.exists() and not stale:
if return_df:
logging.info("Loading cached index %s", output)
df = pd.read_parquet(output)
if index_path.exists() and not stale:
if return_table:
logging.info("Loading cached index %s", index_path)
tab = BIDSTable.from_parquet(index_path)
else:
logging.info("Found cached index %s; nothing to do", output)
df = None
return df
logging.info("Found cached index %s; nothing to do", index_path)
tab = None
return tab

if not persistent:
logging.info("Building index in memory")
df = build_table(source=source, extract=extract_bids_subdir)
return df
tab = BIDSTable.from_df(df)
return tab

logging.info("Building persistent Parquet index")
build_parquet(
source=source,
extract=extract_bids_subdir,
output=output,
output=index_path,
incremental=incremental,
overwrite=overwrite,
workers=workers,
worker_id=worker_id,
path_column="file__file_path",
mtime_column="file__mod_time",
)
df = pd.read_parquet(output) if return_df else None
return df
tab = BIDSTable.from_parquet(index_path) if return_table else None
return tab
109 changes: 74 additions & 35 deletions bids2table/extractors/entities.py → bids2table/entities.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
"""
A structured representation for BIDS entities.
"""

import re
import warnings
from dataclasses import asdict, dataclass, field, fields
from functools import lru_cache
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, Optional, Union
from types import MappingProxyType
from typing import Any, Callable, Dict, Iterable, List, Optional, Union

import pandas as pd
from elbow.typing import StrOrPath
Expand All @@ -26,6 +31,7 @@

def bids_field(
name: str,
display_name: str,
required: bool = False,
allowed_values: Optional[Iterable] = None,
default: Optional[Any] = None,
Expand All @@ -35,9 +41,13 @@ def bids_field(
BIDS entity dataclass field.
"""
if allowed_values is not None:
allowed_values = set(allowed_values)
allowed_values = list(allowed_values)

metadata = dict(name=name, allowed_values=allowed_values)
metadata = {
"name": name,
"display_name": display_name,
"allowed_values": allowed_values,
}
if required:
fld = field(metadata=metadata)
elif default_factory is not None:
Expand All @@ -60,48 +70,72 @@ class BIDSEntities:
https://bids-specification.readthedocs.io/en/stable/appendices/entities.html
"""

sub: str = bids_field(name="Subject", required=True)
ses: Optional[str] = bids_field(name="Session")
sample: Optional[str] = bids_field(name="Sample")
task: Optional[str] = bids_field(name="Task")
acq: Optional[str] = bids_field(name="Acquisition")
ce: Optional[str] = bids_field(name="Contrast Enhancing Agent")
trc: Optional[str] = bids_field(name="Tracer")
stain: Optional[str] = bids_field(name="Stain")
rec: Optional[str] = bids_field(name="Reconstruction")
dir: Optional[str] = bids_field(name="Phase-Encoding Direction")
run: Optional[int] = bids_field(name="Run")
mod: Optional[str] = bids_field(name="Corresponding Modality")
echo: Optional[int] = bids_field(name="Echo")
flip: Optional[int] = bids_field(name="Flip Angle")
inv: Optional[int] = bids_field(name="Inversion Time")
sub: str = bids_field(name="subject", display_name="Subject", required=True)
ses: Optional[str] = bids_field(name="session", display_name="Session")
sample: Optional[str] = bids_field(name="sample", display_name="Sample")
task: Optional[str] = bids_field(name="task", display_name="Task")
acq: Optional[str] = bids_field(name="acquisition", display_name="Acquisition")
ce: Optional[str] = bids_field(
name="ceagent", display_name="Contrast Enhancing Agent"
)
trc: Optional[str] = bids_field(name="tracer", display_name="Tracer")
stain: Optional[str] = bids_field(name="stain", display_name="Stain")
rec: Optional[str] = bids_field(
name="reconstruction", display_name="Reconstruction"
)
dir: Optional[str] = bids_field(
name="direction", display_name="Phase-Encoding Direction"
)
run: Optional[int] = bids_field(name="run", display_name="Run")
mod: Optional[str] = bids_field(
name="modality", display_name="Corresponding Modality"
)
echo: Optional[int] = bids_field(name="echo", display_name="Echo")
flip: Optional[int] = bids_field(name="flip", display_name="Flip Angle")
inv: Optional[int] = bids_field(name="inversion", display_name="Inversion Time")
mt: Optional[str] = bids_field(
name="Magnetization Transfer", allowed_values={"on", "off"}
name="mtransfer",
display_name="Magnetization Transfer",
allowed_values={"on", "off"},
)
part: Optional[str] = bids_field(
name="Part", allowed_values={"mag", "phase", "real", "imag"}
name="part",
display_name="Part",
allowed_values={"mag", "phase", "real", "imag"},
)
proc: Optional[str] = bids_field(
name="processing", display_name="Processed (on device)"
)
hemi: Optional[str] = bids_field(
name="hemisphere", display_name="Hemisphere", allowed_values={"L", "R"}
)
proc: Optional[str] = bids_field(name="Processed (on device)")
hemi: Optional[str] = bids_field(name="Hemisphere", allowed_values={"L", "R"})
space: Optional[str] = bids_field(name="Space")
split: Optional[int] = bids_field(name="Split")
recording: Optional[str] = bids_field(name="Recording")
chunk: Optional[int] = bids_field(name="Chunk")
atlas: Optional[str] = bids_field(name="Atlas")
res: Optional[str] = bids_field(name="Resolution")
den: Optional[str] = bids_field(name="Density")
label: Optional[str] = bids_field(name="Label")
desc: Optional[str] = bids_field(name="Description")
space: Optional[str] = bids_field(name="space", display_name="Space")
split: Optional[int] = bids_field(name="split", display_name="Split")
recording: Optional[str] = bids_field(name="recording", display_name="Recording")
chunk: Optional[int] = bids_field(name="chunk", display_name="Chunk")
atlas: Optional[str] = bids_field(name="atlas", display_name="Atlas")
res: Optional[str] = bids_field(name="resolution", display_name="Resolution")
den: Optional[str] = bids_field(name="density", display_name="Density")
label: Optional[str] = bids_field(name="label", display_name="Label")
desc: Optional[str] = bids_field(name="description", display_name="Description")
datatype: Optional[str] = bids_field(
name="Data type", allowed_values=BIDS_DATATYPES
name="datatype", display_name="Data type", allowed_values=BIDS_DATATYPES
)
suffix: Optional[str] = bids_field(name="Suffix")
ext: Optional[str] = bids_field(name="Extension")
suffix: Optional[str] = bids_field(name="suffix", display_name="Suffix")
ext: Optional[str] = bids_field(name="extension", display_name="Extension")
extra_entities: Optional[Dict[str, Union[str, int]]] = bids_field(
name="Extra entities",
name="extra_entities",
display_name="Extra entities",
default_factory=dict,
)

@staticmethod
def special() -> List[str]:
"""
Get list of field keys which are not standard entities.
"""
return ["datatype", "suffix", "ext", "extra_entities"]

@classmethod
def from_dict(cls, entities: Dict[str, Any], valid_only: bool = False):
"""
Expand Down Expand Up @@ -309,3 +343,8 @@ def parse_bids_entities(path: StrOrPath) -> Dict[str, str]:
if v is not None:
entities[k] = v
return entities


ENTITY_NAMES_TO_KEYS = MappingProxyType(
{f.metadata["name"]: f.name for f in fields(BIDSEntities)}
)
3 changes: 3 additions & 0 deletions bids2table/extractors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
[Elbow](https://github.com/cmi-dair/elbow) extract functions for BIDS datasets.
"""
5 changes: 3 additions & 2 deletions bids2table/extractors/bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
from elbow.record import Record, concat
from elbow.typing import StrOrPath

from bids2table.entities import BIDSEntities

from .dataset import extract_dataset
from .entities import BIDSEntities
from .metadata import extract_metadata, is_associated_sidecar


Expand All @@ -31,7 +32,7 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]:
meta_rec = extract_metadata(path)
file_rec = extract_file_meta(path)

rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "file": file_rec})
rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "finfo": file_rec})
return rec


Expand Down
12 changes: 10 additions & 2 deletions bids2table/extractors/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from elbow.typing import StrOrPath
from nibabel.filebasedimages import ImageFileError

from .entities import parse_bids_entities
from bids2table.entities import parse_bids_entities

try:
import nifti
Expand Down Expand Up @@ -41,6 +41,9 @@ def extract_image_meta(path: StrOrPath, *, backend: str = "nibabel") -> Record:
def _read_image_meta(
path: str, backend: str = "nibabel"
) -> Tuple[Dict[str, Any], np.ndarray]:
header: Dict[str, Any]
affine: np.ndarray

if backend == "nifti":
if not has_nifti:
raise ModuleNotFoundError("nifti image backend not installed")
Expand All @@ -51,7 +54,12 @@ def _read_image_meta(
affine = None
else:
img = nib.load(path)
header = dict(img.header)
if not isinstance(img, nib.Nifti1Image):
raise TypeError(
f"Found image type {type(img).__name__}; only Nifti1Image supported"
)

header = {k: v for k, v in img.header.items()}
affine = np.asarray(img.affine)

header = {k: _cast_header_value(v) for k, v in header.items()}
Expand Down
Loading