Flatten columns, column renaming #16

Merged 2 commits on Aug 4, 2023
2 changes: 1 addition & 1 deletion bids2table/__init__.py
@@ -2,5 +2,5 @@
 Efficiently index large-scale BIDS datasets and derivatives
 """
 
-from ._bids2table import bids2table, load_index  # noqa
+from ._bids2table import bids2table  # noqa
 from ._version import __version__, __version_tuple__  # noqa
8 changes: 4 additions & 4 deletions bids2table/_bids2table.py
@@ -9,7 +9,6 @@
 from elbow.utils import setup_logging
 
 from bids2table.extractors.bids import extract_bids_subdir
-from bids2table.helpers import flat_to_multi_columns, load_index
 
 setup_logging()
 
@@ -74,7 +73,7 @@ def bids2table(
     if output.exists() and not stale:
         if return_df:
             logging.info("Loading cached index %s", output)
-            df = load_index(output)
+            df = pd.read_parquet(output)
         else:
             logging.info("Found cached index %s; nothing to do", output)
             df = None
@@ -83,7 +82,6 @@
     if not persistent:
         logging.info("Building index in memory")
         df = build_table(source=source, extract=extract_bids_subdir)
-        df = flat_to_multi_columns(df)
         return df
 
     logging.info("Building persistent Parquet index")
@@ -95,6 +93,8 @@
         overwrite=overwrite,
         workers=workers,
         worker_id=worker_id,
+        path_column="file__file_path",
+        mtime_column="file__mod_time",
     )
-    df = load_index(output) if return_df else None
+    df = pd.read_parquet(output) if return_df else None
     return df
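
With load_index gone, a persistent index is read back with plain pd.read_parquet, and callers who want the old two-level column view can rebuild it from the flat "__"-separated names themselves. A minimal sketch, assuming a previously built index at the hypothetical path "index.b2t":

    import pandas as pd

    from bids2table.helpers import flat_to_multi_columns

    # "index.b2t" is a hypothetical output path from an earlier bids2table() run.
    df = pd.read_parquet("index.b2t")

    # Columns arrive flat, e.g. "ent__sub" and "file__file_path"; optionally
    # split them back into a two-level MultiIndex on the "__" separator.
    df = flat_to_multi_columns(df, sep="__")
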
38 changes: 18 additions & 20 deletions bids2table/extractors/_inheritance.py
@@ -10,57 +10,55 @@
 
 def find_bids_parents(
     query: Dict[str, str],
-    root: StrOrPath,
-    depth: Optional[int] = 4,
+    start: StrOrPath,
+    depth: Optional[int] = None,
 ) -> Generator[str, None, None]:
     """
     Find all BIDS files satisfying the inheritance principle requirements for the given
-    ``query`` entities dict. The ``query`` must contain at least one of ``'suffix'`` or
-    ``'ext'``. Search up the directory hierarchy at most ``depth`` levels,
-    starting from and including ``root``. Yields matching ``path``s in decreasing
-    topological order.
-
-    The default depth of 4 is appropriate for directory structures of the form
-    ``{dataset}/sub-{sub}/ses-{ses}/{datatype}``. If depth is None, search all the way
-    up the tree. Note also that the search stops once a ``dataset_description.json`` is
-    found.
+    `query` entities dict. The `query` must contain at least one of `'suffix'` or
+    `'ext'`. Search up the directory hierarchy at most `depth` levels, starting from and
+    including `start`, or until a `dataset_description.json` file is found, indicating
+    the BIDS dataset root directory. If `depth` is `None`, the search may continue to
+    the filesystem root.
+
+    Yields matching `path`s in decreasing topological order.
     """
     suffix = query.get("suffix")
     ext = query.get("ext")
     if not (suffix or ext):
         raise ValueError("At least one of 'suffix' or 'ext' are required in `query`.")
     pattern = f"*{suffix}{ext}" if suffix else f"*{ext}"
 
-    root = Path(root).absolute()
-    if not root.is_dir():
-        root = root.parent
+    start = Path(start).absolute()
+    if not start.is_dir():
+        start = start.parent
 
     if depth is None:
-        depth = len(root.parts)
+        depth = len(start.parts)
 
     for _ in range(depth):
-        for path in _glob(root, pattern):
+        for path in _glob(start, pattern):
             entities = parse_bids_entities(path)
             if _test_bids_match(query, entities):
                 yield str(path)
 
         # Stop climbing the directory if we find the description json, which should
         # always be at the top-level dataset directory.
         # TODO: for nested datasets, can you inherit beyond the first root? I hope not..
-        if is_dataset_root(root):
+        if is_dataset_root(start):
             break
 
-        root = root.parent
+        start = start.parent
 
 
 def find_first_bids_parent(
-    query: Dict[str, str], root: StrOrPath, depth: int = 4
+    query: Dict[str, str], start: StrOrPath, depth: Optional[int] = None
 ) -> Optional[str]:
     """
     Find the first BIDS parent file matching the ``query`` entities dict. Returns
     ``None`` if no parents found. See :func:`find_bids_parents` for more details.
     """
-    return next(find_bids_parents(query, root, depth), None)
+    return next(find_bids_parents(query, start, depth), None)
 
 
 @lru_cache()
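
A hedged usage sketch of the renamed parameters (the dataset layout and entity values are hypothetical; the import path follows the module shown in this diff). With depth=None, the walk climbs from start until it reaches the directory containing dataset_description.json:

    from bids2table.extractors._inheritance import (
        find_bids_parents,
        find_first_bids_parent,
    )

    # Look for sidecar JSONs a bold file could inherit from, walking up from
    # the file's directory until the dataset root is reached.
    query = {"sub": "01", "task": "rest", "suffix": "bold", "ext": ".json"}
    for path in find_bids_parents(query, start="ds/sub-01/func"):
        print(path)

    # Or grab just the first yielded match (None if nothing matches).
    nearest = find_first_bids_parent(query, start="ds/sub-01/func")
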
19 changes: 6 additions & 13 deletions bids2table/extractors/bids.py
@@ -7,9 +7,9 @@
 from elbow.record import Record, concat
 from elbow.typing import StrOrPath
 
-from .dataset import extract_dataset_meta
+from .dataset import extract_dataset
 from .entities import BIDSEntities
-from .sidecar import extract_sidecar, is_associated_sidecar
+from .metadata import extract_metadata, is_associated_sidecar
 
 
 def extract_bids_file(path: StrOrPath) -> Optional[Record]:
@@ -27,18 +27,11 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]:
         )
         return None
 
-    dset_meta = extract_dataset_meta(path)
-    bids_meta = extract_sidecar(path)
-    file_meta = extract_file_meta(path)
+    dset_rec = extract_dataset(path)
+    meta_rec = extract_metadata(path)
+    file_rec = extract_file_meta(path)
 
-    rec = concat(
-        {
-            "dataset": dset_meta,
-            "entities": entities,
-            "metadata": bids_meta,
-            "file": file_meta,
-        }
-    )
+    rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "file": file_rec})
     return rec
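
The shortened group names matter because they become column prefixes once a record is flattened on the "__" separator, which is exactly what the path_column="file__file_path" and mtime_column="file__mod_time" arguments above rely on. A toy illustration of the naming scheme (plain dicts standing in for elbow Records, field names hypothetical):

    # Hypothetical stand-in for the grouped record produced by extract_bids_file.
    record = {
        "ds": {"dataset": "ds000001"},
        "ent": {"sub": "01", "task": "rest"},
        "meta": {"json": {"RepetitionTime": 2.0}},
        "file": {"file_path": "/data/ds000001/sub-01/func/bold.nii.gz"},
    }

    # Flatten group/field pairs into "group__field" column names.
    flat = {f"{g}__{k}": v for g, fields in record.items() for k, v in fields.items()}
    print(sorted(flat))
    # ['ds__dataset', 'ent__sub', 'ent__task', 'file__file_path', 'meta__json']
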
2 changes: 1 addition & 1 deletion bids2table/extractors/dataset.py
@@ -8,7 +8,7 @@
 from elbow.typing import StrOrPath
 
 
-def extract_dataset_meta(path: StrOrPath) -> Record:
+def extract_dataset(path: StrOrPath) -> Record:
     """
     Get info about the BIDS dataset that ``path`` belongs to.
     """
bids2table/extractors/sidecar.py → bids2table/extractors/metadata.py
@@ -10,17 +10,16 @@
 from .entities import parse_bids_entities
 
 
-def extract_sidecar(path: StrOrPath) -> Record:
+def extract_metadata(path: StrOrPath) -> Record:
     """
     Load the JSON sidecar metadata associated with ``path``. Supports metadata
     inheritance by searching up the directory tree for matching JSON files.
     """
     entities = parse_bids_entities(path)
     query = dict(entities, ext=".json")
-    root = Path(path).parent
 
     metadata = {}
-    sidecars = reversed(list(find_bids_parents(query, root=root)))
+    sidecars = reversed(list(find_bids_parents(query, start=Path(path).parent)))
     for path in sidecars:
         with open(path) as f:
             try:
@@ -31,7 +30,7 @@ def extract_sidecar(path: StrOrPath) -> Record:
                 )
 
     # TODO: type aliases for json, pickle, etc so we can use a dataclass here.
-    rec = Record({"sidecar": metadata or None}, types={"sidecar": "json"})
+    rec = Record({"json": metadata or None}, types={"json": "json"})
     return rec
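
A brief usage sketch under the new names (the file path is hypothetical). Because the matched sidecars are applied in reverse order, values from nearer JSON files override those inherited from higher up the tree:

    from bids2table.extractors.metadata import extract_metadata

    # Merge all inherited sidecar metadata for one imaging file into a single
    # record with a "json" field (None if no sidecars were found).
    rec = extract_metadata("ds/sub-01/func/sub-01_task-rest_bold.nii.gz")
    print(rec)
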
13 changes: 0 additions & 13 deletions bids2table/helpers.py
@@ -2,7 +2,6 @@
 from typing import Any, Dict, Optional, Union
 
 import pandas as pd
-from elbow.typing import StrOrPath
 
 from bids2table.extractors.entities import BIDSEntities
 
@@ -70,15 +69,3 @@ def multi_to_flat_columns(df: pd.DataFrame, sep: str = "__") -> pd.DataFrame:
     df = df.copy(deep=False)
     df.columns = pd.Index(join_columns)
     return df
-
-
-def load_index(
-    path: StrOrPath, split_columns: bool = True, sep: str = "__"
-) -> pd.DataFrame:
-    """
-    Load a bids2table index, optionally splitting columns into a multi index on `sep`.
-    """
-    df = pd.read_parquet(path)
-    if split_columns:
-        df = flat_to_multi_columns(df, sep=sep)
-    return df
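
The two column helpers survive the cleanup, so the old MultiIndex view is still one call away. A minimal round-trip sketch with illustrative column names:

    import pandas as pd

    from bids2table.helpers import flat_to_multi_columns, multi_to_flat_columns

    df = pd.DataFrame(
        {"ent__sub": ["01"], "file__file_path": ["/data/sub-01/anat/T1w.nii.gz"]}
    )

    multi = flat_to_multi_columns(df, sep="__")    # columns ("ent", "sub"), ("file", "file_path")
    flat = multi_to_flat_columns(multi, sep="__")  # back to "ent__sub", "file__file_path"
    print(flat.columns.tolist())
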