Flatten columns, column renaming #16

Merged 2 commits on Aug 4, 2023
2 changes: 1 addition & 1 deletion bids2table/__init__.py
@@ -2,5 +2,5 @@
 Efficiently index large-scale BIDS datasets and derivatives
 """
 
-from ._bids2table import bids2table, load_index  # noqa
+from ._bids2table import bids2table  # noqa
 from ._version import __version__, __version_tuple__  # noqa
8 changes: 4 additions & 4 deletions bids2table/_bids2table.py
@@ -9,7 +9,6 @@
 from elbow.utils import setup_logging
 
 from bids2table.extractors.bids import extract_bids_subdir
-from bids2table.helpers import flat_to_multi_columns, load_index
 
 setup_logging()
 
@@ -74,7 +73,7 @@ def bids2table(
     if output.exists() and not stale:
         if return_df:
             logging.info("Loading cached index %s", output)
-            df = load_index(output)
+            df = pd.read_parquet(output)
         else:
             logging.info("Found cached index %s; nothing to do", output)
             df = None
@@ -83,7 +82,6 @@
     if not persistent:
         logging.info("Building index in memory")
         df = build_table(source=source, extract=extract_bids_subdir)
-        df = flat_to_multi_columns(df)
         return df
 
     logging.info("Building persistent Parquet index")
@@ -95,6 +93,8 @@
         overwrite=overwrite,
         workers=workers,
         worker_id=worker_id,
+        path_column="file__file_path",
+        mtime_column="file__mod_time",
     )
-    df = load_index(output) if return_df else None
+    df = pd.read_parquet(output) if return_df else None
     return df
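
With load_index gone, a persistent index is read back with plain pd.read_parquet, and callers who want the old two-level column view can rebuild it from the flat "__"-separated names themselves. A minimal sketch, assuming a previously built index at the hypothetical path "index.b2t":

    import pandas as pd

    from bids2table.helpers import flat_to_multi_columns

    # "index.b2t" is a hypothetical output path from an earlier bids2table() run.
    df = pd.read_parquet("index.b2t")

    # Columns arrive flat, e.g. "ent__sub" and "file__file_path"; optionally
    # split them back into a two-level MultiIndex on the "__" separator.
    df = flat_to_multi_columns(df, sep="__")
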
38 changes: 18 additions & 20 deletions bids2table/extractors/_inheritance.py
@@ -10,57 +10,55 @@
 
 def find_bids_parents(
     query: Dict[str, str],
-    root: StrOrPath,
-    depth: Optional[int] = 4,
+    start: StrOrPath,
+    depth: Optional[int] = None,
 ) -> Generator[str, None, None]:
     """
     Find all BIDS files satisfying the inheritance principle requirements for the given
-    ``query`` entities dict. The ``query`` must contain at least one of ``'suffix'`` or
-    ``'ext'``. Search up the directory hierarchy at most ``depth`` levels,
-    starting from and including ``root``. Yields matching ``path``s in decreasing
-    topological order.
-
-    The default depth of 4 is appropriate for directory structures of the form
-    ``{dataset}/sub-{sub}/ses-{ses}/{datatype}``. If depth is None, search all the way
-    up the tree. Note also that the search stops once a ``dataset_description.json`` is
-    found.
+    `query` entities dict. The `query` must contain at least one of `'suffix'` or
+    `'ext'`. Search up the directory hierarchy at most `depth` levels, starting from and
+    including `start`, or until a `dataset_description.json` file is found, indicating
+    the BIDS dataset root directory. If `depth` is `None`, the search may continue to
+    the filesystem root.
+
+    Yields matching `path`s in decreasing topological order.
     """
     suffix = query.get("suffix")
     ext = query.get("ext")
     if not (suffix or ext):
         raise ValueError("At least one of 'suffix' or 'ext' are required in `query`.")
     pattern = f"*{suffix}{ext}" if suffix else f"*{ext}"
 
-    root = Path(root).absolute()
-    if not root.is_dir():
-        root = root.parent
+    start = Path(start).absolute()
+    if not start.is_dir():
+        start = start.parent
 
     if depth is None:
-        depth = len(root.parts)
+        depth = len(start.parts)
 
     for _ in range(depth):
-        for path in _glob(root, pattern):
+        for path in _glob(start, pattern):
             entities = parse_bids_entities(path)
             if _test_bids_match(query, entities):
                 yield str(path)
 
         # Stop climbing the directory if we find the description json, which should
         # always be at the top-level dataset directory.
         # TODO: for nested datasets, can you inherit beyond the first root? I hope not..
-        if is_dataset_root(root):
+        if is_dataset_root(start):
             break
 
-        root = root.parent
+        start = start.parent
 
 
 def find_first_bids_parent(
-    query: Dict[str, str], root: StrOrPath, depth: int = 4
+    query: Dict[str, str], start: StrOrPath, depth: Optional[int] = None
 ) -> Optional[str]:
     """
     Find the first BIDS parent file matching the ``query`` entities dict. Returns
     ``None`` if no parents found. See :func:`find_bids_parents` for more details.
     """
-    return next(find_bids_parents(query, root, depth), None)
+    return next(find_bids_parents(query, start, depth), None)
 
 
 @lru_cache()
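
A hedged usage sketch of the renamed parameters (the dataset layout and entity values are hypothetical; the import path follows the module shown in this diff). With depth=None, the walk climbs from start until it reaches the directory containing dataset_description.json:

    from bids2table.extractors._inheritance import (
        find_bids_parents,
        find_first_bids_parent,
    )

    # Look for sidecar JSONs a bold file could inherit from, walking up from
    # the file's directory until the dataset root is reached.
    query = {"sub": "01", "task": "rest", "suffix": "bold", "ext": ".json"}
    for path in find_bids_parents(query, start="ds/sub-01/func"):
        print(path)

    # Or grab just the first yielded match (None if nothing matches).
    nearest = find_first_bids_parent(query, start="ds/sub-01/func")
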
19 changes: 6 additions & 13 deletions bids2table/extractors/bids.py
@@ -7,9 +7,9 @@
 from elbow.record import Record, concat
 from elbow.typing import StrOrPath
 
-from .dataset import extract_dataset_meta
+from .dataset import extract_dataset
 from .entities import BIDSEntities
-from .sidecar import extract_sidecar, is_associated_sidecar
+from .metadata import extract_metadata, is_associated_sidecar
 
 
 def extract_bids_file(path: StrOrPath) -> Optional[Record]:
@@ -27,18 +27,11 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]:
         )
         return None
 
-    dset_meta = extract_dataset_meta(path)
-    bids_meta = extract_sidecar(path)
-    file_meta = extract_file_meta(path)
+    dset_rec = extract_dataset(path)
+    meta_rec = extract_metadata(path)
+    file_rec = extract_file_meta(path)
 
-    rec = concat(
-        {
-            "dataset": dset_meta,
-            "entities": entities,
-            "metadata": bids_meta,
-            "file": file_meta,
-        }
-    )
+    rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "file": file_rec})
     return rec
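
The shortened group names matter because they become column prefixes once a record is flattened on the "__" separator, which is exactly what the path_column="file__file_path" and mtime_column="file__mod_time" arguments above rely on. A toy illustration of the naming scheme (plain dicts standing in for elbow Records, field names hypothetical):

    # Hypothetical stand-in for the grouped record produced by extract_bids_file.
    record = {
        "ds": {"dataset": "ds000001"},
        "ent": {"sub": "01", "task": "rest"},
        "meta": {"json": {"RepetitionTime": 2.0}},
        "file": {"file_path": "/data/ds000001/sub-01/func/bold.nii.gz"},
    }

    # Flatten group/field pairs into "group__field" column names.
    flat = {f"{g}__{k}": v for g, fields in record.items() for k, v in fields.items()}
    print(sorted(flat))
    # ['ds__dataset', 'ent__sub', 'ent__task', 'file__file_path', 'meta__json']
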
2 changes: 1 addition & 1 deletion bids2table/extractors/dataset.py
@@ -8,7 +8,7 @@
 from elbow.typing import StrOrPath
 
 
-def extract_dataset_meta(path: StrOrPath) -> Record:
+def extract_dataset(path: StrOrPath) -> Record:
     """
     Get info about the BIDS dataset that ``path`` belongs to.
     """
bids2table/extractors/sidecar.py → bids2table/extractors/metadata.py
@@ -10,17 +10,16 @@
 from .entities import parse_bids_entities
 
 
-def extract_sidecar(path: StrOrPath) -> Record:
+def extract_metadata(path: StrOrPath) -> Record:
     """
     Load the JSON sidecar metadata associated with ``path``. Supports metadata
     inheritance by searching up the directory tree for matching JSON files.
     """
     entities = parse_bids_entities(path)
     query = dict(entities, ext=".json")
-    root = Path(path).parent
 
     metadata = {}
-    sidecars = reversed(list(find_bids_parents(query, root=root)))
+    sidecars = reversed(list(find_bids_parents(query, start=Path(path).parent)))
     for path in sidecars:
         with open(path) as f:
             try:
@@ -31,7 +30,7 @@ def extract_sidecar(path: StrOrPath) -> Record:
                 )
 
     # TODO: type aliases for json, pickle, etc so we can use a dataclass here.
-    rec = Record({"sidecar": metadata or None}, types={"sidecar": "json"})
+    rec = Record({"json": metadata or None}, types={"json": "json"})
     return rec
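
A brief usage sketch under the new names (the file path is hypothetical). Because the matched sidecars are applied in reverse order, values from nearer JSON files override those inherited from higher up the tree:

    from bids2table.extractors.metadata import extract_metadata

    # Merge all inherited sidecar metadata for one imaging file into a single
    # record with a "json" field (None if no sidecars were found).
    rec = extract_metadata("ds/sub-01/func/sub-01_task-rest_bold.nii.gz")
    print(rec)
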
13 changes: 0 additions & 13 deletions bids2table/helpers.py
@@ -2,7 +2,6 @@
 from typing import Any, Dict, Optional, Union
 
 import pandas as pd
-from elbow.typing import StrOrPath
 
 from bids2table.extractors.entities import BIDSEntities
 
@@ -70,15 +69,3 @@ def multi_to_flat_columns(df: pd.DataFrame, sep: str = "__") -> pd.DataFrame:
     df = df.copy(deep=False)
     df.columns = pd.Index(join_columns)
     return df
-
-
-def load_index(
-    path: StrOrPath, split_columns: bool = True, sep: str = "__"
-) -> pd.DataFrame:
-    """
-    Load a bids2table index, optionally splitting columns into a multi index on `sep`.
-    """
-    df = pd.read_parquet(path)
-    if split_columns:
-        df = flat_to_multi_columns(df, sep=sep)
-    return df
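
The two column helpers survive the cleanup, so the old MultiIndex view is still one call away. A minimal round-trip sketch with illustrative column names:

    import pandas as pd

    from bids2table.helpers import flat_to_multi_columns, multi_to_flat_columns

    df = pd.DataFrame(
        {"ent__sub": ["01"], "file__file_path": ["/data/sub-01/anat/T1w.nii.gz"]}
    )

    multi = flat_to_multi_columns(df, sep="__")    # columns ("ent", "sub"), ("file", "file_path")
    flat = multi_to_flat_columns(multi, sep="__")  # back to "ent__sub", "file__file_path"
    print(flat.columns.tolist())
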