Skip to content

Commit

Permalink
Make extracting sidecar metadata optional (#33)
Browse files Browse the repository at this point in the history
* Make extracting sidecar metadata optional.

In datasets with large sidecar JSON metadata, extracting metadata can
take up more than 90% of the run time.

* Add option to extract sidecar metadata after the table has been built
  • Loading branch information
clane9 authored May 1, 2024
1 parent bf7c302 commit b22f036
Show file tree
Hide file tree
Showing 7 changed files with 286 additions and 36 deletions.
11 changes: 9 additions & 2 deletions bids2table/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,18 @@ def main():
"Incompatible with --overwrite. (default: None)",
default=None,
)
parser.add_argument("--verbose", "-v", help="Verbose logging.", action="store_true")
parser.add_argument(
"--verbose",
"-v",
action="count",
default=0,
help="Increase verbosity level.",
)

args = parser.parse_args()

setup_logging(level="INFO" if args.verbose else "WARNING")
log_level = ["ERROR", "WARNING", "INFO"][min(args.verbose, 2)]
setup_logging(level=log_level)

bids2table(
root=args.root,
Expand Down
9 changes: 7 additions & 2 deletions bids2table/_b2t.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from functools import partial
from pathlib import Path
from typing import Optional

Expand All @@ -15,6 +16,7 @@
def bids2table(
root: StrOrPath,
*,
with_meta: bool = True,
persistent: bool = False,
index_path: Optional[StrOrPath] = None,
incremental: bool = False,
Expand All @@ -28,6 +30,8 @@ def bids2table(
Args:
root: path to BIDS dataset
with_meta: extract JSON sidecar metadata. Excluding metadata can result in much
faster indexing.
persistent: whether to save index to disk as a Parquet dataset
index_path: path to BIDS Parquet index to generate or load. Defaults to `root /
"index.b2t"`. Index generation requires `persistent=True`.
Expand Down Expand Up @@ -60,6 +64,7 @@ def bids2table(
dirs_only=True,
follow_links=True,
)
extract = partial(extract_bids_subdir, with_meta=with_meta)

if index_path is None:
index_path = root / "index.b2t"
Expand All @@ -80,7 +85,7 @@ def bids2table(
logger.info("Building index in memory")
df = build_table(
source=source,
extract=extract_bids_subdir,
extract=extract,
workers=workers,
worker_id=worker_id,
)
Expand All @@ -90,7 +95,7 @@ def bids2table(
logger.info("Building persistent Parquet index")
build_parquet(
source=source,
extract=extract_bids_subdir,
extract=extract,
output=index_path,
incremental=incremental,
overwrite=overwrite,
Expand Down
13 changes: 9 additions & 4 deletions bids2table/extractors/bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
logger = logging.getLogger(__name__)


def extract_bids_file(path: StrOrPath) -> Optional[Record]:
def extract_bids_file(path: StrOrPath, with_meta: bool = True) -> Optional[Record]:
"""
Extract BIDS entities and metadata from a data file in a BIDS dataset.
"""
Expand All @@ -31,19 +31,24 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]:
return None

dset_rec = extract_dataset(path)
meta_rec = extract_metadata(path)
if with_meta:
meta_rec = extract_metadata(path)
else:
meta_rec = Record({"json": None}, types={"json": "json"})
file_rec = extract_file_meta(path)

rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "finfo": file_rec})
return rec


def extract_bids_subdir(path: StrOrPath) -> Generator[Optional[Record], None, None]:
def extract_bids_subdir(
path: StrOrPath, with_meta: bool = True
) -> Generator[Optional[Record], None, None]:
"""
Extract BIDS records recursively for all files in a sub-directory.
"""
for path in iglob(str(Path(path) / "**"), recursive=True):
yield extract_bids_file(path)
yield extract_bids_file(path, with_meta=with_meta)


def is_bids_file(path: StrOrPath) -> bool:
Expand Down
11 changes: 11 additions & 0 deletions bids2table/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd

from bids2table.entities import ENTITY_NAMES_TO_KEYS, BIDSEntities
from bids2table.extractors.metadata import extract_metadata


class BIDSTable(pd.DataFrame):
Expand Down Expand Up @@ -269,6 +270,16 @@ def add_prefix(k: str):
return self
return out

def with_meta(self, inplace: bool = False) -> "BIDSTable":
    """
    Fill the ``meta__json`` column with JSON sidecar metadata.

    For each row, the path stored under ``finfo["file_path"]`` is passed to
    ``extract_metadata`` and the ``"json"`` entry of the result is written to
    the ``meta__json`` column.

    Args:
        inplace: if True, write the metadata into this table and return it;
            otherwise write into ``self.copy()`` and return that copy.

    Returns:
        The table holding the metadata: ``self`` when ``inplace`` is True, a
        copy otherwise.
    """

    def _sidecar_json(path):
        # Only the "json" field of the extracted metadata record is kept.
        return extract_metadata(path)["json"]

    if inplace:
        table = self
    else:
        table = self.copy()

    paths = table.finfo["file_path"]
    table.loc[:, "meta__json"] = paths.apply(_sidecar_json)
    return table

@classmethod
def from_df(cls, df: pd.DataFrame) -> "BIDSTable":
"""
Expand Down
Loading

0 comments on commit b22f036

Please sign in to comment.