Skip to content

Commit

Permalink
Make extracting sidecar metadata optional.
Browse files Browse the repository at this point in the history
In datasets with large sidecar JSON metadata, extracting the metadata can
take up more than 90% of the run time.
  • Loading branch information
clane9 committed May 1, 2024
1 parent bf7c302 commit c59813d
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 12 deletions.
11 changes: 9 additions & 2 deletions bids2table/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,18 @@ def main():
"Incompatible with --overwrite. (default: None)",
default=None,
)
parser.add_argument("--verbose", "-v", help="Verbose logging.", action="store_true")
parser.add_argument(
"--verbose",
"-v",
action="count",
default=0,
help="Increase verbosity level.",
)

args = parser.parse_args()

setup_logging(level="INFO" if args.verbose else "WARNING")
log_level = ["ERROR", "WARNING", "INFO"][min(args.verbose, 2)]
setup_logging(level=log_level)

bids2table(
root=args.root,
Expand Down
9 changes: 7 additions & 2 deletions bids2table/_b2t.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from functools import partial
from pathlib import Path
from typing import Optional

Expand All @@ -15,6 +16,7 @@
def bids2table(
root: StrOrPath,
*,
with_meta: bool = True,
persistent: bool = False,
index_path: Optional[StrOrPath] = None,
incremental: bool = False,
Expand All @@ -28,6 +30,8 @@ def bids2table(
Args:
root: path to BIDS dataset
with_meta: extract JSON sidecar metadata. Excluding metadata can result in much
faster indexing.
persistent: whether to save index to disk as a Parquet dataset
index_path: path to BIDS Parquet index to generate or load. Defaults to `root /
"index.b2t"`. Index generation requires `persistent=True`.
Expand Down Expand Up @@ -60,6 +64,7 @@ def bids2table(
dirs_only=True,
follow_links=True,
)
extract = partial(extract_bids_subdir, with_meta=with_meta)

if index_path is None:
index_path = root / "index.b2t"
Expand All @@ -80,7 +85,7 @@ def bids2table(
logger.info("Building index in memory")
df = build_table(
source=source,
extract=extract_bids_subdir,
extract=extract,
workers=workers,
worker_id=worker_id,
)
Expand All @@ -90,7 +95,7 @@ def bids2table(
logger.info("Building persistent Parquet index")
build_parquet(
source=source,
extract=extract_bids_subdir,
extract=extract,
output=index_path,
incremental=incremental,
overwrite=overwrite,
Expand Down
13 changes: 9 additions & 4 deletions bids2table/extractors/bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
logger = logging.getLogger(__name__)


def extract_bids_file(path: StrOrPath) -> Optional[Record]:
def extract_bids_file(path: StrOrPath, with_meta: bool = True) -> Optional[Record]:
"""
Extract BIDS entities and metadata from a data file in a BIDS dataset.
"""
Expand All @@ -31,19 +31,24 @@ def extract_bids_file(path: StrOrPath) -> Optional[Record]:
return None

dset_rec = extract_dataset(path)
meta_rec = extract_metadata(path)
if with_meta:
meta_rec = extract_metadata(path)
else:
meta_rec = Record({"json": None}, types={"json": "json"})
file_rec = extract_file_meta(path)

rec = concat({"ds": dset_rec, "ent": entities, "meta": meta_rec, "finfo": file_rec})
return rec


def extract_bids_subdir(path: StrOrPath) -> Generator[Optional[Record], None, None]:
def extract_bids_subdir(
path: StrOrPath, with_meta: bool = True
) -> Generator[Optional[Record], None, None]:
"""
Extract BIDS records recursively for all files in a sub-directory.
"""
for path in iglob(str(Path(path) / "**"), recursive=True):
yield extract_bids_file(path)
yield extract_bids_file(path, with_meta=with_meta)


def is_bids_file(path: StrOrPath) -> bool:
Expand Down
17 changes: 13 additions & 4 deletions tests/test_bids2table.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,25 @@ def empty_dataset(tmp_path: Path) -> Path:
return root


@pytest.mark.parametrize("persistent", [False, True])
def test_bids2table(tmp_path: Path, persistent: bool):
@pytest.mark.parametrize(
"persistent,with_meta", [(False, True), (True, True), (False, False)]
)
def test_bids2table(tmp_path: Path, persistent: bool, with_meta: bool):
root = BIDS_EXAMPLES / "ds001"
index_path = tmp_path / "index.b2t"

tab = bids2table(root=root, persistent=persistent, index_path=index_path)
tab = bids2table(
root=root, with_meta=with_meta, persistent=persistent, index_path=index_path
)
assert tab.shape == (128, 40)

if not with_meta:
assert tab.loc[0, "meta__json"] is None

# Reload from cache
tab2 = bids2table(root=root, persistent=persistent, index_path=index_path)
tab2 = bids2table(
root=root, with_meta=with_meta, persistent=persistent, index_path=index_path
)
assert tab.equals(tab2)


Expand Down

0 comments on commit c59813d

Please sign in to comment.