Merge pull request #363 from candleindark/dandi-files-extractor
Implement `dandi:files` extractor
yarikoptic authored Aug 20, 2024
2 parents f4aed5d + 47074f2 commit 005a2b7
Showing 11 changed files with 111 additions and 24 deletions.
2 changes: 1 addition & 1 deletion datalad_registry/conf.py
@@ -53,7 +53,7 @@ class BaseConfig(OperationConfig):
"bids_dataset",
# === DANDI related extractors ===
"dandi",
# "dandi:files", # Let's not activate this yet by default
# "dandi:files", # Not enabled for usual setups because of performance impact
]

# === worker, Celery, related configuration ===
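
Since `dandi:files` stays commented out in the default extractor list, a deployment that wants it has to override `DATALAD_REGISTRY_METADATA_EXTRACTORS`. The following sketch is only an illustration: it assumes `BaseConfig` is a pydantic settings class that reads this field from an environment variable of the same name and parses list values as JSON, neither of which is shown in this diff.

import json
import os

# Hypothetical override: extend the default extractor list with "dandi:files"
# before the Flask app (and hence its configuration) is created. Encoding the
# list as JSON is an assumption about how the settings class parses env values.
os.environ["DATALAD_REGISTRY_METADATA_EXTRACTORS"] = json.dumps(
    [
        "metalad_core",
        "metalad_studyminimeta",
        "datacite_gin",
        "bids_dataset",
        "dandi",
        "dandi:files",
    ]
)

from datalad_registry import create_app  # noqa: E402

# The rest of the registry configuration is assumed to already be in the environment
app = create_app()
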
33 changes: 32 additions & 1 deletion datalad_registry/tasks/utils/builtin_meta_extractors.py
@@ -1,6 +1,7 @@
# This file specifies custom metadata extractors for datalad_registry and related
# definitions.
from collections.abc import Callable
import json

from datalad.distribution.dataset import require_dataset
from yaml import load as yaml_load
@@ -81,8 +82,38 @@ def dlreg_dandi_files_meta_extract(url: RepoUrl) -> URLMetadata:
:return: A `URLMetadata` object containing the extracted metadata ready to be
    written to the database
:raises FileNotFoundError: If the `.dandi/assets.json` file is not found
    in the dataset
Note: This function is meant to be called inside a Celery task because it requires
    an active application context of the Flask app
Note: This function must be called with a `RepoUrl` object that has a cache path,
    i.e., one that has already been processed.
"""
raise NotImplementedError
name = "dandi:files" # Name of this extractor
version = "0.0.1" # Version of this extractor

assert url.cache_path_abs is not None, (
f"Encountered a RepoUrl with no cache path, "
f"with a processed flag set to {url.processed}"
)

with open(url.cache_path_abs / ".dandi/assets.json") as f:
extracted_metadata = json.load(f)

ds = require_dataset(
url.cache_path_abs,
check_installed=True,
purpose="dandiset files metadata extraction",
)

return URLMetadata(
dataset_describe=get_head_describe(ds),
dataset_version=ds.repo.get_hexsha(),
extractor_name=name,
extractor_version=version,
extraction_parameter={},
extracted_metadata=extracted_metadata,
url=url,
)


# A mapping from the names of the supported extractors to the functions
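
For reference, a minimal sketch of exercising the new extractor outside of Celery; it assumes a fully configured registry environment with at least one processed `RepoUrl` whose cached clone carries a `.dandi/assets.json`.

from sqlalchemy import select

from datalad_registry import create_app
from datalad_registry.models import RepoUrl, db
from datalad_registry.tasks.utils.builtin_meta_extractors import (
    dlreg_dandi_files_meta_extract,
)

flask_app = create_app()

with flask_app.app_context():
    # Pick one already-processed URL; the extractor asserts that a cache path exists
    repo_url = (
        db.session.execute(select(RepoUrl).filter(RepoUrl.processed))
        .scalars()
        .first()
    )

    if repo_url is not None:
        url_metadata = dlreg_dandi_files_meta_extract(repo_url)
        # `extracted_metadata` holds the parsed content of `.dandi/assets.json`
        print(url_metadata.extractor_name, url_metadata.dataset_version)
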
10 changes: 3 additions & 7 deletions datalad_registry/tests/test__init__.py
@@ -82,13 +82,9 @@ def test_configuration(
},
}

default_metadata_extractors = [
"metalad_core",
"metalad_studyminimeta",
"datacite_gin",
"bids_dataset",
"dandi",
]
default_metadata_extractors = BaseConfig.__fields__[
"DATALAD_REGISTRY_METADATA_EXTRACTORS"
].default

def mock_compile_config_from_env(*_args, **_kwargs):
# noinspection PyTypeChecker
@@ -88,6 +88,31 @@ def test_no_document(self, dandi_repo_url_with_up_to_date_clone, flask_app):
dlreg_dandi_meta_extract(repo_url)


class TestDlregDandiFilesMetaExtract:
def test_valid_input(self, dandi_repo_url_with_up_to_date_clone, flask_app):
"""
Test the case that the argument `url` is a valid `RepoUrl` object with a
valid corresponding dandi dataset in the local cache
"""
from datalad_registry.tasks.utils.builtin_meta_extractors import (
dlreg_dandi_files_meta_extract,
)

repo_url = dandi_repo_url_with_up_to_date_clone[0]
ds_clone = dandi_repo_url_with_up_to_date_clone[2]

with flask_app.app_context():
url_metadata = dlreg_dandi_files_meta_extract(repo_url)

assert url_metadata.dataset_describe == get_head_describe(ds_clone)
assert url_metadata.dataset_version == ds_clone.repo.get_hexsha()
assert url_metadata.extractor_name == "dandi:files"
assert url_metadata.extractor_version == "0.0.1"
assert url_metadata.extraction_parameter == {}
assert url_metadata.extracted_metadata == [{"asset_id": "123"}]
assert url_metadata.url == repo_url


class TestDlregMetaExtract:
def test_unsupported_extractor(
self, dandi_repo_url_with_up_to_date_clone, flask_app
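
The equality assertion on `extracted_metadata` implies that the fixture's clone carries a `.dandi/assets.json` containing exactly `[{"asset_id": "123"}]`. The `dandi_repo_url_with_up_to_date_clone` fixture itself is not part of this diff, so the following is only a sketch of how such a minimal file could be laid down in a test dataset:

import json
from pathlib import Path


def write_minimal_assets_json(ds_path: Path) -> None:
    """Create the minimal `.dandi/assets.json` the test above expects.

    This illustrates the presumed fixture content, not the actual fixture
    implementation.
    """
    dandi_dir = ds_path / ".dandi"
    dandi_dir.mkdir(parents=True, exist_ok=True)
    (dandi_dir / "assets.json").write_text(json.dumps([{"asset_id": "123"}]))
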
2 changes: 1 addition & 1 deletion docker-compose.read-only.yml
@@ -10,7 +10,7 @@ services:
read-only-db:
condition: service_healthy
ports:
- "${READ_ONLY_WEB_PUBLISH_PORT}:5000"
- "${WEB_PORT_AT_HOST}:5000"
environment:
FLASK_APP: "datalad_registry:create_app"

8 changes: 4 additions & 4 deletions docker-compose.test.yml
@@ -6,8 +6,8 @@ services:
RABBITMQ_DEFAULT_USER: "${RABBITMQ_DEFAULT_USER}"
RABBITMQ_DEFAULT_PASS: "${RABBITMQ_DEFAULT_PASS}"
ports:
- "127.0.0.1:35672:5672"
- "127.0.0.1:45672:15672"
- "127.0.0.1:${BROKER_PORT_AT_HOST}:5672"
- "127.0.0.1:${BROKER_MANAGEMENT_PORT_AT_HOST}:15672"
userns_mode: "keep-id" # This has an effect only after podman-compose 1.0.3 possibly
# See https://github.com/containers/podman-compose/issues/166
# for details.
@@ -23,7 +23,7 @@
backend:
image: docker.io/redis:7
ports:
- "127.0.0.1:36379:6379"
- "127.0.0.1:${BACKEND_PORT_AT_HOST}:6379"

db:
image: docker.io/postgres:15
@@ -33,7 +33,7 @@
POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}"
POSTGRES_INITDB_ARGS: --encoding utf8 --locale C
ports:
- "127.0.0.1:35432:5432"
- "127.0.0.1:${DB_PORT_AT_HOST}:5432"
healthcheck:
test: [ "CMD", "pg_isready", "-U", "${POSTGRES_USER}", "-d", "${POSTGRES_DB}", "-q" ]
interval: 30s
12 changes: 6 additions & 6 deletions docker-compose.yml
@@ -12,7 +12,7 @@ services:
db:
condition: service_healthy
ports:
- "5000:5000"
- "${WEB_PORT_AT_HOST}:5000"
environment: &env
FLASK_APP: "datalad_registry:create_app"

@@ -89,7 +89,7 @@ services:
FLOWER_NATURAL_TIME: "True"
FLOWER_BASIC_AUTH: "$FLOWER_BASIC_AUTH"
ports:
- "127.0.0.1:5555:5555"
- "127.0.0.1:${MONITOR_PORT_AT_HOST}:5555"
command: [ "/sbin/my_init", "--", "celery", "-A", "datalad_registry.make_celery:celery_app", "flower" ]
volumes:
- ${MONITOR_PATH_AT_HOST}/data:/data
@@ -108,8 +108,8 @@ services:
RABBITMQ_DEFAULT_PASS: "${RABBITMQ_DEFAULT_PASS}"
RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS: "-rabbit consumer_timeout 43200000" # 12 hours in milliseconds
ports:
- "127.0.0.1:5672:5672"
- "127.0.0.1:15672:15672"
- "127.0.0.1:${BROKER_PORT_AT_HOST}:5672"
- "127.0.0.1:${BROKER_MANAGEMENT_PORT_AT_HOST}:15672"
userns_mode: "keep-id" # This has an effect only after podman-compose 1.0.3 possibly
# See https://github.com/containers/podman-compose/issues/166
# for details.
@@ -128,7 +128,7 @@ services:
backend:
image: docker.io/redis:7
ports:
- "127.0.0.1:6379:6379"
- "127.0.0.1:${BACKEND_PORT_AT_HOST}:6379"

db:
image: docker.io/postgres:15
@@ -138,7 +138,7 @@ services:
POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}"
POSTGRES_INITDB_ARGS: --encoding utf8 --locale C
ports:
- "5432:5432"
- "${DB_PORT_AT_HOST}:5432"
userns_mode: "keep-id" # This has an effect only after podman-compose 1.0.3 possibly
# See https://github.com/containers/podman-compose/issues/166
# for details.
6 changes: 6 additions & 0 deletions env.test
@@ -5,6 +5,12 @@
# within the same host.
COMPOSE_PROJECT_NAME=dl-registry-test

# Ports of the services used for testing at host
BROKER_PORT_AT_HOST=35672
BROKER_MANAGEMENT_PORT_AT_HOST=45672
BACKEND_PORT_AT_HOST=36379
DB_PORT_AT_HOST=35432

# Variables related to the broker service
RABBITMQ_DEFAULT_USER=tester
RABBITMQ_DEFAULT_PASS=testpass
11 changes: 11 additions & 0 deletions template.env
@@ -2,6 +2,9 @@
# It is to be copied to a target file named `.env.dev` or `.env.prod`, and the target
# file is to be modified (changing usernames, passwords, etc.).

# The name of the Docker Compose project (stack)
COMPOSE_PROJECT_NAME=datalad-registry

# Bind mount paths at host
WEB_PATH_AT_HOST=./services/web
WORKER_PATH_AT_HOST=./services/worker
@@ -10,6 +13,14 @@ MONITOR_PATH_AT_HOST=./services/monitor
BROKER_PATH_AT_HOST=./services/broker
DB_PATH_AT_HOST=./services/db

# Ports of the services at host
WEB_PORT_AT_HOST=5000
MONITOR_PORT_AT_HOST=5555
BROKER_PORT_AT_HOST=5672
BROKER_MANAGEMENT_PORT_AT_HOST=15672
BACKEND_PORT_AT_HOST=6379
DB_PORT_AT_HOST=5432

DATALAD_REGISTRY_OPERATION_MODE=DEVELOPMENT # or PRODUCTION

# Variables related to the broker service
7 changes: 3 additions & 4 deletions template.env.read-only
@@ -9,14 +9,13 @@ COMPOSE_PROJECT_NAME=dl-registry-read-only
WEB_PATH_AT_HOST=./services/read-only-web
DB_PATH_AT_HOST=./services/read-only-db

# Ports of the service(s) at host
WEB_PORT_AT_HOST=5000

# Variables related to the db service
POSTGRES_DB=pgdb
POSTGRES_USER=pguser
POSTGRES_PASSWORD=pgpass
# (Make sure that user name and password characters do not need to be escaped for URL format
# or to escape them properly if they do)
SQLALCHEMY_DATABASE_URI="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@read-only-db:5432/${POSTGRES_DB}"


# The port on the host to which the read-only-web service will be bound
READ_ONLY_WEB_PUBLISH_PORT=5000
19 changes: 19 additions & 0 deletions tools/run_dandi_files_extractor.py
@@ -0,0 +1,19 @@
# This script initiates Celery tasks to run the `dandi:files` extractor on each processed repo URL.

from sqlalchemy import select

from datalad_registry import create_app
from datalad_registry.models import RepoUrl, db
from datalad_registry.tasks import extract_ds_meta

flask_app = create_app()

with flask_app.app_context():

# Get the IDs of the processed repo URLs
processed_url_ids = (
db.session.execute(select(RepoUrl.id).filter(RepoUrl.processed)).scalars().all()
)

for url_id in processed_url_ids:
extract_ds_meta.delay(url_id, "dandi:files")
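
Because `dlreg_dandi_files_meta_extract` raises `FileNotFoundError` for a clone without a `.dandi/assets.json`, a registry holding a mixed set of repositories might prefer to dispatch the task only for DANDI URLs. A variation of the script above, in which the `RepoUrl.url` column and the "dandi" substring filter are assumptions not shown in this diff:

from sqlalchemy import select

from datalad_registry import create_app
from datalad_registry.models import RepoUrl, db
from datalad_registry.tasks import extract_ds_meta

flask_app = create_app()

with flask_app.app_context():
    # Restrict to processed URLs that look like DANDI repositories; the
    # `RepoUrl.url` column and the substring match are illustrative assumptions.
    dandi_url_ids = (
        db.session.execute(
            select(RepoUrl.id).filter(
                RepoUrl.processed, RepoUrl.url.contains("dandi")
            )
        )
        .scalars()
        .all()
    )

    for url_id in dandi_url_ids:
        extract_ds_meta.delay(url_id, "dandi:files")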
