diff --git a/datalad_registry/conf.py b/datalad_registry/conf.py index 5f8e3394..c158b62e 100644 --- a/datalad_registry/conf.py +++ b/datalad_registry/conf.py @@ -53,7 +53,7 @@ class BaseConfig(OperationConfig): "bids_dataset", # === DANDI related extractors === "dandi", - # "dandi:files", # Let's not activate this yet by default + # "dandi:files", # Not enabled for usual setups because of performance impact ] # === worker, Celery, related configuration === diff --git a/datalad_registry/tasks/utils/builtin_meta_extractors.py b/datalad_registry/tasks/utils/builtin_meta_extractors.py index 7acbb949..572d1c74 100644 --- a/datalad_registry/tasks/utils/builtin_meta_extractors.py +++ b/datalad_registry/tasks/utils/builtin_meta_extractors.py @@ -1,6 +1,7 @@ # This file specifies custom metadata extractors, for datalad_registry, and related # definitions. from collections.abc import Callable +import json from datalad.distribution.dataset import require_dataset from yaml import load as yaml_load @@ -81,8 +82,38 @@ def dlreg_dandi_files_meta_extract(url: RepoUrl) -> URLMetadata: :return: A `URLMetadata` object containing the extracted metadata ready :raises FileNotFoundError: If the `.dandi/assets.json` file is not found at the dataset + + Note: This function is meant to be called inside a Celery task for it requires + an active application context of the Flask app + Note: This function must be called with a RepoUrl object with a cache path, i.e., + one that must have been processed already. 
""" - raise NotImplementedError + name = "dandi:files" # Name of this extractor + version = "0.0.1" # Version of this extractor + + assert url.cache_path_abs is not None, ( + f"Encountered a RepoUrl with no cache path, " + f"with a processed flag set to {url.processed}" + ) + + with open(url.cache_path_abs / ".dandi/assets.json") as f: + extracted_metadata = json.load(f) + + ds = require_dataset( + url.cache_path_abs, + check_installed=True, + purpose="dandiset files metadata extraction", + ) + + return URLMetadata( + dataset_describe=get_head_describe(ds), + dataset_version=ds.repo.get_hexsha(), + extractor_name=name, + extractor_version=version, + extraction_parameter={}, + extracted_metadata=extracted_metadata, + url=url, + ) # A mapping from the names of the supported extractors to the functions diff --git a/datalad_registry/tests/test__init__.py b/datalad_registry/tests/test__init__.py index ea449ea8..a9c19cc3 100644 --- a/datalad_registry/tests/test__init__.py +++ b/datalad_registry/tests/test__init__.py @@ -82,13 +82,9 @@ def test_configuration( }, } - default_metadata_extractors = [ - "metalad_core", - "metalad_studyminimeta", - "datacite_gin", - "bids_dataset", - "dandi", - ] + default_metadata_extractors = BaseConfig.__fields__[ + "DATALAD_REGISTRY_METADATA_EXTRACTORS" + ].default def mock_compile_config_from_env(*_args, **_kwargs): # noinspection PyTypeChecker diff --git a/datalad_registry/tests/test_tasks/test_utils/test_builtin_meta_extractors.py b/datalad_registry/tests/test_tasks/test_utils/test_builtin_meta_extractors.py index 4128c28a..1eb161aa 100644 --- a/datalad_registry/tests/test_tasks/test_utils/test_builtin_meta_extractors.py +++ b/datalad_registry/tests/test_tasks/test_utils/test_builtin_meta_extractors.py @@ -88,6 +88,31 @@ def test_no_document(self, dandi_repo_url_with_up_to_date_clone, flask_app): dlreg_dandi_meta_extract(repo_url) +class TestDlregDandiFilesMetaExtract: + def test_valid_input(self, dandi_repo_url_with_up_to_date_clone, 
flask_app): + """ + Test the case that the argument `url` is a valid `RepoUrl` object with a + valid corresponding dandi dataset in the local cache + """ + from datalad_registry.tasks.utils.builtin_meta_extractors import ( + dlreg_dandi_files_meta_extract, + ) + + repo_url = dandi_repo_url_with_up_to_date_clone[0] + ds_clone = dandi_repo_url_with_up_to_date_clone[2] + + with flask_app.app_context(): + url_metadata = dlreg_dandi_files_meta_extract(repo_url) + + assert url_metadata.dataset_describe == get_head_describe(ds_clone) + assert url_metadata.dataset_version == ds_clone.repo.get_hexsha() + assert url_metadata.extractor_name == "dandi:files" + assert url_metadata.extractor_version == "0.0.1" + assert url_metadata.extraction_parameter == {} + assert url_metadata.extracted_metadata == [{"asset_id": "123"}] + assert url_metadata.url == repo_url + + class TestDlregMetaExtract: def test_unsupported_extractor( self, dandi_repo_url_with_up_to_date_clone, flask_app diff --git a/docker-compose.read-only.yml b/docker-compose.read-only.yml index 14c18bee..330c545c 100644 --- a/docker-compose.read-only.yml +++ b/docker-compose.read-only.yml @@ -10,7 +10,7 @@ services: read-only-db: condition: service_healthy ports: - - "${READ_ONLY_WEB_PUBLISH_PORT}:5000" + - "${WEB_PORT_AT_HOST}:5000" environment: FLASK_APP: "datalad_registry:create_app" diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 3158d076..fcee3ef2 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -6,8 +6,8 @@ services: RABBITMQ_DEFAULT_USER: "${RABBITMQ_DEFAULT_USER}" RABBITMQ_DEFAULT_PASS: "${RABBITMQ_DEFAULT_PASS}" ports: - - "127.0.0.1:35672:5672" - - "127.0.0.1:45672:15672" + - "127.0.0.1:${BROKER_PORT_AT_HOST}:5672" + - "127.0.0.1:${BROKER_MANAGEMENT_PORT_AT_HOST}:15672" userns_mode: "keep-id" # This has an effect only after podman-compose 1.0.3 possibly # See https://github.com/containers/podman-compose/issues/166 # for details. 
@@ -23,7 +23,7 @@ services: backend: image: docker.io/redis:7 ports: - - "127.0.0.1:36379:6379" + - "127.0.0.1:${BACKEND_PORT_AT_HOST}:6379" db: image: docker.io/postgres:15 @@ -33,7 +33,7 @@ services: POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}" POSTGRES_INITDB_ARGS: --encoding utf8 --locale C ports: - - "127.0.0.1:35432:5432" + - "127.0.0.1:${DB_PORT_AT_HOST}:5432" healthcheck: test: [ "CMD", "pg_isready", "-U", "${POSTGRES_USER}", "-d", "${POSTGRES_DB}", "-q" ] interval: 30s diff --git a/docker-compose.yml b/docker-compose.yml index ff1b32ab..8aa1413b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,7 +12,7 @@ services: db: condition: service_healthy ports: - - "5000:5000" + - "${WEB_PORT_AT_HOST}:5000" environment: &env FLASK_APP: "datalad_registry:create_app" @@ -89,7 +89,7 @@ services: FLOWER_NATURAL_TIME: "True" FLOWER_BASIC_AUTH: "$FLOWER_BASIC_AUTH" ports: - - "127.0.0.1:5555:5555" + - "127.0.0.1:${MONITOR_PORT_AT_HOST}:5555" command: [ "/sbin/my_init", "--", "celery", "-A", "datalad_registry.make_celery:celery_app", "flower" ] volumes: - ${MONITOR_PATH_AT_HOST}/data:/data @@ -108,8 +108,8 @@ services: RABBITMQ_DEFAULT_PASS: "${RABBITMQ_DEFAULT_PASS}" RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS: "-rabbit consumer_timeout 43200000" # 12 hours in milliseconds ports: - - "127.0.0.1:5672:5672" - - "127.0.0.1:15672:15672" + - "127.0.0.1:${BROKER_PORT_AT_HOST}:5672" + - "127.0.0.1:${BROKER_MANAGEMENT_PORT_AT_HOST}:15672" userns_mode: "keep-id" # This has an effect only after podman-compose 1.0.3 possibly # See https://github.com/containers/podman-compose/issues/166 # for details. 
@@ -128,7 +128,7 @@ services: backend: image: docker.io/redis:7 ports: - - "127.0.0.1:6379:6379" + - "127.0.0.1:${BACKEND_PORT_AT_HOST}:6379" db: image: docker.io/postgres:15 @@ -138,7 +138,7 @@ services: POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}" POSTGRES_INITDB_ARGS: --encoding utf8 --locale C ports: - - "5432:5432" + - "${DB_PORT_AT_HOST}:5432" userns_mode: "keep-id" # This has an effect only after podman-compose 1.0.3 possibly # See https://github.com/containers/podman-compose/issues/166 # for details. diff --git a/env.test b/env.test index 9de5730e..9219bd32 100644 --- a/env.test +++ b/env.test @@ -5,6 +5,12 @@ # within the same host. COMPOSE_PROJECT_NAME=dl-registry-test +# Ports of the services used for testing at host +BROKER_PORT_AT_HOST=35672 +BROKER_MANAGEMENT_PORT_AT_HOST=45672 +BACKEND_PORT_AT_HOST=36379 +DB_PORT_AT_HOST=35432 + # Variables related to the broker service RABBITMQ_DEFAULT_USER=tester RABBITMQ_DEFAULT_PASS=testpass diff --git a/template.env b/template.env index 8c662ad1..2afa7fee 100644 --- a/template.env +++ b/template.env @@ -2,6 +2,9 @@ # It is to be copied to a target file named `.env.dev` or `.env.prod`, and the target # file is to be modified (changing usernames, passwords, etc.). 
+# The name of the Docker Compose project (stack) +COMPOSE_PROJECT_NAME=datalad-registry + # Bind mount paths at host WEB_PATH_AT_HOST=./services/web WORKER_PATH_AT_HOST=./services/worker @@ -10,6 +13,14 @@ MONITOR_PATH_AT_HOST=./services/monitor BROKER_PATH_AT_HOST=./services/broker DB_PATH_AT_HOST=./services/db +# Ports of the services at host +WEB_PORT_AT_HOST=5000 +MONITOR_PORT_AT_HOST=5555 +BROKER_PORT_AT_HOST=5672 +BROKER_MANAGEMENT_PORT_AT_HOST=15672 +BACKEND_PORT_AT_HOST=6379 +DB_PORT_AT_HOST=5432 + DATALAD_REGISTRY_OPERATION_MODE=DEVELOPMENT # or PRODUCTION # Variables related to the broker service diff --git a/template.env.read-only b/template.env.read-only index 0b463886..0d8e5331 100644 --- a/template.env.read-only +++ b/template.env.read-only @@ -9,6 +9,9 @@ COMPOSE_PROJECT_NAME=dl-registry-read-only WEB_PATH_AT_HOST=./services/read-only-web DB_PATH_AT_HOST=./services/read-only-db +# Ports of the service(s) at host +WEB_PORT_AT_HOST=5000 + # Variables related to the db service POSTGRES_DB=pgdb POSTGRES_USER=pguser @@ -16,7 +19,3 @@ POSTGRES_PASSWORD=pgpass # (Make sure that user name and password characters do not need to be escaped for URL format # or to escape them properly if they do) SQLALCHEMY_DATABASE_URI="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@read-only-db:5432/${POSTGRES_DB}" - - -# The port on the host to which the read-only-web service will be bound -READ_ONLY_WEB_PUBLISH_PORT=5000 diff --git a/tools/run_dandi_files_extractor.py b/tools/run_dandi_files_extractor.py new file mode 100644 index 00000000..b1b00fff --- /dev/null +++ b/tools/run_dandi_files_extractor.py @@ -0,0 +1,19 @@ +# This script initiates Celery tasks to run the `dandi:files` extractor on each processed repo.
+ +from sqlalchemy import select + +from datalad_registry import create_app +from datalad_registry.models import RepoUrl, db +from datalad_registry.tasks import extract_ds_meta + +flask_app = create_app() + +with flask_app.app_context(): + + # Get the IDs of the processed repo URLs + processed_url_ids = ( + db.session.execute(select(RepoUrl.id).filter(RepoUrl.processed)).scalars().all() + ) + + for url_id in processed_url_ids: + extract_ds_meta.delay(url_id, "dandi:files")