From 6492eef670c2bcc37cf6305040da6c0f292ccfe6 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Fri, 17 Jan 2025 17:09:09 -0800 Subject: [PATCH] fixup! [r] Support for AnVIL duos_id (#6620) --- src/azul/plugins/metadata/anvil/__init__.py | 6 ++++-- src/azul/plugins/repository/tdr_anvil/__init__.py | 8 ++++---- test/service/test_manifest.py | 4 ++++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index e3fe124a84..292a40b058 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -288,8 +288,10 @@ def manifest_config(self) -> ManifestConfig: # the fields listed here and those used in `self._field_mapping`. fields_to_omit_from_manifest = [ ('contents', 'activities', 'activity_table'), - # duos_id is omitted from manifests since the field isn't included - # in the `files` index. + # We omit the `duos_id` field from manifests since there is only one + # DUOS bundle per dataset, and that bundle only contributes to outer + # entities of the `datasets` type, not to entities of the other + # types, such as files, which the manifest is generated from. ('contents', 'datasets', 'duos_id'), ('contents', 'files', 'uuid'), ('contents', 'files', 'version'), diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index bd91801b43..6b489a98d5 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -136,10 +136,10 @@ class BundleType(Enum): request per *bundle* instead, potentially overloading the DUOS service. Our solution is to retrieve `description` only in a bundle of this dedicated DUOS type, once per snapshot, and merge it with the other dataset fields - during aggregation. As a result of this implemtation, the DUOS bundle is - only added to the `datasets` and `bundles` indices, making the `duos_id` - and `description` fields unavailable for file manifests as they pull values - from the `files` index. + during aggregation. As a result, `duos_id` cannot be included in file + manifests since there is only one DUOS bundle per dataset, and that bundle + only contributes to outer entities of the `datasets` type, not to entities + of the other types, such as files, which the manifest is generated from. All other bundles are replica bundles. Replica bundles consist of a batch of rows from an arbitrary BigQuery table, which may or may not be described by diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 8fe99a5893..7bdae5e620 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -1760,6 +1760,10 @@ def bundles(cls) -> list[SourcedBundleFQID]: def test_compact_manifest(self): response = self._get_manifest(ManifestFormat.compact, filters={}) self.assertEqual(200, response.status_code) + # The `duos_id` field is absent from manifests since there is only one + # DUOS bundle per dataset, and that bundle only contributes to outer + # entities of the `datasets` type, not to entities of the other types, + # such as files, which the manifest is generated from. expected = [ ( 'bundles.bundle_uuid',