Skip to content

Commit

Permalink
fixup! [r] Support for AnVIL duos_id (#6620)
Browse files Browse the repository at this point in the history
  • Loading branch information
dsotirho-ucsc committed Jan 18, 2025
1 parent 3bb81f0 commit 6492eef
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 6 deletions.
6 changes: 4 additions & 2 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,8 +288,10 @@ def manifest_config(self) -> ManifestConfig:
# the fields listed here and those used in `self._field_mapping`.
fields_to_omit_from_manifest = [
('contents', 'activities', 'activity_table'),
# duos_id is omitted from manifests since the field isn't included
# in the `files` index.
# We omit the `duos_id` field from manifests since there is only one
# DUOS bundle per dataset, and that bundle only contributes to outer
# entities of the `datasets` type, not to entities of the other
# types, such as files, which the manifest is generated from.
('contents', 'datasets', 'duos_id'),
('contents', 'files', 'uuid'),
('contents', 'files', 'version'),
Expand Down
8 changes: 4 additions & 4 deletions src/azul/plugins/repository/tdr_anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,10 @@ class BundleType(Enum):
request per *bundle* instead, potentially overloading the DUOS service. Our
solution is to retrieve `description` only in a bundle of this dedicated
DUOS type, once per snapshot, and merge it with the other dataset fields
during aggregation. As a result of this implementation, the DUOS bundle is
only added to the `datasets` and `bundles` indices, making the `duos_id`
and `description` fields unavailable for file manifests as they pull values
from the `files` index.
during aggregation. As a result, `duos_id` cannot be included in file
manifests since there is only one DUOS bundle per dataset, and that bundle
only contributes to outer entities of the `datasets` type, not to entities
of the other types, such as files, which the manifest is generated from.
All other bundles are replica bundles. Replica bundles consist of a batch of
rows from an arbitrary BigQuery table, which may or may not be described by
Expand Down
4 changes: 4 additions & 0 deletions test/service/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1760,6 +1760,10 @@ def bundles(cls) -> list[SourcedBundleFQID]:
def test_compact_manifest(self):
response = self._get_manifest(ManifestFormat.compact, filters={})
self.assertEqual(200, response.status_code)
# The `duos_id` field is absent from manifests since there is only one
# DUOS bundle per dataset, and that bundle only contributes to outer
# entities of the `datasets` type, not to entities of the other types,
# such as files, which the manifest is generated from.
expected = [
(
'bundles.bundle_uuid',
Expand Down

0 comments on commit 6492eef

Please sign in to comment.