From 90325597df357b8aeb7f811ffede0e8137c62643 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Wed, 30 Oct 2024 11:03:18 -0700 Subject: [PATCH] [r] Support for AnVIL duos_id (#6620) --- src/azul/plugins/metadata/anvil/__init__.py | 5 +++++ src/azul/plugins/metadata/anvil/indexer/transform.py | 1 + src/azul/plugins/metadata/anvil/service/response.py | 1 + src/azul/plugins/repository/tdr_anvil/__init__.py | 4 +++- .../2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json | 1 + test/indexer/test_anvil.py | 8 ++++++-- test/service/data/verbatim/anvil/pfb_entities.json | 2 ++ test/service/data/verbatim/anvil/pfb_schema.json | 8 ++++++++ test/service/test_manifest.py | 6 ++++++ 9 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index ab4c55441c..35be1410e3 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -156,6 +156,7 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping: 'registered_identifier', 'title', 'data_modality', + 'duos_id', ] }, 'donors': { @@ -351,6 +352,10 @@ def verbatim_pfb_schema(self, is_polymorphic=is_duos_type) ] if is_duos_type: + field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name, + column_name='duos_id', + anvil_datatype='string', + is_polymorphic=True)) field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name, column_name='description', anvil_datatype='string', diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 9e2f4f22ed..2cd04abbdb 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -498,6 +498,7 @@ def _duos_types(cls) -> FieldTypes: return { 'document_id': null_str, 'description': null_str, + 'duos_id': null_str, } def _duos(self, dataset: EntityReference) -> MutableJSON: diff --git a/src/azul/plugins/metadata/anvil/service/response.py b/src/azul/plugins/metadata/anvil/service/response.py index 8d0be3c129..6175bd6473 100644 --- a/src/azul/plugins/metadata/anvil/service/response.py +++ b/src/azul/plugins/metadata/anvil/service/response.py @@ -210,6 +210,7 @@ def _non_pivotal_fields_by_entity_type(self) -> dict[str, set[str]]: }, 'datasets': { 'dataset_id', + 'duos_id', 'title' }, 'diagnoses': { diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index e841615931..e5564fcce6 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -457,6 +457,7 @@ def _supplementary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBund def _duos_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: assert not bundle_fqid.is_batched, bundle_fqid duos_info = self.tdr.get_duos(bundle_fqid.source) + duos_id = None if duos_info is None else one(duos_info['consentGroups'])['datasetIdentifier'] description = None if duos_info is None else duos_info.get('studyDescription') ref, row = self._get_dataset(bundle_fqid.source.spec) expected_entity_id = change_version(bundle_fqid.uuid, @@ -464,7 +465,8 @@ def _duos_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: self.datarepo_row_uuid_version) assert ref.entity_id == expected_entity_id, (ref, bundle_fqid) bundle = TDRAnvilBundle(fqid=bundle_fqid) - bundle.add_entity(ref, self._version, {'description': description}) + entity_row = {'duos_id': duos_id, 'description': description} + bundle.add_entity(ref, self._version, entity_row) # Classify as orphan to suppress the emission of a contribution bundle.add_entity(ref, self._version, dict(row), is_orphan=True) return bundle diff --git a/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json b/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json index 9859b200f1..b029436606 100644 --- a/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json +++ b/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json @@ -2,6 +2,7 @@ "entities": { "anvil_dataset/2370f948-2783-4eb6-afea-e022897f4dcf": { "description": "Study description from DUOS", + "duos_id": "DUOS-000000", "version": "2022-06-01T00:00:00.000000Z" } }, diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py index 1cbc52ba0e..e6f0114dd0 100644 --- a/test/indexer/test_anvil.py +++ b/test/indexer/test_anvil.py @@ -75,7 +75,7 @@ def setUpClass(cls) -> None: mock_duos_url = furl('https:://mock_duos.lan') - duos_id = 'foo' + duos_id = 'DUOS-000000' duos_description = 'Study description from DUOS' @classmethod @@ -93,6 +93,9 @@ def _patch_duos(cls) -> None: } })), Mock(spec=HTTPResponse, status=200, data=json.dumps({ + 'consentGroups': [{ + 'datasetIdentifier': cls.duos_id + }], 'studyDescription': cls.duos_description })) ])) @@ -251,8 +254,9 @@ def test_dataset_description(self): # These fields are populated only in the primary bundle self.assertEqual(dataset_ref.entity_id, contents['document_id']) self.assertEqual(['phs000693'], contents['registered_identifier']) - # This field is populated only in the DUOS bundle + # These fields are populated only in the DUOS bundle self.assertEqual('Study description from DUOS', contents['description']) + self.assertEqual('DUOS-000000', contents['duos_id']) else: self.fail(qualifier) self.assertDictEqual(doc_counts, { diff --git a/test/service/data/verbatim/anvil/pfb_entities.json b/test/service/data/verbatim/anvil/pfb_entities.json index 29d14d974e..7c24f88c61 100644 --- a/test/service/data/verbatim/anvil/pfb_entities.json +++ b/test/service/data/verbatim/anvil/pfb_entities.json @@ -103,6 +103,7 @@ "datarepo_row_id": null, "dataset_id": null, "description": "Study description from DUOS", + "duos_id": "DUOS-000000", "owner": null, "principal_investigator": null, "registered_identifier": null, @@ -265,6 +266,7 @@ "datarepo_row_id": "2370f948-2783-4eb6-afea-e022897f4dcf", "dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739", "description": null, + "duos_id": null, "owner": [ "Debbie Nickerson" ], diff --git a/test/service/data/verbatim/anvil/pfb_schema.json b/test/service/data/verbatim/anvil/pfb_schema.json index 07aee95c55..d27626effc 100644 --- a/test/service/data/verbatim/anvil/pfb_schema.json +++ b/test/service/data/verbatim/anvil/pfb_schema.json @@ -560,6 +560,14 @@ "string" ] }, + { + "name": "duos_id", + "namespace": "anvil_dataset", + "type": [ + "null", + "string" + ] + }, { "name": "owner", "namespace": "anvil_dataset", diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index dac94047db..b54787608a 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -1801,6 +1801,12 @@ def test_compact_manifest(self): '', '' ), + ( + 'datasets.duos_id', + '', + '', + '', + ), ( 'donors.document_id', '',