class Migrator(MigratorBase):
    r"""
    Migrates a database between two schemas.

    Reference: https://pypi.org/project/nmdc-schema/#history
    """

    _from_version = "11.0.3"
    _to_version = "11.1.0"

    def upgrade(self) -> None:
        r"""
        Migrates the database from conforming to the original schema, to conforming to the new schema.

        The work is delegated to the partial migrators returned by `get_migrator_classes`.
        Each one is instantiated with this migrator's adapter and logger, and its `upgrade`
        method is invoked, in the order in which that function returns the classes.
        """

        partial_classes = get_migrator_classes()
        total = len(partial_classes)

        # Positions are 1-based purely for the benefit of the log messages.
        for position, partial_class in enumerate(partial_classes, start=1):
            self.logger.info(f"Running migrator {position} of {total}")

            origin = partial_class.get_origin_version()
            destination = partial_class.get_destination_version()
            self.logger.debug(f"Migrating from {origin} to {destination}")

            partial = partial_class(adapter=self.adapter, logger=self.logger)
            partial.upgrade()
class Migrator(MigratorBase):
    r"""
    Migrates a database between two schemas.

    This migrator removes the `has_calibration` field from every document in the
    `workflow_execution_set` collection that has one (the slot was used by the `NomAnalysis`
    and `MetabolomicsAnalysis` classes), and moves the information to the `has_calibration`
    slot of the corresponding document in the `data_generation_set` collection.

    The value written to the data generation document is the `id` of the `calibration_set`
    document whose `calibration_object` is the data object that the workflow execution
    document referred to.

    The creation of this migrator was in response to this issue:
    https://github.com/microbiomedata/nmdc-schema/issues/2139

    """

    _from_version = "11.0.3"
    _to_version = "11.1.0.part_1"

    def upgrade(self) -> None:
        r"""
        Migrates the database from conforming to the original schema, to conforming to the new schema.

        The migration makes two passes: the first strips `has_calibration` from the workflow
        execution documents while recording which calibration belongs to which data generation;
        the second writes the recorded calibrations onto the data generation documents.

        >>> from nmdc_schema.migrators.adapters.dictionary_adapter import DictionaryAdapter
        >>> db = {
        ...    'workflow_execution_set': [
        ...        {'id': 'nmdc:wfx1', 'has_calibration': 'nmdc:dobj-13-abc123', 'was_informed_by': 'nmdc:dgen1', 'type': 'nmdc:MetabolomicsAnalysis'},
        ...        {'id': 'nmdc:wfx2', 'has_calibration': 'false', 'was_informed_by': 'nmdc:dgen2', 'type': 'nmdc:NomAnalysis'},
        ...        {'id': 'nmdc:wfx3', 'was_informed_by': 'nmdc:dgen3', 'type': 'nmdc:MetabolomicsAnalysis'}
        ...    ],
        ...    'data_generation_set': [
        ...        {'id': 'nmdc:dgen1'},
        ...        {'id': 'nmdc:dgen2'},
        ...        {'id': 'nmdc:dgen3'}
        ...    ],
        ...    'data_object_set': [
        ...        {'id': 'nmdc:dobj-13-abc123'}
        ...    ],
        ...    'calibration_set': [
        ...        {'id': 'nmdc:calib1', 'calibration_object': 'nmdc:dobj-13-abc123'}
        ...    ]
        ... }
        >>> a = DictionaryAdapter(database=db)
        >>> m = Migrator(adapter=a)
        >>> m.upgrade()
        >>> any('has_calibration' in doc for doc in db['workflow_execution_set'])  # Calibrations removed from workflow
        False
        >>> db['data_generation_set'][0]  # Calibration moved to data generation
        {'id': 'nmdc:dgen1', 'has_calibration': 'nmdc:calib1'}
        >>> db['data_generation_set'][1]  # No calibration added when value was 'false'
        {'id': 'nmdc:dgen2'}
        """

        # Start from an empty mapping so that the second pass works even when no workflow
        # execution carries a calibration, and so that a repeated run never reuses stale entries.
        self.calibration_mappings = {}

        self.adapter.process_each_document(
            collection_name="workflow_execution_set",
            pipeline=[self.store_and_remove_calibrations],
        )
        self.adapter.process_each_document(
            collection_name="data_generation_set",
            pipeline=[self.update_data_gen_calibration],
        )

    def check_has_calibration(self, has_calibration_value) -> bool:
        r"""
        Checks for a valid data object id format (starts with 'nmdc:dobj').

        Values that are not strings are reported as invalid rather than raising.

        >>> from nmdc_schema.migrators.adapters.dictionary_adapter import DictionaryAdapter
        >>> db = {}
        >>> a = DictionaryAdapter(database=db)
        >>> m = Migrator(adapter=a)
        >>> m.check_has_calibration('nmdc:dobj-13-abc123')  # Valid format
        True
        >>> m.check_has_calibration('false')  # Invalid format
        False
        >>> m.check_has_calibration('nmdc:something-else')  # Invalid format
        False
        >>> m.check_has_calibration(None)  # Not a string
        False
        """

        return isinstance(has_calibration_value, str) and has_calibration_value.startswith("nmdc:dobj")

    def check_for_valid_data_object(self, data_obj_id) -> bool:
        r"""
        Checks database for valid data object. Returns False if not valid

        A data object is considered valid when the `data_object_set` collection contains
        a document whose `id` equals `data_obj_id`.

        >>> from nmdc_schema.migrators.adapters.dictionary_adapter import DictionaryAdapter
        >>> db = {
        ...    'data_object_set': [
        ...        {'id': 'nmdc:dobj-13-abc123'},
        ...        {'id': 'nmdc:dobj-13-def456'}
        ...    ]
        ... }
        >>> a = DictionaryAdapter(database=db)
        >>> m = Migrator(adapter=a)
        >>> m.check_for_valid_data_object('nmdc:dobj-13-abc123')  # Exists in database
        True
        >>> m.check_for_valid_data_object('nmdc:dobj-13-nonexistent')  # Doesn't exist
        False
        """

        data_obj_doc = self.adapter.get_document_having_value_in_field(
            collection_name="data_object_set", field_name="id", value=data_obj_id
        )

        return data_obj_doc is not None

    def store_and_remove_calibrations(self, workflow_execution_doc: dict) -> dict:
        r"""
        Removes the `has_calibration` field from the `WorkflowExecution` document and records
        which calibration should later be written to the corresponding `DataGeneration` document.

        - Documents without a `has_calibration` field are returned unchanged.
        - A string value of "false" (any letter case) is simply dropped.
        - A data object id is resolved to the `calibration_set` document that references it,
          and the pair (data generation id, calibration id) is stored in `self.calibration_mappings`
          for `update_data_gen_calibration` to consume.

        Raises `ValueError` when the value is neither "false" nor a data object id, when the data
        object does not exist, when the data generation referenced by `was_informed_by` does not
        exist, or when no calibration references the data object.

        >>> from nmdc_schema.migrators.adapters.dictionary_adapter import DictionaryAdapter
        >>> db = {
        ...    'workflow_execution_set': [
        ...        {'id': 'nmdc:wfx1', 'has_calibration': 'nmdc:dobj-13-abc123', 'was_informed_by': 'nmdc:dgen1'},
        ...        {'id': 'nmdc:wfx2', 'has_calibration': 'false', 'was_informed_by': 'nmdc:dgen2'}
        ...    ],
        ...    'data_generation_set': [
        ...        {'id': 'nmdc:dgen1'},
        ...        {'id': 'nmdc:dgen2'}
        ...    ],
        ...    'data_object_set': [
        ...        {'id': 'nmdc:dobj-13-abc123'}
        ...    ],
        ...    'calibration_set': [
        ...        {'id': 'nmdc:calib1', 'calibration_object': 'nmdc:dobj-13-abc123'}
        ...    ]
        ... }
        >>> a = DictionaryAdapter(database=db)
        >>> m = Migrator(adapter=a)
        >>> workflow_execution_doc = {'id': 'nmdc:wfx1', 'has_calibration': 'nmdc:dobj-13-abc123', 'was_informed_by': 'nmdc:dgen1'}
        >>> m.store_and_remove_calibrations(workflow_execution_doc)
        {'id': 'nmdc:wfx1', 'was_informed_by': 'nmdc:dgen1'}
        >>> workflow_execution_doc = {'id': 'nmdc:wfx2', 'has_calibration': 'false', 'was_informed_by': 'nmdc:dgen2'}
        >>> m.store_and_remove_calibrations(workflow_execution_doc)
        {'id': 'nmdc:wfx2', 'was_informed_by': 'nmdc:dgen2'}
        >>> workflow_execution_doc = {'id': 'nmdc:wfx3', 'has_calibration': 'invalid', 'was_informed_by': 'nmdc:dgen3'}
        >>> m.store_and_remove_calibrations(workflow_execution_doc)  # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
        ValueError: The 'has_calibration' value (invalid) in document (nmdc:wfx3) is not recognized
        """

        if "has_calibration" not in workflow_execution_doc:
            return workflow_execution_doc

        has_calibration_data_obj_id = workflow_execution_doc["has_calibration"]

        # A string value of "false" carries no calibration to move; the slot is only dropped.
        is_false_flag = (
            isinstance(has_calibration_data_obj_id, str)
            and has_calibration_data_obj_id.lower() == "false"
        )

        if not is_false_flag:
            # Anything that is neither "false" nor a data object id (including
            # non-string values) is rejected with the same error.
            if not self.check_has_calibration(has_calibration_data_obj_id):
                raise ValueError(f"The 'has_calibration' value ({has_calibration_data_obj_id}) in document "
                                 f"({workflow_execution_doc['id']}) is not recognized")

            if not self.check_for_valid_data_object(has_calibration_data_obj_id):
                raise ValueError(f"The 'has_calibration' value ({has_calibration_data_obj_id}) in document "
                                 f"({workflow_execution_doc['id']}) is not a valid data object. The data object does not exist")

            data_gen_id = workflow_execution_doc["was_informed_by"]
            data_gen_doc = self.adapter.get_document_having_value_in_field(
                collection_name="data_generation_set", field_name="id", value=data_gen_id
            )
            if data_gen_doc is None:
                # Fail with a descriptive message instead of a TypeError on `None["id"]`.
                raise ValueError(f"The 'was_informed_by' value ({data_gen_id}) in document "
                                 f"({workflow_execution_doc['id']}) does not match any data generation")

            calibration_doc = self.adapter.get_document_having_value_in_field(
                collection_name="calibration_set",
                field_name="calibration_object",
                value=has_calibration_data_obj_id,
            )
            if calibration_doc is None:
                raise ValueError(f"The 'has_calibration' value ({has_calibration_data_obj_id}) in document "
                                 f"({workflow_execution_doc['id']}) is not the calibration object of any calibration")

            # The mapping normally exists already (`upgrade` creates it); create it lazily
            # so that this method can also be called on its own.
            if not hasattr(self, "calibration_mappings"):
                self.calibration_mappings = {}

            # NOTE: if several workflow executions were informed by the same data
            # generation, the calibration recorded last wins.
            self.calibration_mappings[data_gen_doc["id"]] = calibration_doc["id"]

        # Remove the slot only after its content has been recorded (or deliberately discarded).
        workflow_execution_doc.pop("has_calibration")

        return workflow_execution_doc

    def update_data_gen_calibration(self, data_gen_doc: dict) -> dict:
        r"""
        Updates data generation documents with calibration information

        When a calibration was recorded for the document's `id` by `store_and_remove_calibrations`,
        that calibration id is written to the document's `has_calibration` field. Otherwise,
        including when no calibrations have been recorded at all, the document is returned unchanged.

        >>> from nmdc_schema.migrators.adapters.dictionary_adapter import DictionaryAdapter
        >>> db = {
        ...    'workflow_execution_set': [
        ...        {'id': 'nmdc:wfx1', 'has_calibration': 'nmdc:dobj-13-abc123', 'was_informed_by': 'nmdc:dgen1', 'type': 'nmdc:MetabolomicsAnalysis'}
        ...    ],
        ...    'data_generation_set': [
        ...        {'id': 'nmdc:dgen1'},
        ...        {'id': 'nmdc:dgen2'}  # doc without corresponding calibration
        ...    ],
        ...    'data_object_set': [
        ...        {'id': 'nmdc:dobj-13-abc123'}
        ...    ],
        ...    'calibration_set': [
        ...        {'id': 'nmdc:calib1', 'calibration_object': 'nmdc:dobj-13-abc123'}
        ...    ]
        ... }
        >>> a = DictionaryAdapter(database=db)
        >>> m = Migrator(adapter=a)
        >>> # First store calibrations
        >>> workflow_execution_doc = {'id': 'nmdc:wfx1', 'has_calibration': 'nmdc:dobj-13-abc123', 'was_informed_by': 'nmdc:dgen1', 'type': 'nmdc:MetabolomicsAnalysis'}
        >>> _ = m.store_and_remove_calibrations(workflow_execution_doc)  # Store the calibrations first
        >>> # Then test update_data_gen_calibration
        >>> m.update_data_gen_calibration({'id': 'nmdc:dgen1'})  # doc with corresponding calibration
        {'id': 'nmdc:dgen1', 'has_calibration': 'nmdc:calib1'}
        >>> # Test document without calibration
        >>> m.update_data_gen_calibration({'id': 'nmdc:dgen2'})  # doc without corresponding calibration
        {'id': 'nmdc:dgen2'}
        >>> # A migrator that has not recorded any calibration leaves documents untouched
        >>> Migrator(adapter=a).update_data_gen_calibration({'id': 'nmdc:dgen1'})
        {'id': 'nmdc:dgen1'}
        """

        # The attribute is absent when nothing has been recorded yet; treat that as "no mappings"
        # instead of failing with an AttributeError.
        recorded_calibrations = getattr(self, "calibration_mappings", {})

        data_gen_id = data_gen_doc["id"]
        if data_gen_id in recorded_calibrations:
            data_gen_doc["has_calibration"] = recorded_calibrations[data_gen_id]
        return data_gen_doc
has_calibration: nmdc:calib-l2k-9d6j3 has_metabolite_identifications: - type: nmdc:MetaboliteIdentification highest_similarity_score: 0.88 @@ -3901,7 +3900,6 @@ workflow_execution_set: git_url: https://github.com/microbiomedata/nom_analysis/releases/tag/v0.3.2 was_informed_by: nmdc:dgms-12-dfa74b started_at_time: '2023-08-08T09:30:00Z' - has_calibration: nmdc:calib-99-v9w6 data_generation_set: - id: nmdc:dgms-99-zUCd5N type: nmdc:MassSpectrometry diff --git a/src/schema/nmdc.yaml b/src/schema/nmdc.yaml index 6fd123e6ec..bd0aeb746d 100644 --- a/src/schema/nmdc.yaml +++ b/src/schema/nmdc.yaml @@ -255,6 +255,48 @@ classes: syntax: "{id_nmdc_prefix}:chrcon-{id_shoulder}-{id_blade}$" interpolated: true + CalibrationInformation: + class_uri: nmdc:CalibrationInformation + is_a: InformationObject + description: A calibration object that is associated with a process. + slots: + - calibration_object + - internal_calibration + - calibration_target + - calibration_standard + rules: + - title: calibration_standard_if_rt + description: >- + If the calibration_target is retention_index, a calibration_standard is required. + preconditions: + slot_conditions: + calibration_target: + equals_string: retention_index + postconditions: + slot_conditions: + calibration_standard: + required: true + - title: calibration_object_if_not_internal_calibration + description: >- + If internal_calibration is false, a calibration_object is required. 
+ preconditions: + slot_conditions: + internal_calibration: false + postconditions: + slot_conditions: + calibration_object: + required: true + slot_usage: + internal_calibration: + required: true + calibration_target: + required: true + id: + structured_pattern: + syntax: "{id_nmdc_prefix}:calib-{id_shoulder}-{id_blade}$" + interpolated: true + + FunctionalAnnotationAggMember: class_uri: nmdc:FunctionalAnnotationAggMember slots: @@ -661,7 +703,28 @@ classes: interpolated: true enums: + CalibrationTargetEnum: + permissible_values: + mass_charge_ratio: + title: m/z + aliases: + - Mass + - m/z + retention_time: + aliases: + - RT + retention_index: + aliases: + - RI + CalibrationStandardEnum: + permissible_values: + fames: + aliases: + - FAMES + alkanes: + aliases: + - Alkanes StrandedOrientationEnum: description: This enumeration specifies information about stranded RNA library preparations. @@ -885,6 +948,26 @@ enums: slots: + has_calibration: + range: CalibrationInformation + description: a calibration instance associated with a process + + calibration_object: + range: DataObject + description: the file containing calibration data object + + internal_calibration: + range: boolean + description: whether internal calibration was used, if false, external calibration was used + + calibration_target: + range: CalibrationTargetEnum + description: the target measurement of the calibration + + calibration_standard: + range: CalibrationStandardEnum + description: the reference standard(s) used for calibration + polarity_mode: range: PolarityModeEnum description: the polarity of which ions are generated and detected diff --git a/src/schema/workflow_execution_activity.yaml b/src/schema/workflow_execution_activity.yaml index 1e5814121f..5d7fcc1f0d 100644 --- a/src/schema/workflow_execution_activity.yaml +++ b/src/schema/workflow_execution_activity.yaml @@ -264,7 +264,6 @@ classes: in_subset: - workflow subset slots: - - has_calibration - has_metabolite_identifications 
slot_usage: id: @@ -300,8 +299,6 @@ classes: is_a: WorkflowExecution in_subset: - workflow subset - slots: - - has_calibration slot_usage: id: required: true @@ -313,47 +310,6 @@ classes: syntax: "{id_nmdc_prefix}:(omprc|dgms)-{id_shoulder}-{id_blade}$" interpolated: true - CalibrationInformation: - class_uri: nmdc:CalibrationInformation - is_a: InformationObject - description: A calibration object that is associated with a process. - slots: - - calibration_object - - internal_calibration - - calibration_target - - calibration_standard - rules: - - title: calibration_standard_if_rt - description: >- - If the calibration_target is retention_index, a calibration_standard is required. - preconditions: - slot_conditions: - calibration_target: - equals_string: retention_index - postconditions: - slot_conditions: - calibration_standard: - required: true - - title: calibration_object_if_not_internal_calibration - description: >- - If internal_calibration is false, a calibration_object is required. - preconditions: - slot_conditions: - internal_calibration: false - postconditions: - slot_conditions: - calibration_object: - required: true - slot_usage: - internal_calibration: - required: true - calibration_target: - required: true - id: - structured_pattern: - syntax: "{id_nmdc_prefix}:calib-{id_shoulder}-{id_blade}$" - interpolated: true - slots: metagenome_assembly_parameter: @@ -594,58 +550,7 @@ slots: description: >- TODO - has_calibration: - any_of: - - range: CalibrationInformation - - range: string - description: a calibration instance associated with a process - notes: >- - has_calibration slot will be removed from all WorkflowExecution classes but remain on the - MassSpectrometry class after an ingest of the appropriate set has occurred. - Once this has occurred, this slot's range can be updated to CalibrationInformation and class/slot definitions can move to nmdc.yaml. - See PR #29 in Berkeley schema. 
- - calibration_object: - range: DataObject - description: the file containing calibration data object - - internal_calibration: - range: boolean - description: whether internal calibration was used, if false, external calibration was used - - calibration_target: - range: CalibrationTargetEnum - description: the target measurement of the calibration - - calibration_standard: - range: CalibrationStandardEnum - description: the reference standard(s) used for calibration - has_metabolite_identifications: range: MetaboliteIdentification multivalued: true inlined_as_list: true - -enums: - CalibrationTargetEnum: - permissible_values: - mass_charge_ratio: - title: m/z - aliases: - - Mass - - m/z - retention_time: - aliases: - - RT - retention_index: - aliases: - - RI - - CalibrationStandardEnum: - permissible_values: - fames: - aliases: - - FAMES - alkanes: - aliases: - - Alkanes