From b1333e01cf4c64539192a72e3c4d98cf56db42e3 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sun, 21 Apr 2024 21:09:21 +0200 Subject: [PATCH 01/12] Also perform strict checks for new database objects I think that's the missing piece to find all the instances where we might still be committing HDAs without HIDs, and I think we might have a similar bug for DatasetCollectionElements. --- lib/galaxy/model/base.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/lib/galaxy/model/base.py b/lib/galaxy/model/base.py index 8936ca1cf48c..77bc38218f3b 100644 --- a/lib/galaxy/model/base.py +++ b/lib/galaxy/model/base.py @@ -174,18 +174,32 @@ def versioned_objects_strict(iter): yield obj -if os.environ.get("GALAXY_TEST_RAISE_EXCEPTION_ON_HISTORYLESS_HDA"): - log.debug("Using strict flush checks") - versioned_objects = versioned_objects_strict # noqa: F811 +def get_before_flush_handler(): + if os.environ.get("GALAXY_TEST_RAISE_EXCEPTION_ON_HISTORYLESS_HDA"): + log.debug("Using strict flush checks") + + def before_flush(session, flush_context, instances): + for obj in session.new: + if hasattr(obj, "__strict_check_before_flush__"): + obj.__strict_check_before_flush__() + for obj in versioned_objects_strict(session.dirty): + obj.__create_version__(session) + for obj in versioned_objects_strict(session.deleted): + obj.__create_version__(session, deleted=True) + + else: + + def before_flush(session, flush_context, instances): + for obj in versioned_objects(session.dirty): + obj.__create_version__(session) + for obj in versioned_objects(session.deleted): + obj.__create_version__(session, deleted=True) + + return before_flush def versioned_session(session): - @event.listens_for(session, "before_flush") - def before_flush(session, flush_context, instances): - for obj in versioned_objects(session.dirty): - obj.__create_version__(session) - for obj in versioned_objects(session.deleted): - obj.__create_version__(session, deleted=True) + event.listens_for(session, "before_flush")(get_before_flush_handler()) def ensure_object_added_to_session(object_to_add, *, object_in_session=None, session=None) -> bool: From 1b1094be215437c272e241bd4e351ba019ef3e59 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sun, 21 Apr 2024 11:48:46 +0200 Subject: [PATCH 02/12] Set (copied) output hid before queuing job Otherwise we may end up with a history item that either temporarily or permanently lacks a hid. This could currently be encountered via implicit conversion that set `visible=False` in the when calling `datatype.convert_dataset`. --- lib/galaxy/datatypes/data.py | 6 +++++- .../display_applications/parameters.py | 18 +----------------- lib/galaxy/datatypes/protocols.py | 2 ++ lib/galaxy/model/__init__.py | 9 +++------ 4 files changed, 11 insertions(+), 24 deletions(-) diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py index cad936f06b16..1c8c53827839 100644 --- a/lib/galaxy/datatypes/data.py +++ b/lib/galaxy/datatypes/data.py @@ -842,7 +842,11 @@ def convert_dataset( # Make the target datatype available to the converter params["__target_datatype__"] = target_type # Run converter, job is dispatched through Queue - job, converted_datasets, *_ = converter.execute(trans, incoming=params, set_output_hid=visible, history=history) + job, converted_datasets, *_ = converter.execute( + trans, incoming=params, set_output_hid=visible, history=history, flush_job=False + ) + for converted_dataset in converted_datasets.values(): + original_dataset.attach_implicitly_converted_dataset(trans.sa_session, converted_dataset, target_type) trans.app.job_manager.enqueue(job, tool=converter) if len(params) > 0: trans.log_event(f"Converter params: {str(params)}", tool_id=converter.id) diff --git a/lib/galaxy/datatypes/display_applications/parameters.py b/lib/galaxy/datatypes/display_applications/parameters.py index 25e90032f248..7d022d38d1fb 100644 --- a/lib/galaxy/datatypes/display_applications/parameters.py +++ b/lib/galaxy/datatypes/display_applications/parameters.py @@ -11,7 +11,6 @@ from galaxy.datatypes.data import Data from galaxy.model import DatasetInstance -from galaxy.model.base import transaction from galaxy.schema.schema import DatasetState from galaxy.util import string_as_bool from galaxy.util.template import fill_template @@ -182,22 +181,7 @@ def prepare(self, other_values, dataset_hash, user_hash, trans): if target_ext and not converted_dataset: if isinstance(data, DisplayDataValueWrapper): data = data.value - new_data = next( - iter( - data.datatype.convert_dataset( - trans, data, target_ext, return_output=True, visible=False - ).values() - ) - ) - new_data.hid = data.hid - new_data.name = data.name - trans.sa_session.add(new_data) - assoc = trans.app.model.ImplicitlyConvertedDatasetAssociation( - parent=data, file_type=target_ext, dataset=new_data, metadata_safe=False - ) - trans.sa_session.add(assoc) - with transaction(trans.sa_session): - trans.sa_session.commit() + data.datatype.convert_dataset(trans, data, target_ext, return_output=True, visible=False) elif converted_dataset and converted_dataset.state == DatasetState.ERROR: raise Exception(f"Dataset conversion failed for data parameter: {self.name}") return self.get_value(other_values, dataset_hash, user_hash, trans) diff --git a/lib/galaxy/datatypes/protocols.py b/lib/galaxy/datatypes/protocols.py index bbd702538b74..5bc07c20c9f2 100644 --- a/lib/galaxy/datatypes/protocols.py +++ b/lib/galaxy/datatypes/protocols.py @@ -85,5 +85,7 @@ def has_data(self) -> bool: ... def set_peek(self) -> None: ... + def attach_implicitly_converted_dataset(self, session, new_dataset, target_ext: str) -> None: ... + class DatasetHasHidProtocol(DatasetProtocol, HasHid, Protocol): ... diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index 3e78689a2a39..ade3ed17d68c 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -4736,7 +4736,7 @@ def get_converted_dataset(self, trans, target_ext, target_context=None, history= raise NoConverterException(f"A dependency ({dependency}) is missing a converter.") except KeyError: pass # No deps - new_dataset = next( + return next( iter( self.datatype.convert_dataset( trans, @@ -4750,7 +4750,6 @@ def get_converted_dataset(self, trans, target_ext, target_context=None, history= ).values() ) ) - return self.attach_implicitly_converted_dataset(trans.sa_session, new_dataset, target_ext) def attach_implicitly_converted_dataset(self, session, new_dataset, target_ext: str): new_dataset.name = self.name @@ -4760,9 +4759,6 @@ def attach_implicitly_converted_dataset(self, session, new_dataset, target_ext: ) session.add(new_dataset) session.add(assoc) - with transaction(session): - session.commit() - return new_dataset def copy_attributes(self, new_dataset): """ @@ -5134,7 +5130,8 @@ def copy_tags_to(self, copy_tags=None): self.tags.append(copied_tag) def copy_attributes(self, new_dataset): - new_dataset.hid = self.hid + if new_dataset.hid is None: + new_dataset.hid = self.hid def to_library_dataset_dataset_association( self, From e3aba2d2b0b2f325459ad3b86487146646c46e1c Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sun, 21 Apr 2024 22:51:00 +0200 Subject: [PATCH 03/12] Relax requirement for DCEs of collections that are not populated --- lib/galaxy/model/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index ade3ed17d68c..b5297e23656b 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -7146,7 +7146,8 @@ def __init__( self.element_identifier = element_identifier or str(element_index) def __strict_check_before_flush__(self): - assert self.element_object, "Dataset Collection Element without child entity detected, this is not valid" + if self.collection.populated_optimized: + assert self.element_object, "Dataset Collection Element without child entity detected, this is not valid" @property def element_type(self): From 5648478e6d413e1fa606e005f06fe0361bd18ab3 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sun, 21 Apr 2024 22:51:36 +0200 Subject: [PATCH 04/12] Adjust schema for collection elements of collections that aren't populated --- client/src/api/schema/schema.ts | 7 ++++--- lib/galaxy/schema/schema.py | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/client/src/api/schema/schema.ts b/client/src/api/schema/schema.ts index 817248b6b433..fa5d31f0fa3e 100644 --- a/client/src/api/schema/schema.ts +++ b/client/src/api/schema/schema.ts @@ -3751,7 +3751,7 @@ export interface components { * Element Type * @description The type of the element. Used to interpret the `object` field. */ - element_type: components["schemas"]["DCEType"]; + element_type?: components["schemas"]["DCEType"] | null; /** * Dataset Collection Element ID * @example 0123456789ABCDEF @@ -3767,10 +3767,11 @@ export interface components { * Object * @description The element's specific data depending on the value of `element_type`. */ - object: + object?: | components["schemas"]["HDAObject"] | components["schemas"]["HDADetailed"] - | components["schemas"]["DCObject"]; + | components["schemas"]["DCObject"] + | null; }; /** * DCEType diff --git a/lib/galaxy/schema/schema.py b/lib/galaxy/schema/schema.py index 277bb82f481e..5badff543962 100644 --- a/lib/galaxy/schema/schema.py +++ b/lib/galaxy/schema/schema.py @@ -971,13 +971,13 @@ class DCESummary(Model, WithModelClass): title="Element Identifier", description="The actual name of this element.", ) - element_type: DCEType = Field( - ..., + element_type: Optional[DCEType] = Field( + None, title="Element Type", description="The type of the element. Used to interpret the `object` field.", ) - object: Union[HDAObject, HDADetailed, DCObject] = Field( - ..., + object: Optional[Union[HDAObject, HDADetailed, DCObject]] = Field( + None, title="Object", description="The element's specific data depending on the value of `element_type`.", ) From bf3499734a123423a6d77aad2709d4181afbf2dc Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sun, 21 Apr 2024 23:44:40 +0200 Subject: [PATCH 05/12] Fix unitialize typo --- lib/galaxy/managers/collections.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/galaxy/managers/collections.py b/lib/galaxy/managers/collections.py index 8db9037b55a5..75993f74403b 100644 --- a/lib/galaxy/managers/collections.py +++ b/lib/galaxy/managers/collections.py @@ -95,7 +95,7 @@ def precreate_dataset_collection_instance( # TODO: prebuild all required HIDs and send them in so no need to flush in between. dataset_collection = self.precreate_dataset_collection( structure, - allow_unitialized_element=implicit_output_name is not None, + allow_uninitialized_element=implicit_output_name is not None, completed_collection=completed_collection, implicit_output_name=implicit_output_name, ) @@ -112,10 +112,10 @@ def precreate_dataset_collection_instance( return instance def precreate_dataset_collection( - self, structure, allow_unitialized_element=True, completed_collection=None, implicit_output_name=None + self, structure, allow_uninitialized_element=True, completed_collection=None, implicit_output_name=None ): has_structure = not structure.is_leaf and structure.children_known - if not has_structure and allow_unitialized_element: + if not has_structure and allow_uninitialized_element: dataset_collection = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT elif not has_structure: collection_type_description = structure.collection_type_description @@ -143,7 +143,7 @@ def precreate_dataset_collection( element = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT else: element = self.precreate_dataset_collection( - substructure, allow_unitialized_element=allow_unitialized_element + substructure, allow_uninitialized_element=allow_uninitialized_element ) element = model.DatasetCollectionElement( From 5aecc0828222bb70f137273f30bfd3016c012339 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sun, 21 Apr 2024 23:45:05 +0200 Subject: [PATCH 06/12] Fall back to populated method if collection not committed yet --- lib/galaxy/model/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index b5297e23656b..4153c004fe44 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -6359,6 +6359,8 @@ def has_deferred_data(self): @property def populated_optimized(self): if not hasattr(self, "_populated_optimized"): + if not self.id: + return self.populated _populated_optimized = True if ":" not in self.collection_type: _populated_optimized = self.populated_state == DatasetCollection.populated_states.OK From d88900210ac7421eaa730e4575ab0fd1558f767b Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sun, 21 Apr 2024 23:55:41 +0200 Subject: [PATCH 07/12] Commit only after copying library item to history --- lib/galaxy/model/__init__.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index 4153c004fe44..54305c2cc116 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -5818,11 +5818,9 @@ def to_history_dataset_association(self, target_history, parent_id=None, add_to_ tag_manager = galaxy.model.tags.GalaxyTagHandler(sa_session) src_ldda_tags = tag_manager.get_tags_str(self.tags) - tag_manager.apply_item_tags(user=self.user, item=hda, tags_str=src_ldda_tags) + tag_manager.apply_item_tags(user=self.user, item=hda, tags_str=src_ldda_tags, flush=False) sa_session.add(hda) - with transaction(sa_session): - sa_session.commit() - hda.metadata = self.metadata # need to set after flushed, as MetadataFiles require dataset.id + hda.metadata = self.metadata if add_to_history and target_history: target_history.add_dataset(hda) with transaction(sa_session): From e1a45a226e2e3e4511833c32de1ab264f41e7c30 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sun, 21 Apr 2024 23:56:43 +0200 Subject: [PATCH 08/12] Drop unused copy_from_ldda method --- test/unit/app/managers/test_HDAManager.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test/unit/app/managers/test_HDAManager.py b/test/unit/app/managers/test_HDAManager.py index 6c7300a44271..226b1e8d70d9 100644 --- a/test/unit/app/managers/test_HDAManager.py +++ b/test/unit/app/managers/test_HDAManager.py @@ -133,13 +133,6 @@ def test_copy_from_hda(self): hda3_annotation = self.hda_manager.annotation(hda3) assert annotation == hda3_annotation - # def test_copy_from_ldda( self ): - # owner = self.user_manager.create( self.trans, **user2_data ) - # history1 = self.history_mgr.create( self.trans, name='history1', user=owner ) - # - # self.log( "should be able to copy an HDA" ) - # hda2 = self.hda_manager.copy_ldda( history1, hda1 ) - def test_delete(self): owner = self.user_manager.create(**user2_data) history1 = self.history_manager.create(name="history1", user=owner) From a56f642631e778b0ba77474cd1ed31ffdc7002bd Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Mon, 22 Apr 2024 00:04:50 +0200 Subject: [PATCH 09/12] Drop unused method --- lib/galaxy/managers/hdas.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lib/galaxy/managers/hdas.py b/lib/galaxy/managers/hdas.py index 3be812fcf0e8..5fc02313c509 100644 --- a/lib/galaxy/managers/hdas.py +++ b/lib/galaxy/managers/hdas.py @@ -221,12 +221,6 @@ def copy( return copy - def copy_ldda(self, history, ldda, **kwargs): - """ - Copy this HDA as a LDDA and return. - """ - return ldda.to_history_dataset_association(history, add_to_history=True) - # .... deletion and purging def purge(self, hda, flush=True, **kwargs): if self.app.config.enable_celery_tasks: From badbd3ae78cb7cb51d5365639a99e836d1a3bd53 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Mon, 22 Apr 2024 11:06:10 +0200 Subject: [PATCH 10/12] Fix premature flush in hda_manager.copy --- lib/galaxy/managers/hdas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/managers/hdas.py b/lib/galaxy/managers/hdas.py index 5fc02313c509..9432e54b7805 100644 --- a/lib/galaxy/managers/hdas.py +++ b/lib/galaxy/managers/hdas.py @@ -202,7 +202,7 @@ def copy( if not isinstance(item, model.HistoryDatasetAssociation): raise TypeError() hda = item - copy = hda.copy(parent_id=kwargs.get("parent_id"), copy_hid=False, copy_tags=hda.tags, flush=flush) + copy = hda.copy(parent_id=kwargs.get("parent_id"), copy_hid=False, copy_tags=hda.tags, flush=False) if hide_copy: copy.visible = False if history: From c2140bd00c23cf8c02c69c7f09c237689126e301 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Mon, 22 Apr 2024 13:32:09 +0200 Subject: [PATCH 11/12] Avoid flush in set_meta by using MetadataFile.uuid --- lib/galaxy/model/deferred.py | 2 +- lib/galaxy/model/metadata.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/lib/galaxy/model/deferred.py b/lib/galaxy/model/deferred.py index e72d4e126aca..cea7a0eba17a 100644 --- a/lib/galaxy/model/deferred.py +++ b/lib/galaxy/model/deferred.py @@ -165,7 +165,7 @@ def ensure_materialized( if transient_paths: metadata_tmp_files_dir = transient_paths.metadata_files_dir else: - # If metadata_tmp_files_dir is set we generate a MetdataTempFile, + # If metadata_tmp_files_dir is set we generate a MetadataTempFile, # which we don't want when we're generating an attached materialized dataset instance metadata_tmp_files_dir = None materialized_dataset_instance.set_meta(metadata_tmp_files_dir=metadata_tmp_files_dir) diff --git a/lib/galaxy/model/metadata.py b/lib/galaxy/model/metadata.py index b9fb7839cdbb..5fdfd8a49678 100644 --- a/lib/galaxy/model/metadata.py +++ b/lib/galaxy/model/metadata.py @@ -670,7 +670,7 @@ def from_external_value(self, value, parent, path_rewriter=None): # directory. Correct. file_name = path_rewriter(file_name) mf.update_from_file(file_name) - value = mf.id + value = str(mf.uuid) return value def to_external_value(self, value): @@ -692,11 +692,9 @@ def new_file(self, dataset=None, metadata_tmp_files_dir=None, **kwds): sa_session = object_session(dataset) if sa_session: sa_session.add(mf) - with transaction(sa_session): - sa_session.commit() # commit to assign id return mf else: - # we need to make a tmp file that is accessable to the head node, + # we need to make a tmp file that is accessible to the head node, # we will be copying its contents into the MetadataFile objects filename after restoring from JSON # we do not include 'dataset' in the kwds passed, as from_JSON_value() will handle this for us return MetadataTempFile(metadata_tmp_files_dir=metadata_tmp_files_dir, **kwds) From 121f27e2f4b9b90b14d64a103d963bd5fa3a753c Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Mon, 22 Apr 2024 16:09:22 +0200 Subject: [PATCH 12/12] Default HID to -1 if for whatever reason we didn't record a HID --- lib/galaxy/managers/hdas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/galaxy/managers/hdas.py b/lib/galaxy/managers/hdas.py index 9432e54b7805..f92a75b56f24 100644 --- a/lib/galaxy/managers/hdas.py +++ b/lib/galaxy/managers/hdas.py @@ -555,6 +555,7 @@ def add_serializers(self): annotatable.AnnotatableSerializerMixin.add_serializers(self) serializers: Dict[str, base.Serializer] = { + "hid": lambda item, key, **context: item.hid if item.hid is not None else -1, "model_class": lambda item, key, **context: "HistoryDatasetAssociation", "history_content_type": lambda item, key, **context: "dataset", "hda_ldda": lambda item, key, **context: "hda",