From d8f34aa80f6de3d673770cb07b874f74630d62d8 Mon Sep 17 00:00:00 2001 From: Luke Hsiao Date: Wed, 6 Feb 2019 15:37:22 -0800 Subject: [PATCH] fix: synchronize session on all deletes This adds session synchronization to all delete queries in fonduer. Note that this does have performance implications. Specifically: > The 'fetch' strategy results in an additional SELECT statement emitted > and will significantly reduce performance. from [1]. [1]: https://docs.sqlalchemy.org/en/latest/orm/query.html Closes #213. --- src/fonduer/candidates/candidates.py | 6 ++++-- src/fonduer/candidates/mentions.py | 11 +++++++---- src/fonduer/features/featurizer.py | 4 ++-- src/fonduer/parser/parser.py | 2 +- src/fonduer/supervision/labeler.py | 4 ++-- tests/candidates/test_candidates.py | 6 +++--- tests/e2e/test_e2e.py | 4 ++-- 7 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/fonduer/candidates/candidates.py b/src/fonduer/candidates/candidates.py index edaccce22..a813f42f5 100644 --- a/src/fonduer/candidates/candidates.py +++ b/src/fonduer/candidates/candidates.py @@ -126,7 +126,7 @@ def clear(self, split): ) self.session.query(Candidate).filter( Candidate.type == candidate_class.__tablename__ - ).filter(Candidate.split == split).delete() + ).filter(Candidate.split == split).delete(synchronize_session="fetch") def clear_all(self, split): """Delete ALL Candidates from given split the database. @@ -135,7 +135,9 @@ def clear_all(self, split): :type split: int """ logger.info("Clearing ALL Candidates.") - self.session.query(Candidate).filter(Candidate.split == split).delete() + self.session.query(Candidate).filter(Candidate.split == split).delete( + synchronize_session="fetch" + ) def get_candidates(self, docs=None, split=0, sort=False): """Return a list of lists of the candidates associated with this extractor. diff --git a/src/fonduer/candidates/mentions.py b/src/fonduer/candidates/mentions.py index 862ad9072..eb37f6d17 100644 --- a/src/fonduer/candidates/mentions.py +++ b/src/fonduer/candidates/mentions.py @@ -460,21 +460,24 @@ def clear(self): logger.info(f"Clearing table: {mention_class.__tablename__}") self.session.query(Mention).filter_by( type=mention_class.__tablename__ - ).delete() + ).delete(synchronize_session="fetch") # Next, clear the Candidates. This is done manually because we have # no cascading relationship from candidate_subclass to Candidate. for cand_subclass in cand_subclasses: logger.info(f"Cascading to clear table: {cand_subclass}") - self.session.query(Candidate).filter_by(type=cand_subclass).delete() + self.session.query(Candidate).filter_by(type=cand_subclass).delete( + synchronize_session="fetch" + ) def clear_all(self): """Delete all Mentions from given split the database.""" logger.info("Clearing ALL Mentions.") - self.session.query(Mention).delete() + self.session.query(Mention).delete(synchronize_session="fetch") # With no Mentions, there should be no Candidates also - self.session.query(Candidate).delete() + self.session.query(Candidate).delete(synchronize_session="fetch") + logger.info("Cleared ALL Mentions (and Candidates).") def get_mentions(self, docs=None, sort=False): """Return a list of lists of the mentions associated with this extractor. diff --git a/src/fonduer/features/featurizer.py b/src/fonduer/features/featurizer.py index 030faff77..fb0ae1a29 100644 --- a/src/fonduer/features/featurizer.py +++ b/src/fonduer/features/featurizer.py @@ -202,8 +202,8 @@ def clear(self, train=False, split=0): def clear_all(self): """Delete all Features.""" logger.info("Clearing ALL Features and FeatureKeys.") - self.session.query(Feature).delete() - self.session.query(FeatureKey).delete() + self.session.query(Feature).delete(synchronize_session="fetch") + self.session.query(FeatureKey).delete(synchronize_session="fetch") def get_feature_matrices(self, cand_lists): """Load sparse matrix of Features for each candidate_class. diff --git a/src/fonduer/parser/parser.py b/src/fonduer/parser/parser.py index 93364120c..c748b8f01 100644 --- a/src/fonduer/parser/parser.py +++ b/src/fonduer/parser/parser.py @@ -115,7 +115,7 @@ def clear(self, pdf_path=None): :param pdf_path: This parameter is ignored. """ - self.session.query(Context).delete() + self.session.query(Context).delete(synchronize_session="fetch") def get_last_documents(self): """Return the most recently parsed list of ``Documents``. diff --git a/src/fonduer/supervision/labeler.py b/src/fonduer/supervision/labeler.py index 9bdc793ea..2f99407ea 100644 --- a/src/fonduer/supervision/labeler.py +++ b/src/fonduer/supervision/labeler.py @@ -245,8 +245,8 @@ def clear(self, train, split, lfs=None): def clear_all(self): """Delete all Labels.""" logger.info("Clearing ALL Labels and LabelKeys.") - self.session.query(Label).delete() - self.session.query(LabelKey).delete() + self.session.query(Label).delete(synchronize_session="fetch") + self.session.query(LabelKey).delete(synchronize_session="fetch") def get_gold_labels(self, cand_lists, annotator=None): """Load sparse matrix of GoldLabels for each candidate_class. diff --git a/tests/candidates/test_candidates.py b/tests/candidates/test_candidates.py index 0ca4d6824..7f2eca29b 100644 --- a/tests/candidates/test_candidates.py +++ b/tests/candidates/test_candidates.py @@ -259,7 +259,7 @@ def do_nothing_matcher(fig): # Delete from parent class should cascade to child x = session.query(Candidate).first() - session.query(Candidate).filter_by(id=x.id).delete() + session.query(Candidate).filter_by(id=x.id).delete(synchronize_session="fetch") assert session.query(PartTemp).count() == 3878 assert session.query(Candidate).count() == 3878 @@ -415,7 +415,7 @@ def do_nothing_matcher(fig): assert len(docs[0].temps) == 24 # Test that deletion of a Candidate does not delete the Mention - session.query(PartTemp).delete() + session.query(PartTemp).delete(synchronize_session="fetch") assert session.query(PartTemp).count() == 0 assert session.query(Temp).count() == 136 assert session.query(Part).count() == 234 @@ -423,7 +423,7 @@ def do_nothing_matcher(fig): # Test deletion of Candidate if Mention is deleted assert session.query(PartVolt).count() == 3266 assert session.query(Volt).count() == 107 - session.query(Volt).delete() + session.query(Volt).delete(synchronize_session="fetch") assert session.query(Volt).count() == 0 assert session.query(PartVolt).count() == 0 diff --git a/tests/e2e/test_e2e.py b/tests/e2e/test_e2e.py index bc66f7605..43f26e0a5 100644 --- a/tests/e2e/test_e2e.py +++ b/tests/e2e/test_e2e.py @@ -269,8 +269,8 @@ def test_e2e(caplog): # Removing the last relation from a key should delete the row featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp]) assert session.query(FeatureKey).count() == 1177 - session.query(Feature).delete() - session.query(FeatureKey).delete() + session.query(Feature).delete(synchronize_session="fetch") + session.query(FeatureKey).delete(synchronize_session="fetch") featurizer.apply(split=0, train=True, parallelism=PARALLEL) assert session.query(Feature).count() == 6669