From d91a4855774db7333ada8ac24895c1fab540d49a Mon Sep 17 00:00:00 2001 From: Myst <1592048+LeMyst@users.noreply.github.com> Date: Tue, 2 Jan 2024 22:39:45 +0100 Subject: [PATCH] Optimization fastrun --- wikibaseintegrator/wbi_fastrun.py | 85 +++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/wikibaseintegrator/wbi_fastrun.py b/wikibaseintegrator/wbi_fastrun.py index e32da337..45d8a6f4 100644 --- a/wikibaseintegrator/wbi_fastrun.py +++ b/wikibaseintegrator/wbi_fastrun.py @@ -206,8 +206,13 @@ def _load_qualifiers(self, sid: str, limit: int | None = None) -> Qualifiers: """ offset = 0 + if not isinstance(sid, str): + raise ValueError('sid must be a string') + limit = limit or int(config['SPARQL_QUERY_LIMIT']) # type: ignore + # TODO: Add cache + # We force a refresh of the data, remove the previous results qualifiers: Qualifiers = Qualifiers() while True: @@ -261,6 +266,10 @@ def _load_references(self, sid: str, limit: int = 10000) -> References: if not isinstance(sid, str): raise ValueError('sid must be a string') + limit = limit or int(config['SPARQL_QUERY_LIMIT']) # type: ignore + + # TODO: Add cache + # We force a refresh of the data, remove the previous results references: References = References() while True: @@ -326,6 +335,10 @@ def _load_rank(self, sid: str) -> WikibaseRank | None: if not isinstance(sid, str): raise ValueError('sid must be a string') + # TODO: Add limit? + + # TODO: Add cache + query = f''' #Tool: WikibaseIntegrator wbi_fastrun._load_rank SELECT ?rank WHERE {{ @@ -501,44 +514,60 @@ def contains(in_list, lambda_filter): # If the property is already found, load it completely to compare deeply for claim in claims: + # Check if the property is in the filter if claim.mainsnak.property_number in property_filter: sparql_value = claim.get_sparql_value() + # If the value exist in the cache if sparql_value and claim.mainsnak.property_number in self.data and sparql_value in self.data[claim.mainsnak.property_number]: - for statement in self.data[claim.mainsnak.property_number][sparql_value]: - if entity_filter and statement['entity'].rsplit('/', 1)[-1] not in entity_filter: - continue - if statement['entity'] in common_entities: - if use_qualifiers: - qualifiers = self._load_qualifiers(statement['sid'], limit=100) - - if len(qualifiers) != len(claim.qualifiers): - logging.debug("Difference in number of qualifiers, '%i' != '%i'", len(qualifiers), len(claim.qualifiers)) - return True - - for qualifier in qualifiers: - if qualifier not in claim.qualifiers: - logging.debug("Difference between two qualifiers") + entity_cache = [statement['entity'].rsplit('/', 1)[-1] for statement in self.data[claim.mainsnak.property_number][sparql_value]] + if entity_filter: + common_cache_filter = [value for value in entity_cache if value in entity_filter] + else: + common_cache_filter = entity_cache + # If there is common entities between the cache and the entity_filter + if common_cache_filter: + for statement in self.data[claim.mainsnak.property_number][sparql_value]: + if entity_filter and statement['entity'].rsplit('/', 1)[-1] not in entity_filter: + continue + + if statement['entity'] in common_entities: + if use_qualifiers: + qualifiers = self._load_qualifiers(statement['sid'], limit=100) + + if len(qualifiers) != len(claim.qualifiers): + logging.debug("Difference in number of qualifiers, '%i' != '%i'", len(qualifiers), len(claim.qualifiers)) return True - if use_references: - references = self._load_references(statement['sid'], limit=100) + for qualifier in qualifiers: + if qualifier not in claim.qualifiers: + logging.debug("Difference between two qualifiers") + return True - if sum(len(ref) for ref in references) != sum(len(x) for x in claim.references): - logging.debug("Difference in number of references, '%i' != '%i'", sum(len(ref) for ref in references), sum(len(x) for x in claim.references)) - return True + if use_references: + references = self._load_references(statement['sid'], limit=100) - for reference in references: - if reference not in claim.references: - logging.debug("Difference between two references") + if sum(len(ref) for ref in references) != sum(len(x) for x in claim.references): + logging.debug("Difference in number of references, '%i' != '%i'", sum(len(ref) for ref in references), sum(len(x) for x in claim.references)) return True - if use_rank: - rank = self._load_rank(statement['sid']) + for reference in references: + if reference not in claim.references: + logging.debug("Difference between two references") + return True - if claim.rank != rank: - logging.debug("Difference with the rank") - return True - # TODO: Add use_rank to compare rank ? + if use_rank: + rank = self._load_rank(statement['sid']) + + if claim.rank != rank: + logging.debug("Difference with the rank") + return True + else: + logging.debug("No common entities between cache and entity_filter") + return True + # Enable this if the value doesn't exist ? + else: + logging.debug("Value doesn't already exist in an entity") + return True return False