Skip to content

Commit

Permalink
Optimization fastrun
Browse files Browse the repository at this point in the history
  • Loading branch information
LeMyst committed Jan 2, 2024
1 parent 187d9a6 commit d91a485
Showing 1 changed file with 57 additions and 28 deletions.
85 changes: 57 additions & 28 deletions wikibaseintegrator/wbi_fastrun.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,13 @@ def _load_qualifiers(self, sid: str, limit: int | None = None) -> Qualifiers:
"""
offset = 0

if not isinstance(sid, str):
raise ValueError('sid must be a string')

limit = limit or int(config['SPARQL_QUERY_LIMIT']) # type: ignore

# TODO: Add cache

# We force a refresh of the data, remove the previous results
qualifiers: Qualifiers = Qualifiers()
while True:
Expand Down Expand Up @@ -261,6 +266,10 @@ def _load_references(self, sid: str, limit: int = 10000) -> References:
if not isinstance(sid, str):
raise ValueError('sid must be a string')

limit = limit or int(config['SPARQL_QUERY_LIMIT']) # type: ignore

# TODO: Add cache

# We force a refresh of the data, remove the previous results
references: References = References()
while True:
Expand Down Expand Up @@ -326,6 +335,10 @@ def _load_rank(self, sid: str) -> WikibaseRank | None:
if not isinstance(sid, str):
raise ValueError('sid must be a string')

# TODO: Add limit?

# TODO: Add cache

query = f'''
#Tool: WikibaseIntegrator wbi_fastrun._load_rank
SELECT ?rank WHERE {{
Expand Down Expand Up @@ -501,44 +514,60 @@ def contains(in_list, lambda_filter):

# If the property is already found, load it completely to compare deeply
for claim in claims:
# Check if the property is in the filter
if claim.mainsnak.property_number in property_filter:
sparql_value = claim.get_sparql_value()
# If the value exists in the cache
if sparql_value and claim.mainsnak.property_number in self.data and sparql_value in self.data[claim.mainsnak.property_number]:
for statement in self.data[claim.mainsnak.property_number][sparql_value]:
if entity_filter and statement['entity'].rsplit('/', 1)[-1] not in entity_filter:
continue
if statement['entity'] in common_entities:
if use_qualifiers:
qualifiers = self._load_qualifiers(statement['sid'], limit=100)

if len(qualifiers) != len(claim.qualifiers):
logging.debug("Difference in number of qualifiers, '%i' != '%i'", len(qualifiers), len(claim.qualifiers))
return True

for qualifier in qualifiers:
if qualifier not in claim.qualifiers:
logging.debug("Difference between two qualifiers")
entity_cache = [statement['entity'].rsplit('/', 1)[-1] for statement in self.data[claim.mainsnak.property_number][sparql_value]]
if entity_filter:
common_cache_filter = [value for value in entity_cache if value in entity_filter]
else:
common_cache_filter = entity_cache
# If there are common entities between the cache and the entity_filter
if common_cache_filter:
for statement in self.data[claim.mainsnak.property_number][sparql_value]:
if entity_filter and statement['entity'].rsplit('/', 1)[-1] not in entity_filter:
continue

if statement['entity'] in common_entities:
if use_qualifiers:
qualifiers = self._load_qualifiers(statement['sid'], limit=100)

if len(qualifiers) != len(claim.qualifiers):
logging.debug("Difference in number of qualifiers, '%i' != '%i'", len(qualifiers), len(claim.qualifiers))
return True

if use_references:
references = self._load_references(statement['sid'], limit=100)
for qualifier in qualifiers:
if qualifier not in claim.qualifiers:
logging.debug("Difference between two qualifiers")
return True

if sum(len(ref) for ref in references) != sum(len(x) for x in claim.references):
logging.debug("Difference in number of references, '%i' != '%i'", sum(len(ref) for ref in references), sum(len(x) for x in claim.references))
return True
if use_references:
references = self._load_references(statement['sid'], limit=100)

for reference in references:
if reference not in claim.references:
logging.debug("Difference between two references")
if sum(len(ref) for ref in references) != sum(len(x) for x in claim.references):
logging.debug("Difference in number of references, '%i' != '%i'", sum(len(ref) for ref in references), sum(len(x) for x in claim.references))
return True

if use_rank:
rank = self._load_rank(statement['sid'])
for reference in references:
if reference not in claim.references:
logging.debug("Difference between two references")
return True

if claim.rank != rank:
logging.debug("Difference with the rank")
return True
# TODO: Add use_rank to compare rank ?
if use_rank:
rank = self._load_rank(statement['sid'])

if claim.rank != rank:
logging.debug("Difference with the rank")
return True
else:
logging.debug("No common entities between cache and entity_filter")
return True
# Enable this if the value doesn't exist ?
else:
logging.debug("Value doesn't already exist in an entity")
return True

return False

Expand Down

0 comments on commit d91a485

Please sign in to comment.