diff --git a/wikibaseintegrator/wbi_config.py b/wikibaseintegrator/wbi_config.py
index cbfd458b..30999db0 100644
--- a/wikibaseintegrator/wbi_config.py
+++ b/wikibaseintegrator/wbi_config.py
@@ -28,5 +28,6 @@
     'SPARQL_ENDPOINT_URL': 'https://query.wikidata.org/sparql',
     'WIKIBASE_URL': 'http://www.wikidata.org',
     'DEFAULT_LANGUAGE': 'en',
-    'DEFAULT_LEXEME_LANGUAGE': 'Q1860'
+    'DEFAULT_LEXEME_LANGUAGE': 'Q1860',
+    'SPARQL_QUERY_LIMIT': 10000
 }
diff --git a/wikibaseintegrator/wbi_fastrun.py b/wikibaseintegrator/wbi_fastrun.py
index f21503ef..56d53bac 100644
--- a/wikibaseintegrator/wbi_fastrun.py
+++ b/wikibaseintegrator/wbi_fastrun.py
@@ -56,7 +56,7 @@ def __init__(self, base_filter: List[BaseDataType | List[BaseDataType]], base_da
         if self.case_insensitive:
             raise ValueError("Case insensitive does not work for the moment.")
 
-    def load_statements(self, claims: Union[List[Claim], Claims, Claim], use_cache: Optional[bool] = None, wb_url: Optional[str] = None, limit: int = 10000) -> None:
+    def load_statements(self, claims: Union[List[Claim], Claims, Claim], use_cache: Optional[bool] = None, wb_url: Optional[str] = None, limit: Optional[int] = None) -> None:
         """
         Load the statements related to the given claims into the internal cache of the current object.
 
@@ -75,6 +75,8 @@ def load_statements(self, claims: Union[List[Claim], Claims, Claim], use_cache:
 
         wb_url = wb_url or self.wikibase_url
 
+        limit = limit or int(config['SPARQL_QUERY_LIMIT'])  # type: ignore
+
         for claim in claims:
             prop_nr = claim.mainsnak.property_number
 
@@ -168,7 +170,7 @@ def load_statements(self, claims: Union[List[Claim], Claims, Claim], use_cache:
             if len(results) == 0 or len(results) < limit:
                 break
 
-    def _load_qualifiers(self, sid: str, limit: int = 10000) -> Qualifiers:
+    def _load_qualifiers(self, sid: str, limit: Optional[int] = None) -> Qualifiers:
         """
         Load the qualifiers of a statement.
 
@@ -178,6 +180,8 @@ def _load_qualifiers(self, sid: str, limit: Optional[int] = None) -> Qualifiers:
         """
         offset = 0
 
+        limit = limit or int(config['SPARQL_QUERY_LIMIT'])  # type: ignore
+
         # We force a refresh of the data, remove the previous results
         qualifiers: Qualifiers = Qualifiers()
         while True:
@@ -309,12 +313,13 @@ def _get_property_type(self, prop_nr: Union[str, int]) -> str:
 
         return results
 
-    def get_entities(self, claims: Union[List[Claim], Claims, Claim], use_cache: Optional[bool] = None) -> List[str]:
+    def get_entities(self, claims: Union[List[Claim], Claims, Claim], use_cache: Optional[bool] = None, query_limit: Optional[int] = None) -> List[str]:
         """
         Return a list of entities who correspond to the specified claims.
 
         :param claims: A list of claims to query the SPARQL endpoint.
         :param use_cache: Put data returned by WDQS in cache. Enabled by default.
+        :param query_limit: Limit the amount of results from the SPARQL server
         :return: A list of entity ID.
         """
         if isinstance(claims, Claim):
@@ -322,7 +327,7 @@ def get_entities(self, claims: Union[List[Claim], Claims, Claim], use_cache: Opt
         elif (not isinstance(claims, list) or not all(isinstance(n, Claim) for n in claims)) and not isinstance(claims, Claims):
             raise ValueError("claims must be an instance of Claim or Claims or a list of Claim")
 
-        self.load_statements(claims=claims, use_cache=use_cache)
+        self.load_statements(claims=claims, use_cache=use_cache, limit=query_limit)
 
         result = set()
         for claim in claims:
@@ -334,7 +339,7 @@ def get_entities(self, claims: Union[List[Claim], Claims, Claim], use_cache: Opt
         return list(result)
 
     def write_required(self, entity: BaseEntity, property_filter: Union[List[str], str, None] = None, use_qualifiers: Optional[bool] = None, use_references: Optional[bool] = None,
-                       use_cache: Optional[bool] = None) -> bool:
+                       use_cache: Optional[bool] = None, query_limit: Optional[int] = None) -> bool:
         """
 
         :param entity:
@@ -342,6 +347,7 @@ def write_required(self, entity: BaseEntity, property_filter: Union[List[str], s
         :param use_qualifiers: Use qualifiers during fastrun. Enabled by default.
         :param use_references: Use references during fastrun. Disabled by default.
         :param use_cache: Put data returned by WDQS in cache. Enabled by default.
+        :param query_limit: Limit the amount of results from the SPARQL server
         :return: a boolean True if a write is required. False otherwise.
         """
         from wikibaseintegrator.entities import BaseEntity
@@ -372,7 +378,7 @@ def contains(in_list, lambda_filter):
         statements_to_check: Dict[str, List[str]] = {}
         for claim in entity.claims:
             if claim.mainsnak.property_number in property_filter:
-                self.load_statements(claims=claim, use_cache=use_cache)
+                self.load_statements(claims=claim, use_cache=use_cache, limit=query_limit)
                 if claim.mainsnak.property_number in self.data:
                     if not contains(self.data[claim.mainsnak.property_number], (lambda x, c=claim: x == c.get_sparql_value())):
                         # Found if a property with this value does not exist, return True if none exist