diff --git a/README.md b/README.md index bddb049..892acc9 100644 --- a/README.md +++ b/README.md @@ -221,6 +221,57 @@ To get labels, descriptions, and aliases in a specific language, use `language`- enrich('Breslau', language="de") ``` +To get only entities of a certain entity type, use `entity_type`-parameter. Possible values are "org", "per", "geo" and None. The default is None, which enriches with entities of any type. +``` +enrich('Mannheim', entity_type="geo") +``` + +You can combine those parameters: +``` +pprint(enrich('Cöln', limit=5, language="de", entity_type='geo')) +[{'error': 'Entity does not match the specified entity type', + 'id': 'Q105550033'}, + {'error': 'Entity does not match the specified entity type', + 'id': 'Q37262196'}, + {'Freebase ID': '/m/01v8c', + 'GeoNames ID': '2886242', + 'German district key': '05315', + 'German municipality key': '05315000', + 'German regional key': '053150000000', + 'OSM Relation ID': '62578', + 'aliases': ['Kölle', + 'Köln, Deutschland', + 'Köln (Deutschland)', + 'Colonia', + 'Colonia Claudia Ara Agrippinensium', + 'CCAA', + 'Cöln', + 'Cöln am Rhein'], + 'description': 'Millionenmetropole am Rhein und bevölkerungsreichste Stadt ' + 'in Nordrhein-Westfalen', + 'geographic coordinates': {'altitude': None, + 'globe': 'http://www.wikidata.org/entity/Q2', + 'latitude': 50.942222222222, + 'longitude': 6.9577777777778, + 'precision': 0.00027777777777778}, + 'id': 'Q365', + 'is_within_current_germany': True, + 'is_within_historical_GDR_1945_1949': False, + 'is_within_historical_GDR_1949_1990': False, + 'is_within_historical_GFR_1945_1949': True, + 'is_within_historical_GFR_1949_1990': True, + 'is_within_historical_GFR_1990_2019': True, + 'is_within_historical_germany_1886_1919': True, + 'is_within_historical_germany_1919_1920': True, + 'is_within_historical_germany_1920_1938': True, + 'is_within_historical_germany_1938_1945': True, + 'label': 'Köln', + 'url': 'https://www.wikidata.org/wiki/Special:EntityData/Q365'}, + 
{'error': 'Entity does not match the specified entity type', 'id': 'Q35872'}, + {'error': 'Entity does not match the specified entity type', + 'id': 'Q18019200'}] +``` + ## Archived code Shigapov, R. (2023). KG-enricher: An open-source Python library for enriching strings, entities and knowledge graphs using Wikibase knowledge graphs (0.1.0). Zenodo. https://doi.org/10.5281/zenodo.10405073 \ No newline at end of file diff --git a/enricher/enricher.py b/enricher/enricher.py index 820fed0..eff8c36 100644 --- a/enricher/enricher.py +++ b/enricher/enricher.py @@ -38,9 +38,6 @@ def query_wikibase(entity, limit=1, api_url=wikibase_api_url): def get_label_description_aliases(data, language='en'): - """ - Extracts label, description, and aliases from a given Wikibase entity data in the specified language. - """ label = data['labels'][language]['value'] if language in data['labels'] else None description = data['descriptions'][language]['value'] if language in data['descriptions'] else None aliases = [alias['value'] for alias in data['aliases'][language]] if language in data['aliases'] else [] @@ -62,23 +59,28 @@ def get_coordinates(entity_id, special_entitydata_url=wikibase_special_entitydat return None -def get_label_for_qid(qid, language='en', special_entitydata_url=wikibase_special_entitydata): +def determine_properties(entity_data, entity_type=None): """ - Retrieves the label for a given QID from a specified Wikibase instance in the specified language. + Determines the properties to fetch based on the entity type. + If entity_type is None, it does not filter by type and returns the appropriate properties. 
""" - url = f"{special_entitydata_url}{qid}.json" - response = requests.get(url).json() - data = response['entities'][qid] - return data['labels'][language]['value'] if language in data['labels'] else None - - -def determine_properties(entity_data): - if 'P31' in entity_data['claims'] and any([x['mainsnak']['datavalue']['value']['id'] == 'Q5' for x in entity_data['claims']['P31']]): - return person_properties # Person - elif coordinate_property in entity_data['claims']: - return geo_properties # Geographic entity - else: - return org_properties # Organization + if entity_type == "per": + if 'P31' in entity_data['claims'] and any(x['mainsnak']['datavalue']['value']['id'] == 'Q5' for x in entity_data['claims']['P31']): + return person_properties + elif entity_type == "geo": + if coordinate_property in entity_data['claims']: + return geo_properties + elif entity_type == "org": + return org_properties + elif entity_type is None: + # Determine properties without filtering by type + if 'P31' in entity_data['claims'] and any(x['mainsnak']['datavalue']['value']['id'] == 'Q5' for x in entity_data['claims']['P31']): + return person_properties + elif coordinate_property in entity_data['claims']: + return geo_properties + else: + return org_properties + return None def extract_information(entity_id, properties_to_fetch, language='en'): @@ -128,16 +130,23 @@ def fetch_entity_data(entity_id, special_entitydata_url=wikibase_special_entityd return None -def enrich(entity_string, limit=1, language='en'): +def enrich(entity_string, limit=1, language='en', entity_type=None): + """ + Enriches entities optionally based on the specified entity_type: "per" for person, "org" for organization, "geo" for geographic entity. + If entity_type is None, it enriches with an entity of any entity type. 
+ """ entity_ids = query_wikibase(entity_string, limit=limit) if entity_ids: results = [] for entity_id in entity_ids: entity_data = fetch_entity_data(entity_id) if entity_data: - properties_to_fetch = determine_properties(entity_data) - enriched_data = extract_information(entity_id, properties_to_fetch, language) - results.append(enriched_data) + properties_to_fetch = determine_properties(entity_data, entity_type) + if properties_to_fetch: + enriched_data = extract_information(entity_id, properties_to_fetch, language) + results.append(enriched_data) + else: + results.append({'error': 'Entity does not match the specified entity type', 'id': entity_id}) else: results.append({'error': 'Entity data not found', 'id': entity_id}) return results