Skip to content

Commit

Permalink
add optional entity_type parameter; resolves #3
Browse files Browse the repository at this point in the history
  • Loading branch information
shigapov committed Oct 29, 2024
1 parent 7209d6d commit e00e187
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 22 deletions.
51 changes: 51 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,57 @@ To get labels, descriptions, and aliases in a specific language, use `language`-
enrich('Breslau', language="de")
```
To get only entities of a certain entity type, use the `entity_type` parameter. Possible values are "org", "per", "geo" and None. The default is None, which enriches with entities of any type.
```
enrich('Mannheim', entity_type="geo")
```

You can combine those parameters:
```
pprint(enrich('Cöln', limit=5, language="de", entity_type='geo'))
[{'error': 'Entity does not match the specified entity type',
'id': 'Q105550033'},
{'error': 'Entity does not match the specified entity type',
'id': 'Q37262196'},
{'Freebase ID': '/m/01v8c',
'GeoNames ID': '2886242',
'German district key': '05315',
'German municipality key': '05315000',
'German regional key': '053150000000',
'OSM Relation ID': '62578',
'aliases': ['Kölle',
'Köln, Deutschland',
'Köln (Deutschland)',
'Colonia',
'Colonia Claudia Ara Agrippinensium',
'CCAA',
'Cöln',
'Cöln am Rhein'],
'description': 'Millionenmetropole am Rhein und bevölkerungsreichste Stadt '
'in Nordrhein-Westfalen',
'geographic coordinates': {'altitude': None,
'globe': 'http://www.wikidata.org/entity/Q2',
'latitude': 50.942222222222,
'longitude': 6.9577777777778,
'precision': 0.00027777777777778},
'id': 'Q365',
'is_within_current_germany': True,
'is_within_historical_GDR_1945_1949': False,
'is_within_historical_GDR_1949_1990': False,
'is_within_historical_GFR_1945_1949': True,
'is_within_historical_GFR_1949_1990': True,
'is_within_historical_GFR_1990_2019': True,
'is_within_historical_germany_1886_1919': True,
'is_within_historical_germany_1919_1920': True,
'is_within_historical_germany_1920_1938': True,
'is_within_historical_germany_1938_1945': True,
'label': 'Köln',
'url': 'https://www.wikidata.org/wiki/Special:EntityData/Q365'},
{'error': 'Entity does not match the specified entity type', 'id': 'Q35872'},
{'error': 'Entity does not match the specified entity type',
'id': 'Q18019200'}]
```

## Archived code

Shigapov, R. (2023). KG-enricher: An open-source Python library for enriching strings, entities and knowledge graphs using Wikibase knowledge graphs (0.1.0). Zenodo. https://doi.org/10.5281/zenodo.10405073
53 changes: 31 additions & 22 deletions enricher/enricher.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@ def query_wikibase(entity, limit=1, api_url=wikibase_api_url):


def get_label_description_aliases(data, language='en'):
"""
Extracts label, description, and aliases from a given Wikibase entity data in the specified language.
"""
label = data['labels'][language]['value'] if language in data['labels'] else None
description = data['descriptions'][language]['value'] if language in data['descriptions'] else None
aliases = [alias['value'] for alias in data['aliases'][language]] if language in data['aliases'] else []
Expand All @@ -62,23 +59,28 @@ def get_coordinates(entity_id, special_entitydata_url=wikibase_special_entitydat
return None


def get_label_for_qid(qid, language='en', special_entitydata_url=wikibase_special_entitydata):
def determine_properties(entity_data, entity_type=None):
"""
Retrieves the label for a given QID from a specified Wikibase instance in the specified language.
Determines the properties to fetch based on the entity type.
If entity_type is None, it does not filter by type and returns the appropriate properties.
"""
url = f"{special_entitydata_url}{qid}.json"
response = requests.get(url).json()
data = response['entities'][qid]
return data['labels'][language]['value'] if language in data['labels'] else None


def determine_properties(entity_data):
if 'P31' in entity_data['claims'] and any([x['mainsnak']['datavalue']['value']['id'] == 'Q5' for x in entity_data['claims']['P31']]):
return person_properties # Person
elif coordinate_property in entity_data['claims']:
return geo_properties # Geographic entity
else:
return org_properties # Organization
if entity_type == "per":
if 'P31' in entity_data['claims'] and any(x['mainsnak']['datavalue']['value']['id'] == 'Q5' for x in entity_data['claims']['P31']):
return person_properties
elif entity_type == "geo":
if coordinate_property in entity_data['claims']:
return geo_properties
elif entity_type == "org":
return org_properties
elif entity_type is None:
# Determine properties without filtering by type
if 'P31' in entity_data['claims'] and any(x['mainsnak']['datavalue']['value']['id'] == 'Q5' for x in entity_data['claims']['P31']):
return person_properties
elif coordinate_property in entity_data['claims']:
return geo_properties
else:
return org_properties
return None


def extract_information(entity_id, properties_to_fetch, language='en'):
Expand Down Expand Up @@ -128,16 +130,23 @@ def fetch_entity_data(entity_id, special_entitydata_url=wikibase_special_entityd
return None


def enrich(entity_string, limit=1, language='en'):
def enrich(entity_string, limit=1, language='en', entity_type=None):
"""
Enriches entities optionally based on the specified entity_type: "per" for person, "org" for organization, "geo" for geographic entity.
If entity_type is None, it enriches with an entity of any entity type.
"""
entity_ids = query_wikibase(entity_string, limit=limit)
if entity_ids:
results = []
for entity_id in entity_ids:
entity_data = fetch_entity_data(entity_id)
if entity_data:
properties_to_fetch = determine_properties(entity_data)
enriched_data = extract_information(entity_id, properties_to_fetch, language)
results.append(enriched_data)
properties_to_fetch = determine_properties(entity_data, entity_type)
if properties_to_fetch:
enriched_data = extract_information(entity_id, properties_to_fetch, language)
results.append(enriched_data)
else:
results.append({'error': 'Entity does not match the specified entity type', 'id': entity_id})
else:
results.append({'error': 'Entity data not found', 'id': entity_id})
return results
Expand Down

0 comments on commit e00e187

Please sign in to comment.