-
Notifications
You must be signed in to change notification settings - Fork 0
/
access_dbpedia.py
147 lines (120 loc) · 4.05 KB
/
access_dbpedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
Access the DBPedia SQLite database after it has been created by load_dbpedia.py.
"""
import pandas as pd
import sqlite3
import json
from typing import Any, List, Iterable
from erllm import DBFILE_PATH
from erllm.dataset.entity import Entity
def get_matches(N: int) -> List[Any]:
    """
    Retrieve a list of matching pairs with all entity attributes.

    Joins 'dbpedia0' and 'dbpedia1' through 'dbpedia_matches' and returns at
    most N of the joined rows.

    Args:
        N (int): The maximum number of matching pairs to retrieve.

    Returns:
        List[Any]: A list of (uri0, kv0, uri1, kv1) tuples, one per matching
            pair. The kv columns are returned exactly as stored.
    """
    # The limit is bound as a parameter instead of being formatted into the
    # SQL text, so the statement itself is a constant string.
    match_query = """
        SELECT e0.uri, e0.kv, e1.uri, e1.kv
        FROM dbpedia0 AS e0, dbpedia1 AS e1, dbpedia_matches AS matches
        WHERE e0.id = matches.id0
        AND e1.id = matches.id1
        LIMIT ?;
    """
    conn = sqlite3.connect(DBFILE_PATH)
    try:
        # int() keeps int-like limits working now that the value is bound
        # rather than rendered with str().
        return conn.execute(match_query, (int(N),)).fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()
def get_entity_by_id(id: int, table: str) -> Entity:
    """
    Retrieve an entity by its ID from the specified table.

    Args:
        id (int): The ID of the entity to retrieve.
        table (str): The name of the table containing the entity. The name is
            formatted directly into the SQL statement (identifiers cannot be
            bound as parameters), so only trusted table names should be passed.

    Returns:
        Entity: An Entity built from the row's id, uri and JSON-decoded kv
            column.

    Raises:
        ValueError: If the entity with the specified ID is not found in the table.
    """
    conn = sqlite3.connect(DBFILE_PATH)
    try:
        # Only the id is bound as a parameter; the table name is interpolated.
        entry = conn.execute(
            f"SELECT id, uri, kv FROM {table} WHERE id = ?", (id,)
        ).fetchone()
    finally:
        # Release the connection even when the query raises.
        conn.close()

    if entry is None:
        raise ValueError(f"Entity with id {id} not found in table {table}")

    # The kv column holds a JSON document; decode it before building the Entity.
    entity_id, uri, kv_json = entry
    return Entity(entity_id, uri, json.loads(kv_json))
def get_random_matches(n: int) -> List[Any]:
    """
    Retrieve a list of random matching pair ids from the 'dbpedia_matches' table.

    Args:
        n (int): The number of random matching pairs to retrieve.

    Returns:
        List[Any]: A list of row tuples from 'dbpedia_matches' in random
            order, containing at most n rows.
    """
    conn = sqlite3.connect(DBFILE_PATH)
    try:
        # The limit is bound as a parameter instead of being formatted into
        # the SQL text; int() keeps int-like values working.
        return conn.execute(
            "SELECT * FROM dbpedia_matches ORDER BY RANDOM() LIMIT ?", (int(n),)
        ).fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()
def is_match(id0: int, id1: int) -> bool:
    """
    Check if the entities are a match by checking the 'dbpedia_matches' table.

    Args:
        id0 (int): The ID of the first entity (compared against column id0).
        id1 (int): The ID of the second entity (compared against column id1).

    Returns:
        bool: True if a matching pair exists, False otherwise.
    """
    conn = sqlite3.connect(DBFILE_PATH)
    try:
        # COUNT(*) always yields exactly one row, so fetchone() is never None.
        (match_count,) = conn.execute(
            "SELECT COUNT(*) FROM dbpedia_matches WHERE id0 = ? AND id1 = ?",
            (id0, id1),
        ).fetchone()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    return match_count > 0
def get_number_of_entries(table: str) -> int:
    """
    Get the total number of entries in the specified table.

    Args:
        table (str): The name of the table. The name is formatted directly
            into the SQL statement (identifiers cannot be bound as
            parameters), so only trusted table names should be passed.

    Returns:
        int: The number of entries in the table.
    """
    conn = sqlite3.connect(DBFILE_PATH)
    try:
        # COUNT(*) always yields exactly one row, so fetchone() is never None.
        (count,) = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
    finally:
        # Release the connection even when the query raises
        # (e.g. for an unknown table).
        conn.close()
    return count
def entities_from_dbpedia_df(
    df: pd.DataFrame,
) -> Iterable[tuple[int, Entity, Entity]]:
    """
    Build labelled entity pairs from a DataFrame of DBpedia id pairs.

    Args:
        df (pd.DataFrame): DataFrame with the columns 'table1.id',
            'table2.id' and 'label'. Each row names one entity id per table
            plus the label of that pair.

    Returns:
        Iterable[tuple[int, Entity, Entity]]: A list of
            (label, entity0, entity1) tuples in row order, where entity0 is
            looked up in 'dbpedia0' by 'table1.id' and entity1 in 'dbpedia1'
            by 'table2.id'.

    Raises:
        KeyError: If a required column is missing from a non-empty DataFrame.
        ValueError: If an id is not found in its table (raised by
            get_entity_by_id).
    """
    # A frame without rows yields no pairs, whatever columns it has.
    if len(df.index) == 0:
        return []

    # Walk the three columns in lockstep instead of using iterrows(): this
    # avoids building a Series per row, and iterating a column yields plain
    # Python scalars for NumPy-backed data, which sqlite3 can bind directly.
    id_label_rows = zip(df["table1.id"], df["table2.id"], df["label"])
    return [
        (
            int(label),
            get_entity_by_id(id0, "dbpedia0"),
            get_entity_by_id(id1, "dbpedia1"),
        )
        for id0, id1, label in id_label_rows
    ]