-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathWikiNewsLemmatization.py
56 lines (48 loc) · 2.06 KB
/
WikiNewsLemmatization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from llmebench.datasets.dataset_base import DatasetBase
from llmebench.tasks import TaskType
class WikiNewsLemmatizationDataset(DatasetBase):
    """WikiNews Arabic lemmatization dataset (Mubarak, LREC 2018).

    Each sample maps a surface word ("input") to a ``(word, lemma)`` tuple
    ("label"); the original word is kept inside the label so the evaluator
    can fall back to it when a model prediction fails.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @staticmethod
    def metadata():
        """Return dataset metadata: language, citation, download link, splits."""
        return {
            "language": "ar",
            "citation": """@inproceedings{mubarak-2018-build,
            title = "Build Fast and Accurate Lemmatization for {A}rabic",
            author = "Mubarak, Hamdy",
            booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
            month = may,
            year = "2018",
            address = "Miyazaki, Japan",
            publisher = "European Language Resources Association (ELRA)",
            url = "https://aclanthology.org/L18-1181",
        }""",
            "link": "http://alt.qcri.org/~hmubarak/WikiNews-26-06-2015-RefLemma.xlsx",
            "license": "Research Purpose Only",
            "splits": {"test": "WikiNews-26-06-2015-RefLemma.txt"},
            "task_type": TaskType.Other,
        }

    @staticmethod
    def get_data_sample():
        """Return one representative sample in the loaded-data schema."""
        return {
            "input": "جوائز",
            "label": ("جوائز", "جائزة"),
        }

    def load_data(self, data_path, no_labels=False):
        """Load the corpus from ``data_path``.

        File format: one sentence per line, ``words<TAB>lemmas``, where both
        fields are whitespace-separated and aligned token-by-token.

        Returns:
            list of dicts with keys ``input`` (surface word), ``label``
            (``(word, lemma)`` tuple), and ``line_number`` (0-based index of
            the source line).
        """
        data_path = self.resolve_path(data_path)
        data = []
        # Explicit utf-8: the corpus is Arabic text, so relying on the
        # platform default encoding (e.g. cp1252 on Windows) would corrupt
        # or fail the read.
        with open(data_path, "r", encoding="utf-8") as fp:
            for line_idx, line in enumerate(fp):
                # Skip blank lines instead of crashing on the tuple unpack.
                if not line.strip():
                    continue
                words, lemmas = line.split("\t")
                words = words.split()
                lemmas = lemmas.split()
                for i, word in enumerate(words):
                    # lemmas[i] raises IndexError on a misaligned line —
                    # deliberately loud so corpus problems are noticed.
                    data.append(
                        {
                            # Keep the original unlemmatized word in the
                            # label as well, so evaluate() can handle
                            # failed predictions.
                            "input": word,
                            "label": (word, lemmas[i]),
                            "line_number": line_idx,
                        }
                    )
        return data