-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathLocation.py
70 lines (63 loc) · 2.1 KB
/
Location.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from llmebench.datasets.dataset_base import DatasetBase
from llmebench.tasks import TaskType
class LocationDataset(DatasetBase):
    """Dataset loader for the UL2C user-location-to-country task.

    Each sample pairs a free-text user location with a country label taken
    from the ``class_labels`` list declared in :meth:`metadata`.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to ``DatasetBase``."""
        super().__init__(**kwargs)

    @staticmethod
    def metadata():
        """Return the static description of the dataset.

        Returns:
            A dict with the dataset language, BibTeX citation, download
            link, split file names, task type and the list of class labels.
        """
        return {
            "language": "ar",
            "citation": """@inproceedings{mubarak2021ul2c,
title={{UL2C}: Mapping user locations to countries on Arabic Twitter},
author={Mubarak, Hamdy and Hassan, Sabit},
booktitle={Proceedings of the Sixth Arabic Natural Language Processing Workshop},
pages={145--153},
year={2021}
}""",
            "link": "https://alt.qcri.org/resources/UL2C-UserLocationsToCountries.tsv",
            "splits": {
                "test": "arab+others.txt",
                "train": "arab+others_dev.txt",
            },
            "task_type": TaskType.Classification,
            "class_labels": [
                "ae",
                "OTHERS",
                "bh",
                "dz",
                "eg",
                "iq",
                "jo",
                "kw",
                "lb",
                "ly",
                "ma",
                "om",
                "ps",
                "qa",
                "sa",
                "sd",
                "so",
                "sy",
                "tn",
                "UNK",
                "ye",
                "mr",
            ],
        }

    @staticmethod
    def get_data_sample():
        """Return one example sample with ``"input"`` and ``"label"`` keys."""
        # NOTE(review): the label here is upper-case "QA" while class_labels
        # lists lower-case "qa" -- confirm which casing the data files use.
        return {"input": "Doha, Qatar", "label": "QA"}

    def load_data(self, data_path, no_labels=False):
        """Read a ``location<TAB>country_code`` file into a list of samples.

        Blank (whitespace-only) lines are skipped; every other line must
        contain exactly two tab-separated fields.

        Args:
            data_path: Path of the data file; it is passed through
                ``self.resolve_path`` before being opened.
            no_labels: Accepted for interface compatibility; this loader
                does not use it.

        Returns:
            A list of dicts with keys ``"input"`` (stripped location text),
            ``"label"`` (stripped country code) and ``"line_number"``
            (zero-based index of the line within the file).

        Raises:
            ValueError: If a non-blank line does not have exactly two
                tab-separated fields.
        """
        data_path = self.resolve_path(data_path)
        # Format: location \t country_code
        data = []
        # The dataset is declared as Arabic, so decode explicitly as UTF-8
        # instead of relying on the platform's locale-dependent default.
        with open(data_path, "r", encoding="utf-8") as fp:
            for line_idx, line in enumerate(fp):
                if not line.strip():
                    # Tolerate empty lines (e.g. a trailing newline at EOF).
                    continue
                fields = line.split("\t")
                if len(fields) != 2:
                    raise ValueError(
                        f"{data_path}:{line_idx + 1}: expected 2 tab-separated "
                        f"fields, got {len(fields)}"
                    )
                location, country = (field.strip() for field in fields)
                data.append(
                    {"input": location, "label": country, "line_number": line_idx}
                )
        return data