forked from qcri/LLMeBench
-
Notifications
You must be signed in to change notification settings - Fork 0
/
OSACT4SubtaskA.py
65 lines (58 loc) · 3.12 KB
/
OSACT4SubtaskA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from llmebench.datasets.dataset_base import DatasetBase
from llmebench.tasks import TaskType
class OSACT4SubtaskADataset(DatasetBase):
    """Dataset loader for OSACT4 Subtask A (offensive-language labels, Arabic).

    Each sample pairs a piece of text with one of the labels listed under
    ``class_labels`` in :meth:`metadata`.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to the base class."""
        super().__init__(**kwargs)

    @staticmethod
    def metadata():
        """Return the static description of this dataset.

        Returns:
            A dict with the keys ``language``, ``citation``, ``link``,
            ``license``, ``splits``, ``task_type`` and ``class_labels``.
        """
        # NOTE(review): the citation below is the OffensEval 2020 (SemEval-2020
        # Task 12) paper while "link" points at the OSACT4 workshop page --
        # confirm this pairing is intended.
        # The citation literal is a runtime string; its lines are deliberately
        # kept flush-left so the value is not altered by source indentation.
        return {
            "language": "ar",
            "citation": """
@inproceedings{zampieri-etal-2020-semeval,
title = "{S}em{E}val-2020 Task 12: Multilingual Offensive Language Identification in Social Media ({O}ffens{E}val 2020)",
author = {Zampieri, Marcos and
Nakov, Preslav and
Rosenthal, Sara and
Atanasova, Pepa and
Karadzhov, Georgi and
Mubarak, Hamdy and
Derczynski, Leon and
Pitenis, Zeses and
{\\c{C}}{\\"o}ltekin, {\\c{C}}a{\\u{g}}r{\\i}},
booktitle = "Proceedings of the Fourteenth Workshop on Semantic Evaluation",
month = dec,
year = "2020",
address = "Barcelona (online)",
publisher = "International Committee for Computational Linguistics",
url = "https://aclanthology.org/2020.semeval-1.188",
doi = "10.18653/v1/2020.semeval-1.188",
pages = "1425--1447",
abstract = "We present the results and the main findings of SemEval-2020 Task 12 on Multilingual Offensive Language Identification in Social Media (OffensEval-2020). The task included three subtasks corresponding to the hierarchical taxonomy of the OLID schema from OffensEval-2019, and it was offered in five languages: Arabic, Danish, English, Greek, and Turkish. OffensEval-2020 was one of the most popular tasks at SemEval-2020, attracting a large number of participants across all subtasks and languages: a total of 528 teams signed up to participate in the task, 145 teams submitted official runs on the test data, and 70 teams submitted system description papers.",
}
""",
            "link": "https://edinburghnlp.inf.ed.ac.uk/workshops/OSACT4/",
            "license": "CC BY 4.0",
            "splits": {
                "test": "OSACT2020-sharedTask-test-tweets-labels.txt",
                "train": "OSACT2020-sharedTask-train_OFF.txt",
            },
            "task_type": TaskType.Classification,
            "class_labels": ["OFF", "NOT_OFF"],
        }

    @staticmethod
    def get_data_sample():
        """Return one example record with ``input`` and ``label`` keys."""
        return {"input": "@USER يلا يا خوخة يا مهزئة ع دراستك", "label": "OFF"}

    def load_data(self, data_path, no_labels=False):
        """Read a tab-separated file of ``text<TAB>label`` rows.

        Args:
            data_path: Location of the data file; it is passed through
                ``self.resolve_path`` before being opened.
            no_labels: Accepted for interface compatibility; this loader does
                not read it.

        Returns:
            A list of dicts, one per usable row, with the keys ``input``
            (first column), ``label`` (second column, whitespace-stripped)
            and ``line_number`` (0-based index of the row in the file,
            counting skipped rows too).
        """
        data_path = self.resolve_path(data_path)

        # Format: text \t offensive_label
        data = []
        # The corpus is Arabic text: decode explicitly as UTF-8 instead of
        # relying on the platform's locale encoding, which is not UTF-8
        # everywhere and would otherwise fail or mis-decode the tweets.
        with open(data_path, "r", encoding="utf-8") as fp:
            for line_idx, line in enumerate(fp):
                fields = line.strip().split("\t")
                # Rows without a second column (blank or malformed lines)
                # carry no label and are skipped.
                if len(fields) < 2:
                    continue
                data.append(
                    {
                        "input": fields[0],
                        "label": fields[1].strip(),
                        "line_number": line_idx,
                    }
                )

        return data