forked from qcri/LLMeBench
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ArSarcasm.py
59 lines (51 loc) · 2.15 KB
/
ArSarcasm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import csv
from llmebench.datasets.dataset_base import DatasetBase
from llmebench.tasks import TaskType
class ArSarcasmDataset(DatasetBase):
    """Dataset loader for the ArSarcasm sarcasm-detection CSV files.

    Every loaded sample pairs the text of a tweet with an upper-cased
    sarcasm label (one of the values listed under ``class_labels`` in
    :meth:`metadata`).
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to ``DatasetBase``."""
        super().__init__(**kwargs)

    @staticmethod
    def metadata():
        """Return the static description of the dataset.

        Returns:
            A dict with the dataset language, BibTeX citation, project
            link, license, split file names, task type and class labels.
        """
        # NOTE: the continuation lines of the citation literal are kept
        # flush-left on purpose so the string content is not altered by
        # code indentation.
        return {
            "language": "ar",
            "citation": """@inproceedings{abu-farha-magdy-2020-arabic,
title = "From {A}rabic Sentiment Analysis to Sarcasm Detection: The {A}r{S}arcasm Dataset",
author = "Abu Farha, Ibrahim and Magdy, Walid",
booktitle = "Proceedings of the 4th Workshop on Open-Source Arabic Corpora and Processing Tools, with a Shared Task on Offensive Language Detection",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resource Association",
url = "https://www.aclweb.org/anthology/2020.osact-1.5",
pages = "32--39",
language = "English",
ISBN = "979-10-95546-51-1",
}""",
            "link": "https://github.com/iabufarha/ArSarcasm",
            "license": "MIT License",
            "splits": {
                "test": "ArSarcasm_test.csv",
                "train": "ArSarcasm_train.csv",
            },
            "task_type": TaskType.Classification,
            "class_labels": ["TRUE", "FALSE"],
        }

    @staticmethod
    def get_data_sample():
        """Return a placeholder sample showing the ``input``/``label`` keys."""
        return {"input": "A tweet", "label": "TRUE"}

    def load_data(self, data_path):
        """Read a CSV split and return its rows as a list of samples.

        Args:
            data_path: Location of the CSV file; it is passed through
                ``self.resolve_path`` before being opened.

        Returns:
            A list of dicts, one per CSV data row, with the keys
            ``input`` (the ``tweet`` column), ``label`` (the ``sarcasm``
            column, upper-cased) and ``line_number`` (0-based index of
            the data row; the header row is not counted).

        Raises:
            KeyError: If a row lacks the ``tweet`` or ``sarcasm`` column.
        """
        data_path = self.resolve_path(data_path)

        # newline="" is required by the csv module: without it, "\r" and
        # "\r\n" sequences inside quoted tweet fields are rewritten to "\n"
        # by universal-newline translation before the parser sees them.
        with open(data_path, "r", encoding="utf-8", newline="") as fp:
            return [
                {
                    "input": row["tweet"],
                    # Upper-cased to get it to work on ArSarcasm
                    # (True/False) and ArSarcasm-2 (TRUE/FALSE).
                    "label": row["sarcasm"].upper(),
                    "line_number": line_idx,
                }
                for line_idx, row in enumerate(csv.DictReader(fp))
            ]