forked from thainq107/llms_qlora
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
36 lines (30 loc) · 1.01 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os
import json
from datasets import load_dataset
# Sentiment name for each class id, keyed by the id's string form.
id2label = {"0": "negative", "1": "positive"}
def load_data_from_datahub(
    dataset_name,
    save_data_dir,
    prompt=""
):
    """Load a dataset and dump every split to ``<save_data_dir>/<split>.jsonl``.

    For each key of the object returned by ``load_dataset(dataset_name)``,
    the ``"text"`` column is turned into prompts of the form
    ``prompt + text + ". Answer:"`` and the ``"feeling"`` column is mapped
    through ``id2label`` (after ``str()`` conversion). One JSON object per
    example, with keys ``"sentence"`` and ``"label"``, is written per line.

    Args:
        dataset_name: Identifier passed unchanged to ``load_dataset``.
        save_data_dir: Directory that receives the ``.jsonl`` files. It is
            created if it does not exist; an empty string means the current
            working directory.
        prompt: Text prepended to every sentence.

    Raises:
        KeyError: If a split lacks the ``"text"``/``"feeling"`` columns or a
            label value has no entry in ``id2label``.
    """
    raw_dataset = load_dataset(dataset_name)

    # Ensure the target directory exists so open() below cannot fail with
    # FileNotFoundError. Skip for "" because os.makedirs("") raises, while
    # os.path.join("", name) legitimately targets the working directory.
    if save_data_dir:
        os.makedirs(save_data_dir, exist_ok=True)

    for data_type in raw_dataset:
        examples = raw_dataset[data_type]
        sentences = [prompt + text + ". Answer:" for text in examples["text"]]
        # Labels are resolved before the file is opened, so an unknown label
        # raises without leaving a truncated output file behind.
        labels = [id2label[str(label)] for label in examples["feeling"]]

        save_data_file = os.path.join(save_data_dir, f"{data_type}.jsonl")
        print(f"Write into ... {save_data_file}")
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be opened as UTF-8 explicitly rather than with the locale default.
        with open(save_data_file, "w", encoding="utf-8") as f:
            for sentence, label in zip(sentences, labels):
                data = {"sentence": sentence, "label": label}
                print(json.dumps(data, ensure_ascii=False), file=f)