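"""Zero-shot cross-lingual evaluation with xlm-roberta-large.

Fine-tunes XLMRobertaForMultipleChoice on an English training set, then
evaluates on a target-language test set (Yoruba in the __main__ block below).
Each example pairs a start phrase with two candidate endings.
"""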
import os

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import XLMRobertaForMultipleChoice, XLMRobertaTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

# Free any cached GPU memory left over from a previous run.
# (The AdamW re-export from transformers is unused here and has been removed
# in recent transformers versions; the optimizer below comes from torch.optim.)
torch.cuda.empty_cache()

# Set the computation device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} for computation.")
class MultipleChoiceDataset(Dataset):
    """Wraps a dataframe of (startphrase, ending1, ending2, labels) rows
    as tokenized two-choice examples."""

    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        # Encode the start phrase against each of the two candidate endings.
        choices_inputs = self.tokenizer(
            text=[record['startphrase']] * 2,
            text_pair=[record['ending1'], record['ending2']],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Resulting shape is [2, max_length]: one row per answer choice.
        input_ids = choices_inputs['input_ids'].squeeze(0)
        attention_mask = choices_inputs['attention_mask'].squeeze(0)
        labels = torch.tensor(record['labels'], dtype=torch.long)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }
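# A minimal sketch (illustrative only) of the dataframe schema this dataset
# expects; the column names mirror those used above, the row values are made up:
#
#   df = pd.DataFrame({
#       'startphrase': ["She spilled the beans."],
#       'ending1': ["She told the secret."],
#       'ending2': ["She dropped a can."],
#       'labels': [0],
#   })
#   item = MultipleChoiceDataset(df, tokenizer)[0]
#   # item['input_ids'] and item['attention_mask'] have shape [2, max_length];
#   # the default DataLoader collate stacks them to [batch, 2, max_length],
#   # which is the layout XLMRobertaForMultipleChoice expects.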
def evaluate(model, dataloader):
    """Run inference over `dataloader` and return predictions plus macro-averaged metrics."""
    model.eval()
    predictions = []
    true_labels = []
    for batch in tqdm(dataloader, desc="Evaluating"):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Forward pass without gradient tracking.
        with torch.no_grad():
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
            logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=1).tolist()
        predictions.extend(batch_predictions)
        true_labels.extend(batch['labels'].tolist())
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='macro')
    recall = recall_score(true_labels, predictions, average='macro')
    f1 = f1_score(true_labels, predictions, average='macro')
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
    return predictions, metrics
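# Note: for inputs of shape [batch, num_choices, seq_len],
# XLMRobertaForMultipleChoice returns logits of shape [batch, num_choices],
# so the argmax over dim=1 above picks the predicted ending (0 or 1) per example.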
def main(df_train, df_test, model_name, lang, k, num_epochs=20, save_model_flag=True):
    model_path = "model_large.pkl"
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
    # Check if a previously trained model file exists
    if os.path.exists(model_path):
        print("Loading model from saved file.")
        # The file holds the full pickled model object, not just a state_dict.
        model = torch.load(model_path)
        model.to(device)
    else:
        print("Model file not found. Training new model.")
        model = XLMRobertaForMultipleChoice.from_pretrained(model_name)
        model.to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)
        train_dataset = MultipleChoiceDataset(df_train, tokenizer)
        train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
        # Training loop
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
                # `key`/`val` avoid shadowing the fold-index parameter `k` above.
                batch = {key: val.to(device) for key, val in batch.items()}
                optimizer.zero_grad()
                outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
                loss = outputs.loss
                total_loss += loss.item()
                loss.backward()
                optimizer.step()
            print(f"Training loss: {total_loss / len(train_dataloader)}")
        if save_model_flag:
            torch.save(model, model_path)
            print(f"Model saved to {model_path}")
    # Evaluation on the test set
    test_dataset = MultipleChoiceDataset(df_test, tokenizer)
    test_dataloader = DataLoader(test_dataset, batch_size=8)
    test_predictions, metrics = evaluate(model, test_dataloader)
    # Save metrics to file
    os.makedirs("outputs", exist_ok=True)
    metrics_path = f"outputs/metrics_large_{lang}_{k}.txt"
    with open(metrics_path, "w") as f:
        for key, value in metrics.items():
            f.write(f"{key.capitalize()}: {value}\n")
    print(f"Metrics saved to {metrics_path}")
    # Append predictions to the test dataframe and save (CSV content, .txt extension)
    predictions_path = f"outputs/with_predictions_large_{lang}_{k}.txt"
    df_test['predictions'] = test_predictions
    df_test.to_csv(predictions_path, index=False)
    print(f"Test dataframe with predictions saved to {predictions_path}")
if __name__ == "__main__":
    # Placeholder paths for your dataset files
    df_train = pd.read_csv("test_data/en_train.csv")
    df_test = pd.read_csv("test_data/yo.csv")
    lang = "yo"
    k = "0"
    model_name = "xlm-roberta-large"
    main(df_train, df_test, model_name, lang, k)
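# This configuration matches the zero-shot setting suggested by the filename:
# fine-tune on English (en_train.csv) and evaluate directly on Yoruba (yo.csv).
# k = "0" is presumably the number of target-language shots; it is used only
# in the output filenames.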