forked from ThilinaRajapakse/BERT_binary_text_classification
-
Notifications
You must be signed in to change notification settings - Fork 0
/
converter.py
77 lines (57 loc) · 2.55 KB
/
converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
'''
If you are having trouble multiprocessing inside Notebooks, give this script a shot.
'''
import torch
from tqdm import tqdm
from pytorch_pretrained_bert import BertTokenizer
from multiprocessing import Pool, cpu_count
import pickle
from tools import *
import convert_examples_to_features
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Paths -----------------------------------------------------------------
# Input data directory; expected to hold the .tsv files (or other data files)
# for the task.
DATA_DIR = "data/"

# --- Model / task selection --------------------------------------------------
# Pre-trained BERT checkpoint name. Options listed by the original author:
# bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased,
# bert-base-multilingual-uncased, bert-base-multilingual-cased,
# bert-base-chinese.
BERT_MODEL = "bert-base-cased"

# Name of the task to train.
TASK_NAME = "yelp"

# Directory where model predictions and checkpoints will be written.
OUTPUT_DIR = "outputs/" + TASK_NAME + "/"
CACHE_DIR = "cache/"

# --- Tokenization ------------------------------------------------------------
# Maximum total input sequence length after WordPiece tokenization.
# Longer sequences are truncated; shorter ones are padded.
MAX_SEQ_LENGTH = 128

# --- Training hyper-parameters -----------------------------------------------
TRAIN_BATCH_SIZE = 24
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1
RANDOM_SEED = 42
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_PROPORTION = 0.1

# --- Output handling ---------------------------------------------------------
OUTPUT_MODE = "classification"
CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"

# Lower-case alias of OUTPUT_MODE, passed along with every example further
# down in this script.
output_mode = OUTPUT_MODE
# Load the pre-trained tokenizer (vocabulary) for the configured checkpoint.
# BERT_MODEL is used instead of a second hard-coded checkpoint name so that
# changing the constant also changes the tokenizer.
# NOTE(review): do_lower_case=False suits a cased checkpoint; it presumably
# has to be flipped if BERT_MODEL is switched to an uncased model - confirm.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=False)

# Read the training examples through the task processor.
processor = BinaryClassificationProcessor()
train_examples = processor.get_train_examples(DATA_DIR)
train_examples_len = len(train_examples)

# Map every label to its index in the processor's label list.
label_list = processor.get_labels()  # [0, 1] for binary classification
num_labels = len(label_list)
label_map = {label: i for i, label in enumerate(label_list)}

# One argument tuple per example; each tuple is handed as a single argument
# to the conversion function by the worker pool.
train_examples_for_processing = [
    (example, label_map, MAX_SEQ_LENGTH, tokenizer, output_mode)
    for example in train_examples
]

# Leave two cores free for the rest of the system, but never request fewer
# than one worker: on machines with one or two cores the plain subtraction
# gives 0 or -1, and Pool() rejects a process count below 1 with ValueError.
process_count = max(1, cpu_count() - 2)
# Running time on Ryzen 7 2700x with these settings is about 1 hour
if __name__ == '__main__':
    # Announce the workload before the long-running conversion starts.
    print(f'Preparing to convert {train_examples_len} examples..')
    print(f'Spawning {process_count} processes..')

    # Function applied to each prepared argument tuple by the workers.
    convert = convert_examples_to_features.convert_example_to_feature

    with Pool(process_count) as pool:
        # Feed the prepared tuples through the pool, wrap the result iterator
        # in tqdm for a progress bar, and collect everything into a list.
        feature_stream = pool.imap(convert, train_examples_for_processing)
        progress = tqdm(feature_stream, total=train_examples_len)
        train_features = list(progress)

    # Persist the converted features to the current working directory.
    with open("train_features.pkl", 'wb') as out_file:
        pickle.dump(train_features, out_file)