# import preprocessing code
from src.preprocess import PreProcessor, df_to_train_set

# save paths to the available datasets
from typing import NamedTuple, List


class Dataset(NamedTuple):
    """
    Interface for accessing data folders.
    """
    title: str
    preprocessed_folder: str
    raw_folders: List[str]


SAMPLE_DATA = Dataset(
    title="sample_data",
    preprocessed_folder="../de-ID_data/preprocessed/sample_data/",
    raw_folders=["docs/Track1-de-indentification/PHI/"]
)
GOLD_1 = Dataset(
    title="gold_1",
    preprocessed_folder="../de-ID_data/preprocessed/gold_1/",
    raw_folders=["../de-ID_data/raw/training-PHI-Gold-Set1/"]
)
GOLD_FULL = Dataset(
    title="gold_full",
    preprocessed_folder="../de-ID_data/preprocessed/gold_full/",
    raw_folders=["../de-ID_data/raw/training-PHI-Gold-Set1/", "../de-ID_data/raw/training-PHI-Gold-Set2/"]
)
GOLD_TEST = Dataset(
    title="gold_test",
    preprocessed_folder="../de-ID_data/preprocessed/gold_test/",
    raw_folders=["../de-ID_data/raw/testing-PHI-Gold-fixed/"]
)
DATASETS = [SAMPLE_DATA, GOLD_1, GOLD_FULL, GOLD_TEST]

# pick the train/test datasets and set the loading flag:
# isLoading = True reads previously preprocessed data from preprocessed_folder;
# isLoading = False preprocesses from scratch out of raw_folders
train_data = DATASETS[2]  # gold_full
# train_data = DATASETS[0]  # sample_data
test_data = DATASETS[3]  # gold_test
isLoading = True

# attach data to PreProcessor object
pp = PreProcessor(train_data.title)
if isLoading:
    X_train, y_train, X_train_words, df_train = pp.get_data(train_data.preprocessed_folder, isLoading=isLoading)
else:
    X_train, y_train, X_train_words, df_train = pp.get_data(train_data.raw_folders, isLoading=isLoading)
print("max length:", pp.max_len)

# load test set
if isLoading:
    X_test, y_test, X_test_words, df_test = pp.create_test_set(test_data.preprocessed_folder, isLoading, test_data.title)
else:
    X_test, y_test, X_test_words, df_test = pp.create_test_set(test_data.raw_folders, isLoading, test_data.title)
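
# Sanity-check sketch (assumption: get_data/create_test_set return array-likes
# with a .shape attribute; uncomment to verify that train and test agree on the
# padded sequence length pp.max_len):
# print("train:", X_train.shape, y_train.shape)
# print("test: ", X_test.shape, y_test.shape)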

# import model classes plus training, visualization, and conversion helpers
from src.models.baseline import BaselineModel
from src.models.bilstm import BiLSTM
from src.models.bilstm_crf import BiLSTM_CRF
from src.models.transformer import Transformer
from src.models.transformer_crf import Transformer_CRF
from src.models.bilstm_chars import BiLSTM_Chars
from src.models.bilstm_chars_crf import BiLSTM_Chars_CRF
from pipeline.visualization import sample_output
from pipeline.train import train
from src.converter import get_label_positions, bio_to_i2d2

import os
import xml.etree.ElementTree as ET
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle

# check that TensorFlow was built with CUDA and at least one GPU is visible
assert tf.test.is_built_with_cuda()
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(physical_devices))
# note: RunOptions(report_tensor_allocations_upon_oom=True) only has an effect
# when passed to a run call; constructing it by itself is a no-op
tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom=True)
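
# Optional sketch: on a shared GPU it can help to enable memory growth so TF
# allocates VRAM on demand instead of grabbing it all up front. This uses the
# standard tf.config API and must run before the GPU is first used; uncomment
# if OOM errors appear.
# for gpu in physical_devices:
#     tf.config.experimental.set_memory_growth(gpu, True)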

# build model (uncomment exactly one alternative to swap architectures)
# model = BaselineModel(pp.vocab_size, pp.tag_size, pp.max_len)
# model = BiLSTM(pp.vocab_size, pp.tag_size, pp.max_len)
# model = BiLSTM_CRF(pp.vocab_size, pp.tag_size, pp.max_len)
# model = Transformer(pp.vocab_size, pp.tag_size, pp.max_len)
# model = Transformer_CRF(pp.vocab_size, pp.tag_size, pp.max_len)
# model = BiLSTM_Chars(pp.vocab_size, pp.tag_size, pp.max_len, pp.idx2word)
model = BiLSTM_Chars_CRF(pp.vocab_size, pp.tag_size, pp.max_len, pp.idx2word)
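# (Judging by the class names, BiLSTM_Chars_CRF stacks character-level
# features onto the word-level BiLSTM, which is presumably why it takes
# pp.idx2word to map word indices back to characters, and decodes the tag
# sequence with a CRF layer on top.)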

# configure checkpoints and checkpoint manager
checkpoint_dir = 'models/checkpoints/' + train_data.title + '/' + model.title + '/'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
checkpoint = tf.train.Checkpoint(model=model)
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=10)
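# (The manager is also handed to train() below, presumably so the training
# loop can call manager.save() periodically and long runs can be resumed.)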

# restore the latest checkpoint if one exists; restore(None) is a no-op,
# so a fresh run simply starts from scratch
checkpoint.restore(manager.latest_checkpoint)
if manager.latest_checkpoint:
    print("Restored from {}".format(manager.latest_checkpoint))

# train
print("Training", model.title)
losses = train(model, X_train, y_train, X_train_words, batch_size=32, epochs=10, lr=0.0005, sample_interval=10, manager=manager, pp=pp)
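
# Optional sketch: plot the loss curve returned by train(). This assumes
# `losses` is a flat sequence of scalar loss values (per step or per epoch);
# adjust the x-label if train() returns something more structured.
plt.plot(losses)
plt.xlabel("training step")
plt.ylabel("loss")
plt.title(model.title + " training loss")
plt.show()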

# sample a random output
sample_output(model, X_train, y_train, pp=pp, rand_idx=None)

# test model
from pipeline.test import test_to_i2d2

test_to_i2d2(model, df_test, pp, checkpoint, manager)