
Commit 393575d: Update bert_colab.py

escrogar authored Nov 2, 2021
1 parent da7878f
Showing 1 changed file (bert_colab.py) with 43 additions and 50 deletions.
# There are several ways to connect files to a Google Colab notebook (Google Drive or direct upload); this example uses your own Google Drive account.
# These are the packages necessary for the script.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  # sklearn prints many FutureWarnings; this option suppresses them
import torch
from transformers import AutoTokenizer, AutoModel

import pandas as pd
import numpy as np
import os
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import json


drive.mount('/content/gdrive')
corpus = pd.read_csv('gdrive/My Drive/corpus.tsv', sep='\t')  # you need to provide your corpus file here
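# Alternative (a sketch, assuming you prefer a one-off upload over mounting Drive):
# Colab's upload widget places files in the working directory.
# from google.colab import files
# uploaded = files.upload()
# corpus = pd.read_csv('corpus.tsv', sep='\t')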


# different setups might need different solutions here
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DIVISOR = 200 # batch length for the model - can be arbitrarily low if computational setup is weak
tokenizer = AutoTokenizer.from_pretrained('SZTAKI-HLT/hubert-base-cc') # here you can change your preferred pretrained model
model = AutoModel.from_pretrained('SZTAKI-HLT/hubert-base-cc') # here you can change your preferred pretrained model

# tokenize corpus
# note: it does not check for length requirements at this time

tokenized = corpus["text"].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
print(tokenized)
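# Caveat: BERT-type models accept at most 512 tokens per text. A minimal sketch of
# truncation (assumption: cutting long texts is acceptable for your use case):
# tokenized = corpus["text"].apply(
#     lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=512))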

# create padding and attention masks based on automatic token length measurement
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
print(padded)

print(max_len)
attention_mask = np.where(padded != 0, 1, 0)
print(attention_mask)
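# e.g. a padded row [101, 2054, 102, 0, 0] yields the mask [1, 1, 1, 0, 0],
# so the model attends only to real tokens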

# for computationally weaker setups, batch execution is the only way to process the texts
# manipulate floor divisor if a different batch size is needed
batchsize = (len(corpus) // DIVISOR) + 1
print('Number of batches:', batchsize)
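# e.g. a corpus of 1000 texts with DIVISOR = 200 gives (1000 // 200) + 1 = 6 batches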
splitpadded = np.array_split(padded, batchsize)
splitmask = np.array_split(attention_mask, batchsize)


last_hidden_states = []
model = model.to(device)

DIMS = 768 # 768 at most (because of using BERT Base, otherwise 1024 for Large models)
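# A hedged alternative to hard-coding: read the width from the loaded model itself
# DIMS = model.config.hidden_size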

featuresfinal = np.empty((0, DIMS), dtype='float32')  # accumulator for the whole corpus

# take batches of tokenized texts
# to extract BERT's last hidden states, i.e. contextual word embeddings
#
# XXX handling attention_mask was erroneous here,
# because array_split() gives variable length!
# now: zip() ensures that text and attention data is taken strictly in parallel
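# illustration: np.array_split(np.arange(10), 3) yields pieces of length 4, 3, 3 -
# lengths vary, so batch and mask must be consumed strictly pairwise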
for count, (batch, mask) in enumerate(zip(splitpadded, splitmask)):
    batch_cnt = count + 1
    print(f'Batch #{batch_cnt}')
    # (reconstructed from collapsed diff context) build tensors for this batch
    input_batch = torch.tensor(batch)
    mask_batch = torch.tensor(mask)
    print('Batches established!')

    # put data onto GPU
    input_batch = input_batch.to(device)
    mask_batch = mask_batch.to(device)
    print('Lengths', input_batch.size(0), mask_batch.size(0))

    # no_grad ensures there is no gradient update in the model,
    # as we are not looking for recursive training here
    with torch.no_grad():
        print('Model is running on', model.device)
        last_hidden_states = model(input_batch, attention_mask=mask_batch)
        print('Hidden states created for batch', batch_cnt)

    # tensor dimensions: 0=sents, 1=words, 2=coords
    lhs = last_hidden_states[0][:, :, 0:DIMS].cpu().numpy()  # this part can be manipulated not to use means
    features = np.mean(lhs, axis=1)  # average above words
    print(features.shape)

    featuresfinal = np.append(featuresfinal, features, axis=0)
    print('Finished with batch', batch_cnt)
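# Sketch of an alternative pooling (assumption: you want the [CLS] vector instead of
# the word-mean); inside the loop, the pooling lines would become:
#     features = last_hidden_states[0][:, 0, 0:DIMS].cpu().numpy()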


# this snippet can save the features and labels to your drive, keeping them for later use
from google.colab import files
np.save("featuresfinal", featuresfinal)
!cp featuresfinal.npy "/content/gdrive/My Drive/"
np.save("labels", corpus["topik"])
!cp labels.npy "/content/gdrive/My Drive/"
featuresfinal = np.load("/content/gdrive/My Drive/featuresfinal.npy")


# MinMax scaling is applied to the features, as this helps (as usual with many ML applications)
scaler = MinMaxScaler()
featuresfinal = scaler.fit_transform(featuresfinal)
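# Caveat (an assumption about your evaluation needs): fitting the scaler on the full
# matrix leaks test information into training; a stricter sketch would be
# scaler = MinMaxScaler().fit(train_features)
# train_features = scaler.transform(train_features)
# test_features = scaler.transform(test_features)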

# the parameter space is defined below - follow the readme on how to update these
C = [0.1, 1]
tol = [0.001, 0.005, 0.01]
weighting = ['balanced']
# (reconstructed from collapsed diff context - assumption: the grid is assembled
# from the parameter lists above)
parameters = dict(C=C, tol=tol, class_weight=weighting)
clasrep = list()
paramlist = list()



labels = corpus["topik"].to_numpy()
#labels = np.load("labels.npy")

# this is in essence a repeated k-fold cross validated Logistic Regression, with a list of dicts output
for i in range(3):
    train_features, test_features, train_labels, test_labels = train_test_split(featuresfinal, labels, stratify=labels)
    lr = LogisticRegression()
    # (reconstructed from collapsed diff context - assumption: grid search, fit,
    # predict, and collect the classification report as a dict)
    lrmodel = GridSearchCV(lr, parameters)
    lrmodel.fit(train_features, train_labels)
    predictions = lrmodel.predict(test_features)
    clasrep.append(classification_report(test_labels, predictions, output_dict=True))
    paramlist.append(lrmodel.best_params_)
    print("Finished with run!")


keylist = list(clasrep[0].keys())
results = pd.DataFrame()

# (reconstructed from collapsed diff context - assumption: each run's classification
# report is flattened into one row of the results table)
for rep in clasrep:
    results = results.append(pd.io.json.json_normalize(rep), ignore_index=True)

results.mean()

# this normalizes the best-parameter list as if it were a json import
results = pd.io.json.json_normalize(paramlist)
results.mean()

# you can save the classification reports and the best parameter lists as json files
MyFile = open('clasrep_bert.json', 'w')
json.dump(clasrep, MyFile)
MyFile.close()

MyFile = open('param_bert.json', 'w')
json.dump(paramlist, MyFile)
MyFile.close()

# if you use Colab, you must copy the json files to your Drive storage or download them, otherwise they are lost when the runtime disconnects
!cp clasrep_bert.json "/content/gdrive/My Drive/"
!cp param_bert.json "/content/gdrive/My Drive/"
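# Alternatively (a sketch, if you want a local copy instead of Drive):
# from google.colab import files
# files.download('clasrep_bert.json')
# files.download('param_bert.json')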
