added RGCN training and pipeline
CS21B008 committed Apr 26, 2024
1 parent 69897df commit 0ea86dd
Showing 2 changed files with 341 additions and 0 deletions.
133 changes: 133 additions & 0 deletions MiFeMoDEP/SourceCode/complete_pipeline_MiFeMoDEP_for_single_input.py
@@ -0,0 +1,133 @@
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import MinMaxScaler
from networkx.drawing.nx_pydot import read_dot
from get_CodeBERT_embeddings import get_CodeBERT_context_embeddings
from train_PCA_for_MiFeMoDEP import PCA_single_input
from RGCN import RGCN
from rgcn_training import get_graph_data
from LIME_for_MiFeMoDEP import preprocess_feature_from_explainer, add_agg_scr_to_list
import pickle
import dill
import subprocess
import torch, os
import numpy as np, pandas as pd

filepath = './single.py'
with open(filepath, 'r') as f:
    code_string = f.read()

# get the CodeBERT embeddings and perform PCA on them
cb_tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base')
cb_model = AutoModel.from_pretrained('microsoft/codebert-base')
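# CodeBERT (a RoBERTa-base model) yields 768-dim hidden states per token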
max_len = 10000
pca = pickle.load(open('./MiFeMoDEP_PCA.pkl', 'rb'))
num_features = 1450000

cb_embeds = get_CodeBERT_context_embeddings(cb_tokenizer, cb_model, code_string, max_len)
cb_embeds = cb_embeds.detach().numpy().reshape(1, -1)
cb_embeds = PCA_single_input(pca, cb_embeds, num_features)
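# cb_embeds is now a single 1x128 vector, matching the 128-dim CPG embedding below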

# extract the Code Property Graph using JOERN and encode it using an RGCN
in_channels = 150
hidden_channels = 100
out_channels = 50
num_relations = 21
truncate_size = 2000
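# 150-dim Doc2Vec node features, 21 relation types (20 Joern edge labels + 1 fallback),
# and graphs truncated/zero-padded to 2000 nodes (see rgcn_training.py)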
RGCN_model = RGCN(in_channels, hidden_channels, out_channels, num_relations)
RGCN_model_weights = torch.load('./MiFeMoDEP_SourceCode_CPG_Enc.pt')
RGCN_model.load_state_dict(RGCN_model_weights)

graph_dir_path = ""
def extract_graph_from_file(filename, output_name):
result = subprocess.run(["joern-parse", filename], capture_output=True)
output = result.stdout.decode()
if result.returncode == 0:
subprocess.run(["joern-export", "--repr=all", "--out", graph_dir_path+output_name])


# remove any stale export from a previous run before re-extracting
if os.path.isdir("./test_graph"):
    if os.path.exists("./test_graph/export.dot"):
        os.remove("./test_graph/export.dot")
    os.rmdir("./test_graph")

extract_graph_from_file(filepath, "test_graph")

graph_path = graph_dir_path+"test_graph/export.dot"

graph = read_dot(graph_path)
node_features, edge_index, edge_types = get_graph_data(graph,truncate_size,padding=True) # includes Doc2Vec for node embedding
cpg_embeds = RGCN_model(node_features, edge_index.to(torch.int64), edge_types.to(torch.int64)).detach().numpy().reshape(1, 128)

# combine the embeddings and pass it to an RF Classifier
embeds = np.vstack((cb_embeds, cpg_embeds)).reshape(1, 128*2)  # stack the two 128-dim embeddings into one 256-dim vector
clf = pickle.load(open('./MiFeMoDEP_SourceCode_RF.pkl', 'rb'))
y_pred = clf.predict(embeds)
if y_pred[0] == 1:
# use LIME explainer
explainer = dill.load(open('LIME_for_MiFeMoDEP.pkl', 'rb'))
exp = explainer.explain_instance(
embeds.reshape(256),
clf.predict_proba,
num_features=256,
top_labels=1,
num_samples=5000
)
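    # num_features=256 asks LIME to score every dimension of the 256-dim input;
    # num_samples=5000 perturbed samples are drawn to fit the local surrogate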

    top_k_tokens = np.arange(10, 201, 10)
    agg_methods = ['avg', 'median', 'sum']
    max_str_len_list = 100
    max_tokens = 100
    line_score_df_col_name = (
        ['total_tokens', 'line_level_label', 'line_num']
        + ['token' + str(i) for i in range(1, max_str_len_list + 1)]
        + [agg + '-top-' + str(k) + '-tokens' for agg in agg_methods for k in top_k_tokens]
        + [agg + '-all-tokens' for agg in agg_methods]
    )
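    # one row per source line: the raw line, its token count, 100 per-token LIME
    # scores, and avg/median/sum aggregates over the top-k and over all tokens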

line_score_df = pd.DataFrame(columns=line_score_df_col_name)
line_score_df = line_score_df.set_index('line_num')

sorted_feature_score_dict, tokens_list = preprocess_feature_from_explainer(exp)

    code_lines = code_string.splitlines()
    for line_num, line in enumerate(code_lines):
        if type(line) == float:  # defensive guard: NaN lines appear as floats when read via pandas
            line = ""

        line_stuff = []
        line_score_list = np.zeros(max_tokens)
        token_list = line.split()[:max_tokens]
        line_stuff.append(line)
        line_stuff.append(len(token_list))

        # per-token LIME scores for this line
        for tok_idx, tok in enumerate(token_list):
            score = sorted_feature_score_dict.get(tok, 0)
            line_score_list[tok_idx] = score

        line_stuff = line_stuff + list(line_score_list)

        # aggregate scores restricted to the globally top-k ranked LIME tokens
        for k in top_k_tokens:
            top_tokens = tokens_list[:k-1]
            top_k_scr_list = []

            if len(token_list) < 1:
                top_k_scr_list.append(0)
            else:
                for tok in token_list:
                    score = 0
                    if tok in top_tokens:
                        score = sorted_feature_score_dict.get(tok, 0)
                    top_k_scr_list.append(score)

            add_agg_scr_to_list(line_stuff, top_k_scr_list)

        # aggregate scores over all tokens of the line
        add_agg_scr_to_list(line_stuff, list(line_score_list[:len(token_list)]))
        line_score_df.loc[line_num] = line_stuff
line_score_df.to_csv('./single_df.csv')
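    # rank lines by the median LIME score over all of their tokens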
scr_df = line_score_df['median-all-tokens'].values.tolist()

    scaler = MinMaxScaler()
    line_score = scaler.fit_transform(np.array(scr_df).reshape(-1, 1))
    line_df = pd.DataFrame()
    line_df['scr'] = [float(val.item()) for val in line_score]
    line_df = line_df.sort_values(by='scr', ascending=False)  # highest score = most suspicious

    buggy_order = [idx+1 for idx, row in line_df.iterrows()]  # 1-based line numbers
    print('The possible buggy lines in order of most to least probable:\n', buggy_order)

else:
print('No buggy lines')
208 changes: 208 additions & 0 deletions MiFeMoDEP/SourceCode/rgcn_training.py
@@ -0,0 +1,208 @@
import time
import os, pickle
import torch
import pandas as pd
import numpy as np
from torch import nn
from networkx.drawing.nx_pydot import read_dot

from gensim.models.doc2vec import Doc2Vec

from torch_geometric.data import Dataset
from torch_geometric.nn import RGCNConv, SAGPooling

# Define device for computations (CPU or GPU)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print("device: ",device)

path_to_doc2vec = "./"
doc2vec_model = Doc2Vec.load(path_to_doc2vec+"doc2vec_model_nodes_from_graphs.bin")

e_types = ['RECEIVER', 'CDG', 'CFG', 'CONDITION', 'BINDS', 'REACHING_DEF', 'PARAMETER_LINK', 'IS_CALL_FOR_IMPORT', 'POST_DOMINATE', 'AST', 'CALL', 'REF', 'CONTAINS', 'INHERITS_FROM', 'TAGGED_BY', 'SOURCE_FILE', 'ARGUMENT', 'CAPTURE', 'DOMINATE', 'EVAL_TYPE']
def get_edge_num(edge_type):
    # map a Joern edge label to its index; unseen labels map to 20 (hence num_relations = 21)
    if edge_type in e_types:
        return e_types.index(edge_type)
    return 20
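# e.g. get_edge_num('AST') -> 9, get_edge_num('SOME_NEW_LABEL') -> 20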

def get_graph_data(graph, truncate_size, padding=False):
    edges = graph.edges(data=True)
    nodes = graph.nodes(data=True)

    # embed each node's attribute dict with Doc2Vec, truncating to truncate_size nodes
    mapping = {}
    dictionary_list = []
    i = 0
    for node in nodes:
        if i >= truncate_size:
            break
        num, dic = node
        mapping[num] = i
        dictionary_list.append(doc2vec_model.infer_vector([str(dic)]))
        i += 1
    nodes_features = torch.Tensor(np.array(dictionary_list))

    # build the edge index and edge-type lists, skipping edges into truncated nodes
    edge_index_1 = []
    edge_index_2 = []
    edge_type = []
    for edge in edges:
        t0, t1, t2 = edge
        if t0 not in mapping or t1 not in mapping:
            continue
        edge_index_1.append(int(mapping[t0]))
        edge_index_2.append(int(mapping[t1]))
        edge_type.append(get_edge_num(t2['label']))
    edge_index = [edge_index_1, edge_index_2]
    edge_idx = torch.Tensor(np.array(edge_index)).to(torch.int64)
    edge_type = torch.Tensor(np.array(edge_type)).to(torch.int64)

    # zero-pad the node feature matrix up to truncate_size rows of 150-dim vectors
    while padding and i < truncate_size:
        nodes_features = torch.cat((nodes_features, torch.zeros(1, 150)), 0)
        i += 1

    return nodes_features, edge_idx, edge_type
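# For truncate_size=2000 with padding=True, the returned shapes are
# nodes_features: (2000, 150), edge_idx: (2, num_edges), edge_type: (num_edges,)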

if __name__ == "__main__":

truncate_size = 2000

class RGCNDataset(Dataset):
def __init__(self, edge_index_list=[], edge_types_list=[], node_features_list=[], labels_list=[]):
super(RGCNDataset, self).__init__()
self.edge_index_list = edge_index_list
self.node_features_list = node_features_list
self.edge_types_list = edge_types_list
self.labels_list = labels_list

def __len__(self):
return len(self.edge_index_list) # Length based on number of graphs

def __getitem__(self, idx):
edge_index = self.edge_index_list[idx]
edge_types = self.edge_types_list[idx]
node_features = self.node_features_list[idx]
labels = self.labels_list[idx]
return edge_index, edge_types, node_features, labels

in_channels = 150
hidden_channels = 100
out_channels = 50
num_relations = 21
truncate_size = 2000

edge_index_list = [0]*500
edge_types_list = [0]*500
node_features_list = [0]*500
labels_list = [0]*500
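    # one slot per graph in the balanced 500-file test set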

test_df = pd.read_csv("../Documents/cs21b059/MiFeMoDEP/balanced_500_test_source_code.csv")

def get_node_and_edge_encodings():
root_dir_graphs = "./preprocess_graphs_500/"
count = 0
        # each graph lives in a numbered subdirectory preprocess_graphs_500/<i>/,
        # where <i> is the row index of the file in test_df
        for root, dirs, files in os.walk(root_dir_graphs):
            for gfile in files:
                start = time.time()
                path = os.path.join(root, gfile)
                name = root.replace(root_dir_graphs, '')
                i = int(name)
                graph = read_dot(path)
node_features,edge_index,edge_types = get_graph_data(graph,truncate_size,padding=True)
label = test_df['target'][i]
edge_index_list[i] = edge_index
node_features_list[i] = node_features
edge_types_list[i] = edge_types
labels_list[i] = label
end = time.time()
print("Time taken for one graph: ",end-start,"sec")
count += 1

with open('test_edge_index.pkl', 'wb') as f:
pickle.dump(edge_index_list, f)

with open('test_edge_types_list.pkl', 'wb') as f:
pickle.dump(edge_types_list, f)

with open('test_node_features_list.pkl', 'wb') as f:
pickle.dump(node_features_list, f)

with open('test_labels_list.pkl', 'wb') as f:
pickle.dump(labels_list, f)

print(count)

    def train_RGCN():
        # load the cached graph encodings and labels
        edge_index_list = pickle.load(open('./edge_index.pkl', 'rb'))
        edge_types_list = pickle.load(open('./edge_types_list.pkl', 'rb'))
        node_features_list = pickle.load(open('./node_features_list.pkl', 'rb'))
        labels_list = pickle.load(open('./labels_list.pkl', 'rb'))

class rgcn_2000_nodes(torch.nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels, num_relations):
super().__init__()
self.in_channels = in_channels
self.conv1 = RGCNConv(in_channels, hidden_channels, num_relations)
self.sag_pool = SAGPooling(hidden_channels, ratio=0.8)
self.conv2 = RGCNConv(hidden_channels, out_channels, num_relations)
self.lin = nn.Linear(out_channels*1600, 128)
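                # SAGPooling keeps ratio*2000 = 1600 nodes, so the flattened
                # conv2 output feeding the linear head is out_channels*1600 = 80000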

def forward(self, x, edge_index, edge_type):
x = self.conv1(x, edge_index, edge_type)
x, edge_index, edge_type, _, _, _ = self.sag_pool(x, edge_index, edge_type)
x = self.conv2(x, edge_index, edge_type)
x = x.view(x.size(0)*x.size(1))
x = self.lin(x)
return x

class NNClassifier(nn.Module):
def __init__(self):
super(NNClassifier, self).__init__()

self.l1 = nn.Linear(128, 64)
self.l2 = nn.Linear(64, 1)
self.leakyrelu = nn.LeakyReLU()
self.sigmoid = nn.Sigmoid()
self.rgcn_model = rgcn_2000_nodes(in_channels, hidden_channels, out_channels, num_relations)

def forward(self, node_features, edge_index, edge_types):
x = self.rgcn_model(node_features, edge_index, edge_types)
x = self.l1(x)
x = self.leakyrelu(x)
x = self.l2(x)
pred = self.sigmoid(x)
return pred
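        # only the RGCN encoder's weights are reused by the inference pipeline;
        # the linear/sigmoid head exists solely to train it with BCE on file labels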

learning_rate = 0.001
num_epochs = 100

rgcn_dataset = RGCNDataset(edge_index_list,edge_types_list,node_features_list,labels_list)
model = NNClassifier()
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
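        # train one graph at a time (batch size 1): each sample is a full CPG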

        for epoch in range(num_epochs):
            start = time.time()
            for i in range(len(rgcn_dataset)):
                pred = model(node_features_list[i], edge_index_list[i].to(torch.int64), edge_types_list[i].to(torch.int64))
                loss = loss_fn(pred, torch.Tensor([labels_list[i]]))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if i % 100 == 0:
                    # periodic checkpoint; the encoder file name matches what the
                    # pipeline loads (MiFeMoDEP_SourceCode_CPG_Enc.pt)
                    print(i)
                    torch.save(model.state_dict(), "./MiFeMoDEP_SourceCode.pt")
                    torch.save(model.rgcn_model.state_dict(), './MiFeMoDEP_SourceCode_CPG_Enc.pt')

            print(f"Epoch {epoch} --> {time.time()-start}")
        torch.save(model.state_dict(), "./MiFeMoDEP_SourceCode.pt")
        torch.save(model.rgcn_model.state_dict(), './MiFeMoDEP_SourceCode_CPG_Enc.pt')
