# File: MiFeMoDEP/SourceCode/complete_pipeline_MiFeMoDEP_for_single_input.py
"""Run the MiFeMoDEP source-code pipeline on a single file.

Steps, in order:
  1. Embed the file's text with CodeBERT and reduce it with a pickled PCA.
  2. Export the file's graph with joern and encode it with a trained RGCN.
  3. Concatenate both embeddings and classify them with a pickled classifier.
  4. If the file is predicted defective (label 1), explain the prediction with
     a pickled LIME explainer, score every line and print a line ranking.

All input/output locations are hard-coded relative paths; the script is meant
to be run from the directory that holds the model artifacts.
"""
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import MinMaxScaler
from networkx.drawing.nx_pydot import read_dot
from get_CodeBERT_embeddings import get_CodeBERT_context_embeddings
from train_PCA_for_MiFeMoDEP import PCA_single_input
from RGCN import RGCN
from rgcn_training import get_graph_data
from LIME_for_MiFeMoDEP import preprocess_feature_from_explainer, add_agg_scr_to_list
import pickle
import dill
import shutil
import subprocess
import torch
import os
import numpy as np
import pandas as pd

filepath = './single.py'
with open(filepath, 'r') as source_file:
    code_string = source_file.read()

# get the CodeBERT embeddings and perform PCA on them
cb_tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base')
cb_model = AutoModel.from_pretrained('microsoft/codebert-base')
max_len = 10000
# NOTE: pickle/dill execute code on load; only use artifacts you produced yourself.
with open('./MiFeMoDEP_PCA.pkl', 'rb') as pca_file:
    pca = pickle.load(pca_file)
num_features = 1450000

cb_embeds = get_CodeBERT_context_embeddings(cb_tokenizer, cb_model, code_string, max_len)
cb_embeds = cb_embeds.detach().numpy().reshape(1, -1)
cb_embeds = PCA_single_input(pca, cb_embeds, num_features)

# extract the Code Property Graph using JOERN and encode it using an RGCN
in_channels = 150
hidden_channels = 100
out_channels = 50
num_relations = 21
truncate_size = 2000
RGCN_model = RGCN(in_channels, hidden_channels, out_channels, num_relations)
# NOTE(review): rgcn_training.py saves its encoder as
# './MiFeMoDEP_SourceCode_PDG_Enc.pt'; confirm which file name is intended.
RGCN_model_weights = torch.load('./MiFeMoDEP_SourceCode_CPG_Enc.pt')
RGCN_model.load_state_dict(RGCN_model_weights)

graph_dir_path = ""


def extract_graph_from_file(filename, output_name):
    """Parse *filename* with joern and export its graph as DOT.

    Runs ``joern-parse`` on the file and then ``joern-export --repr=all``
    into ``graph_dir_path + output_name``.

    Raises:
        RuntimeError: if ``joern-parse`` exits with a non-zero status.
        subprocess.CalledProcessError: if ``joern-export`` fails.
    """
    result = subprocess.run(["joern-parse", filename], capture_output=True)
    if result.returncode != 0:
        # Fail here with joern's own message instead of letting read_dot
        # fail later on a missing export.
        raise RuntimeError(
            f"joern-parse failed for {filename!r} "
            f"(exit code {result.returncode}): "
            f"{result.stderr.decode(errors='replace')}"
        )
    subprocess.run(
        ["joern-export", "--repr=all", "--out", graph_dir_path + output_name],
        check=True,
    )


# Remove the export of a previous run; rmtree also copes with a directory
# that contains more than just export.dot.
if os.path.isdir("./test_graph"):
    shutil.rmtree("./test_graph")

extract_graph_from_file(filepath, "test_graph")

graph_path = graph_dir_path + "test_graph/export.dot"

graph = read_dot(graph_path)
# includes Doc2Vec for node embedding
node_features, edge_index, edge_types = get_graph_data(graph, truncate_size, padding=True)
cpg_embeds = RGCN_model(
    node_features,
    edge_index.to(torch.int64),
    edge_types.to(torch.int64),
).detach().numpy().reshape(1, 128)

# combine the embeddings and pass it to an RF Classifier
embeds = np.vstack((cb_embeds, cpg_embeds)).reshape(1, 128*2)  # two 128-wide embeddings side by side
with open('./MiFeMoDEP_SourceCode_RF.pkl', 'rb') as clf_file:
    clf = pickle.load(clf_file)
y_pred = clf.predict(embeds)
if y_pred[0] == 1:
    # use LIME explainer
    with open('LIME_for_MiFeMoDEP.pkl', 'rb') as explainer_file:
        explainer = dill.load(explainer_file)
    exp = explainer.explain_instance(
        embeds.reshape(256),
        clf.predict_proba,
        num_features=256,
        top_labels=1,
        num_samples=5000
    )

    top_k_tokens = np.arange(10, 201, 10)
    agg_methods = ['avg', 'median', 'sum']
    max_str_len_list = 100
    max_tokens = 100
    line_score_df_col_name = (
        ['total_tokens', 'line_level_label', 'line_num']
        + ['token' + str(i) for i in range(1, max_str_len_list + 1)]
        + [agg + '-top-' + str(k) + '-tokens' for agg in agg_methods for k in top_k_tokens]
        + [agg + '-all-tokens' for agg in agg_methods]
    )

    line_score_df = pd.DataFrame(columns=line_score_df_col_name)
    line_score_df = line_score_df.set_index('line_num')

    sorted_feature_score_dict, tokens_list = preprocess_feature_from_explainer(exp)

    # The top-k slices do not depend on the line, so build them once.
    # NOTE(review): the slice keeps k-1 tokens, not k; left unchanged so the
    # per-k columns stay comparable with earlier outputs. Confirm intent.
    top_tokens_per_k = [tokens_list[:k - 1] for k in top_k_tokens]

    code_lines = code_string.splitlines()
    for line_num, line in enumerate(code_lines):
        token_list = line.split()[:max_tokens]

        # One score slot per token position, zero-padded to max_tokens.
        line_score_list = np.zeros(max_tokens)
        for tok_idx, tok in enumerate(token_list):
            line_score_list[tok_idx] = sorted_feature_score_dict.get(tok, 0)

        # Row layout: line text, token count, per-token scores, aggregates.
        line_stuff = [line, len(token_list)] + list(line_score_list)

        # NOTE(review): values are appended k by k, while the column names
        # above are generated aggregate by aggregate; verify the labelling
        # of the '-top-k-' columns against add_agg_scr_to_list.
        for top_tokens in top_tokens_per_k:
            top_k_scr_list = [
                sorted_feature_score_dict.get(tok, 0) if tok in top_tokens else 0
                for tok in token_list
            ] or [0]  # a line without tokens contributes a single zero
            add_agg_scr_to_list(line_stuff, top_k_scr_list)

        add_agg_scr_to_list(line_stuff, list(line_score_list[:len(token_list)]))
        line_score_df.loc[line_num] = line_stuff

    line_score_df.to_csv('./single_df.csv')
    scr_df = line_score_df['median-all-tokens'].values.tolist()

    scaler = MinMaxScaler()
    line_score = scaler.fit_transform(np.array(scr_df).reshape(-1, 1))
    line_df = pd.DataFrame()
    line_df['scr'] = [float(val.item()) for val in line_score]
    # Highest score first so the printed order really is "most to least
    # probable"; the stable sort keeps equal scores in file order.
    line_df = line_df.sort_values(by='scr', ascending=False, kind='mergesort')

    buggy_order = [idx + 1 for idx in line_df.index.tolist()]
    print('The possible buggy lines in order of most to least probable:\n', buggy_order)

else:
    print('No buggy lines')

# File: MiFeMoDEP/SourceCode/rgcn_training.py
import time
import os, pickle
import torch
import torch.nn.functional as F
import pandas as pd
from torchvision import transforms
import numpy as np
from torch import nn
from networkx.drawing.nx_pydot import read_dot

from gensim.models.doc2vec import Doc2Vec

from torch_geometric.data import Dataset,Data
from torch_geometric.nn import FastRGCNConv, RGCNConv,SAGPooling
from torch_geometric.utils import k_hop_subgraph

# Define device for computations (CPU or GPU)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print("device: ",device)

path_to_doc2vec = "./"
doc2vec_model = Doc2Vec.load(path_to_doc2vec+"doc2vec_model_nodes_from_graphs.bin")

# Known edge labels; the position in this list is the relation id.
e_types = ['RECEIVER', 'CDG', 'CFG', 'CONDITION', 'BINDS', 'REACHING_DEF', 'PARAMETER_LINK', 'IS_CALL_FOR_IMPORT', 'POST_DOMINATE', 'AST', 'CALL', 'REF', 'CONTAINS', 'INHERITS_FROM', 'TAGGED_BY', 'SOURCE_FILE', 'ARGUMENT', 'CAPTURE', 'DOMINATE', 'EVAL_TYPE']

# Width used for node features when it cannot be read from an inferred
# vector (graph without nodes); matches the previously hard-coded padding.
_DEFAULT_NODE_FEATURE_DIM = 150

# Label -> relation id lookup, built once instead of scanning per edge.
_EDGE_TYPE_TO_NUM = {label: idx for idx, label in enumerate(e_types)}


def get_edge_num(edge_type):
    """Return the relation id of *edge_type*.

    Known labels map to their position in ``e_types``; every other label
    maps to ``len(e_types)`` (20), a shared "unknown" relation.
    """
    return _EDGE_TYPE_TO_NUM.get(edge_type, len(e_types))


def get_graph_data(graph, truncate_size, padding=False):
    """Convert a graph into the tensors consumed by the RGCN.

    Args:
        graph: graph object exposing ``nodes(data=True)`` and
            ``edges(data=True)`` (e.g. the result of ``read_dot``); every
            edge attribute dict must contain a ``'label'`` entry.
        truncate_size: maximum number of nodes kept; later nodes and every
            edge touching them are dropped.
        padding: when true, all-zero feature rows are appended until there
            are exactly ``truncate_size`` rows.

    Returns:
        ``(nodes_features, edge_idx, edge_type)`` where ``nodes_features`` is
        a float tensor with one Doc2Vec vector per kept node, ``edge_idx`` is
        an int64 tensor with a source row and a target row of node positions,
        and ``edge_type`` is an int64 tensor of relation ids.
    """
    # Map each kept node id to its row position and embed its attributes.
    mapping = {}
    vectors = []
    for position, (node_id, attributes) in enumerate(graph.nodes(data=True)):
        if position >= truncate_size:
            break
        mapping[node_id] = position
        vectors.append(doc2vec_model.infer_vector([str(attributes)]))
    num_nodes = len(vectors)

    if vectors:
        nodes_features = torch.Tensor(np.array(vectors))
    else:
        # Keep a 2-D shape so padding/concatenation works for empty graphs.
        nodes_features = torch.zeros((0, _DEFAULT_NODE_FEATURE_DIM))

    # Keep only edges whose two endpoints survived truncation.
    sources = []
    targets = []
    relation_ids = []
    for src, dst, attributes in graph.edges(data=True):
        if src not in mapping or dst not in mapping:
            continue
        sources.append(int(mapping[src]))
        targets.append(int(mapping[dst]))
        relation_ids.append(get_edge_num(attributes['label']))

    # Build integer tensors directly rather than via float32 and a cast.
    edge_idx = torch.tensor([sources, targets], dtype=torch.int64)
    edge_type = torch.tensor(relation_ids, dtype=torch.int64)

    if padding and num_nodes < truncate_size:
        # Single concatenation instead of one torch.cat per missing row.
        pad_rows = torch.zeros(
            (truncate_size - num_nodes, nodes_features.size(1)),
            dtype=nodes_features.dtype,
        )
        nodes_features = torch.cat((nodes_features, pad_rows), 0)

    return nodes_features, edge_idx, edge_type


if __name__ == "__main__":

    truncate_size = 2000

    class RGCNDataset(Dataset):
        """Index-aligned lists of per-graph tensors and labels."""

        def __init__(self, edge_index_list=None, edge_types_list=None, node_features_list=None, labels_list=None):
            super(RGCNDataset, self).__init__()
            # ``None`` defaults avoid sharing one list between instances.
            self.edge_index_list = [] if edge_index_list is None else edge_index_list
            self.node_features_list = [] if node_features_list is None else node_features_list
            self.edge_types_list = [] if edge_types_list is None else edge_types_list
            self.labels_list = [] if labels_list is None else labels_list

        def __len__(self):
            return len(self.edge_index_list)  # Length based on number of graphs

        def __getitem__(self, idx):
            """Return ``(edge_index, edge_types, node_features, label)`` for graph *idx*."""
            edge_index = self.edge_index_list[idx]
            edge_types = self.edge_types_list[idx]
            node_features = self.node_features_list[idx]
            labels = self.labels_list[idx]
            return edge_index, edge_types, node_features, labels

    in_channels = 150
    hidden_channels = 100
    out_channels = 50
    num_relations = 21
    truncate_size = 2000

    # Pre-sized so graphs can be stored by their numeric directory name.
    edge_index_list = [0]*500
    edge_types_list = [0]*500
    node_features_list = [0]*500
    labels_list = [0]*500

    test_df = pd.read_csv("../Documents/cs21b059/MiFeMoDEP/balanced_500_test_source_code.csv")

    def get_node_and_edge_encodings():
        """Encode every graph under ``./preprocess_graphs_500/`` and pickle the results.

        The directory part below the root must be an integer; it is used as
        the slot in the module-level lists and as the row of ``test_df``
        whose ``'target'`` value becomes the label. Four ``test_*.pkl``
        files are written to the working directory, and the time per graph
        and the number of processed files are printed.
        """
        root_dir_graphs = "./preprocess_graphs_500/"
        count = 0
        for root, _dirs, files in os.walk(root_dir_graphs):
            for gfile in files:
                start = time.time()
                path = os.path.join(root, gfile)
                # The sub-directory name is the sample index.
                i = int(root.replace(root_dir_graphs, ''))
                graph = read_dot(path)
                node_features, edge_index, edge_types = get_graph_data(graph, truncate_size, padding=True)
                edge_index_list[i] = edge_index
                node_features_list[i] = node_features
                edge_types_list[i] = edge_types
                labels_list[i] = test_df['target'][i]
                end = time.time()
                print("Time taken for one graph: ",end-start,"sec")
                count += 1

        with open('test_edge_index.pkl', 'wb') as f:
            pickle.dump(edge_index_list, f)

        with open('test_edge_types_list.pkl', 'wb') as f:
            pickle.dump(edge_types_list, f)

        with open('test_node_features_list.pkl', 'wb') as f:
            pickle.dump(node_features_list, f)

        with open('test_labels_list.pkl', 'wb') as f:
            pickle.dump(labels_list, f)

        print(count)

def train_RGCN():
    """Train the RGCN graph encoder together with a small binary classifier.

    Reads the pickled training lists (``edge_index.pkl``,
    ``edge_types_list.pkl``, ``node_features_list.pkl``, ``labels_list.pkl``)
    from the working directory, trains for 100 epochs with Adam and binary
    cross-entropy on one graph at a time, and writes two checkpoints: the
    whole classifier and the encoder alone.

    Relies on ``RGCNDataset``, ``in_channels``, ``hidden_channels``,
    ``out_channels`` and ``num_relations`` defined by the ``__main__``
    section of this file.
    """

    def _load_pickle(path):
        # Context manager so the file handle is closed right after loading.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    edge_index_list = _load_pickle('./edge_index.pkl')
    edge_types_list = _load_pickle('./edge_types_list.pkl')
    node_features_list = _load_pickle('./node_features_list.pkl')
    labels_list = _load_pickle('./labels_list.pkl')

    class rgcn_2000_nodes(torch.nn.Module):
        """Graph encoder: RGCN conv -> SAG pooling -> RGCN conv -> linear to 128."""

        def __init__(self, in_channels, hidden_channels, out_channels, num_relations):
            super().__init__()
            self.in_channels = in_channels
            self.conv1 = RGCNConv(in_channels, hidden_channels, num_relations)
            self.sag_pool = SAGPooling(hidden_channels, ratio=0.8)
            self.conv2 = RGCNConv(hidden_channels, out_channels, num_relations)
            # 1600 is the node count expected after pooling
            # (presumably 0.8 * 2000 padded nodes - confirm before changing
            # truncate_size or the pooling ratio).
            self.lin = nn.Linear(out_channels*1600, 128)

        def forward(self, x, edge_index, edge_type):
            x = self.conv1(x, edge_index, edge_type)
            x, edge_index, edge_type, _, _, _ = self.sag_pool(x, edge_index, edge_type)
            x = self.conv2(x, edge_index, edge_type)
            # Flatten all node embeddings into one vector for the linear layer.
            x = x.view(x.size(0)*x.size(1))
            x = self.lin(x)
            return x

    class NNClassifier(nn.Module):
        """Encoder followed by a 128 -> 64 -> 1 head with a sigmoid output."""

        def __init__(self):
            super(NNClassifier, self).__init__()

            self.l1 = nn.Linear(128, 64)
            self.l2 = nn.Linear(64, 1)
            self.leakyrelu = nn.LeakyReLU()
            self.sigmoid = nn.Sigmoid()
            self.rgcn_model = rgcn_2000_nodes(in_channels, hidden_channels, out_channels, num_relations)

        def forward(self, node_features, edge_index, edge_types):
            x = self.rgcn_model(node_features, edge_index, edge_types)
            x = self.l1(x)
            x = self.leakyrelu(x)
            x = self.l2(x)
            pred = self.sigmoid(x)
            return pred

    learning_rate = 0.001
    num_epochs = 100

    rgcn_dataset = RGCNDataset(edge_index_list,edge_types_list,node_features_list,labels_list)
    model = NNClassifier()
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    def _save_checkpoints():
        # Full classifier plus the encoder alone.
        # NOTE(review): the single-input pipeline loads
        # './MiFeMoDEP_SourceCode_CPG_Enc.pt'; confirm which name is intended.
        torch.save(model.state_dict(), "./MiFeMoDEP_SourceCode.pt")
        torch.save(model.rgcn_model.state_dict(), './MiFeMoDEP_SourceCode_PDG_Enc.pt')

    for epoch in range(num_epochs):
        start = time.time()
        for i in range(len(rgcn_dataset)):
            pred = model(node_features_list[i],edge_index_list[i].to(torch.int64),edge_types_list[i].to(torch.int64))
            loss = loss_fn(pred, torch.Tensor([labels_list[i]]))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Progress output and an intermediate checkpoint every 100 graphs.
            if i%100 == 0:
                print(i)
                _save_checkpoints()

        print(f"Epoch {epoch} --> {time.time()-start}")
        _save_checkpoints()