-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
341 additions
and
0 deletions.
There are no files selected for viewing
133 changes: 133 additions & 0 deletions
133
MiFeMoDEP/SourceCode/complete_pipeline_MiFeMoDEP_for_single_input.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
from transformers import AutoTokenizer, AutoModel | ||
from sklearn.preprocessing import MinMaxScaler | ||
from networkx.drawing.nx_pydot import read_dot | ||
from get_CodeBERT_embeddings import get_CodeBERT_context_embeddings | ||
from train_PCA_for_MiFeMoDEP import PCA_single_input | ||
from RGCN import RGCN | ||
from rgcn_training import get_graph_data | ||
from LIME_for_MiFeMoDEP import preprocess_feature_from_explainer, add_agg_scr_to_list | ||
import pickle | ||
import dill | ||
import subprocess | ||
import torch, os | ||
import numpy as np, pandas as pd | ||
|
||
# Source file analysed by this single-input pipeline.
filepath = './single.py'
# Read the whole file up front; the handle is closed as soon as the text is loaded.
with open(filepath, 'r') as source_file:
    code_string = source_file.read()

# get the CodeBERT embeddings and perform PCA on them
cb_tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base')
cb_model = AutoModel.from_pretrained('microsoft/codebert-base')
max_len = 10000
# NOTE: unpickling executes arbitrary code; only load a PCA file from a trusted source.
with open('./MiFeMoDEP_PCA.pkl', 'rb') as pca_file:
    pca = pickle.load(pca_file)
num_features = 1450000

cb_embeds = get_CodeBERT_context_embeddings(cb_tokenizer, cb_model, code_string, max_len)
# Flatten the embeddings into a single row before the PCA step.
cb_embeds = cb_embeds.detach().numpy().reshape(1, -1)
# presumably pads/truncates the row to num_features and applies the fitted PCA -- confirm in train_PCA_for_MiFeMoDEP
cb_embeds = PCA_single_input(pca, cb_embeds, num_features)
|
||
# extract the Code Property Graph using JOERN and encode it using an RGCN
in_channels = 150
hidden_channels = 100
out_channels = 50
num_relations = 21
truncate_size = 2000
RGCN_model = RGCN(in_channels, hidden_channels, out_channels, num_relations)
# The encoder output is converted with .numpy() further down, which requires CPU
# tensors, so map the checkpoint onto the CPU regardless of where it was saved.
RGCN_model_weights = torch.load('./MiFeMoDEP_SourceCode_CPG_Enc.pt', map_location='cpu')
RGCN_model.load_state_dict(RGCN_model_weights)
# The model is only used for inference here; switch off any training-only layers.
RGCN_model.eval()
|
||
# Prefix prepended (by plain string concatenation) to every graph output name.
graph_dir_path = ""


def extract_graph_from_file(filename, output_name):
    """Parse ``filename`` with Joern and export its graph representations.

    Runs ``joern-parse`` on ``filename`` and, only if that exits with status
    0, runs ``joern-export --repr=all`` with ``graph_dir_path + output_name``
    as the output location.

    Args:
        filename: Path of the source file handed to ``joern-parse``.
        output_name: Name appended to ``graph_dir_path`` for ``--out``.

    Returns:
        True when both commands exit with status 0, False otherwise, so the
        caller can tell whether an export was produced.
    """
    # Output is captured only to keep joern-parse's chatter off the console.
    parsed = subprocess.run(["joern-parse", filename], capture_output=True)
    if parsed.returncode != 0:
        return False

    exported = subprocess.run(
        ["joern-export", "--repr=all", "--out", graph_dir_path + output_name]
    )
    return exported.returncode == 0
|
||
|
||
# Derive both paths from the same prefix that extract_graph_from_file uses, so
# the cleanup, the export and the read all refer to one directory.
graph_out_dir = graph_dir_path + "test_graph"
graph_path = graph_out_dir + "/export.dot"

# Remove the output of a previous run (only the known export file, then the
# directory itself) -- presumably so joern-export can recreate it.
if os.path.isdir(graph_out_dir):
    if os.path.exists(graph_path):
        os.remove(graph_path)
    os.rmdir(graph_out_dir)

extract_graph_from_file(filepath, "test_graph")

# Fail with a clear message when Joern did not produce the graph.
if not os.path.exists(graph_path):
    raise FileNotFoundError(
        "Joern did not produce " + graph_path + " for " + filepath
    )

graph = read_dot(graph_path)
# includes Doc2Vec for node embedding
node_features, edge_index, edge_types = get_graph_data(graph, truncate_size, padding=True)
cpg_embeds = RGCN_model(
    node_features,
    edge_index.to(torch.int64),
    edge_types.to(torch.int64),
).detach().numpy().reshape(1, 128)
|
||
# combine the embeddings and pass it to an RF Classifier
# Two 128-wide embeddings (CodeBERT+PCA and CPG) are laid side by side in one row.
embeds = np.vstack((cb_embeds, cpg_embeds)).reshape(1, 128*2)
# NOTE: pickle/dill execute arbitrary code on load; only use trusted model files.
with open('./MiFeMoDEP_SourceCode_RF.pkl', 'rb') as clf_file:
    clf = pickle.load(clf_file)
y_pred = clf.predict(embeds)
if y_pred == 1:
    # use LIME explainer
    with open('LIME_for_MiFeMoDEP.pkl', 'rb') as explainer_file:
        explainer = dill.load(explainer_file)
    exp = explainer.explain_instance(
        embeds.reshape(256),
        clf.predict_proba,
        num_features=256,
        top_labels=1,
        num_samples=5000
    )

    top_k_tokens = np.arange(10, 201, 10)
    agg_methods = ['avg', 'median', 'sum']
    max_str_len_list = 100
    max_tokens = 100
    line_score_df_col_name = (
        ['total_tokens', 'line_level_label', 'line_num']
        + ['token'+str(i) for i in range(1, max_str_len_list+1)]
        + [agg+'-top-'+str(k)+'-tokens' for agg in agg_methods for k in top_k_tokens]
        + [agg+'-all-tokens' for agg in agg_methods]
    )

    line_score_df = pd.DataFrame(columns=line_score_df_col_name)
    line_score_df = line_score_df.set_index('line_num')

    sorted_feature_score_dict, tokens_list = preprocess_feature_from_explainer(exp)

    # The per-k token slices do not depend on the line, so build them once.
    # NOTE(review): [:k-1] keeps k-1 tokens for a column named "top-k"; this
    # looks like an off-by-one but is kept unchanged -- confirm the intent.
    top_tokens_by_k = [tokens_list[:k-1] for k in top_k_tokens]

    # str.splitlines() only yields str, so no NaN/float guard is needed here.
    code_lines = code_string.splitlines()
    for line_num, line in enumerate(code_lines):
        token_list = line.split()[:max_tokens]

        # Fixed-width score vector: one slot per token position, zero padded.
        line_score_list = np.zeros(max_tokens)
        for tok_idx, tok in enumerate(token_list):
            line_score_list[tok_idx] = sorted_feature_score_dict.get(tok, 0)

        # NOTE(review): the first two values are the line text and the token
        # count, which land in the columns named 'total_tokens' and
        # 'line_level_label' -- confirm the column naming.
        line_stuff = [line, len(token_list)] + list(line_score_list)

        for top_tokens in top_tokens_by_k:
            if not token_list:
                # Keep a single zero so the aggregates are defined for blank lines.
                top_k_scr_list = [0]
            else:
                top_k_scr_list = [
                    sorted_feature_score_dict.get(tok, 0) if tok in top_tokens else 0
                    for tok in token_list
                ]
            # presumably appends one value per aggregation method -- see LIME_for_MiFeMoDEP
            add_agg_scr_to_list(line_stuff, top_k_scr_list)

        add_agg_scr_to_list(line_stuff, list(line_score_list[:len(token_list)]))
        line_score_df.loc[line_num] = line_stuff

    line_score_df.to_csv('./single_df.csv')
    scr_df = line_score_df['median-all-tokens'].values.tolist()

    # Rescale the per-line median scores to [0, 1] before ranking.
    scaler = MinMaxScaler()
    line_score = scaler.fit_transform(np.array(scr_df).reshape(-1, 1))
    line_df = pd.DataFrame()
    line_df['scr'] = [float(val.item()) for val in line_score]
    # NOTE(review): ascending sort puts the LOWEST score first while the message
    # below says "most to least probable" -- confirm the score polarity.
    line_df = line_df.sort_values(by='scr', ascending=True)

    # The index is the 0-based line position; report 1-based line numbers.
    buggy_order = [idx + 1 for idx in line_df.index]
    print('The possible buggy lines in order of most to least probable:\n', buggy_order)

else:
    print('No buggy lines')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
import time | ||
import os, pickle | ||
import torch | ||
import torch.nn.functional as F | ||
import pandas as pd | ||
from torchvision import transforms | ||
import numpy as np | ||
from torch import nn | ||
from networkx.drawing.nx_pydot import read_dot | ||
|
||
from gensim.models.doc2vec import Doc2Vec | ||
|
||
from torch_geometric.data import Dataset,Data | ||
from torch_geometric.nn import FastRGCNConv, RGCNConv,SAGPooling | ||
from torch_geometric.utils import k_hop_subgraph | ||
|
||
# Define device for computations (CPU or GPU) | ||
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | ||
# print("device: ",device) | ||
|
||
path_to_doc2vec = "./" | ||
doc2vec_model = Doc2Vec.load(path_to_doc2vec+"doc2vec_model_nodes_from_graphs.bin") | ||
|
||
# Known edge labels; a label's position in this list is its integer relation id.
e_types = ['RECEIVER', 'CDG', 'CFG', 'CONDITION', 'BINDS', 'REACHING_DEF', 'PARAMETER_LINK', 'IS_CALL_FOR_IMPORT', 'POST_DOMINATE', 'AST', 'CALL', 'REF', 'CONTAINS', 'INHERITS_FROM', 'TAGGED_BY', 'SOURCE_FILE', 'ARGUMENT', 'CAPTURE', 'DOMINATE', 'EVAL_TYPE']


def get_edge_num(edge_type):
    """Return the integer relation id for an edge label.

    Args:
        edge_type: Edge label to look up in ``e_types``.

    Returns:
        The position of ``edge_type`` in ``e_types``; ``len(e_types)`` (the
        shared "unknown" id, currently 20) when the label is not listed.
    """
    try:
        return e_types.index(edge_type)
    except ValueError:
        # Derived from the list so the fallback id stays in sync if labels are added.
        return len(e_types)
|
||
def get_graph_data(graph, truncate_size, padding=False):
    """Convert a graph into node-feature, edge-index and edge-type tensors.

    The first ``truncate_size`` nodes (in iteration order) are kept and each
    is embedded by running ``doc2vec_model.infer_vector`` on the string form
    of its attribute dict. Edges are kept only when both endpoints were kept,
    and are re-indexed to the row positions of those nodes.

    Args:
        graph: Graph exposing ``nodes(data=True)`` and ``edges(data=True)``;
            every kept edge's data dict must contain a ``'label'`` key.
        truncate_size: Maximum number of nodes to keep.
        padding: When true, zero rows are appended so the node-feature
            tensor has exactly ``truncate_size`` rows.

    Returns:
        ``(nodes_features, edge_idx, edge_type)``: the float node-feature
        tensor, an int64 tensor with one row of source indices and one row
        of target indices, and an int64 tensor of relation ids per edge.

    Raises:
        KeyError: If a kept edge has no ``'label'`` attribute.
    """
    # Assign each kept node a dense row index and embed its attributes.
    mapping = {}
    node_vectors = []
    for row, (node_id, attrs) in enumerate(graph.nodes(data=True)):
        if row >= truncate_size:
            break
        mapping[node_id] = row
        node_vectors.append(doc2vec_model.infer_vector([str(attrs)]))
    num_nodes = len(node_vectors)
    nodes_features = torch.Tensor(np.array(node_vectors))

    # Keep only edges whose endpoints both survived truncation.
    sources = []
    targets = []
    relation_ids = []
    for src, dst, attrs in graph.edges(data=True):
        if src not in mapping or dst not in mapping:
            continue
        sources.append(mapping[src])
        targets.append(mapping[dst])
        relation_ids.append(get_edge_num(attrs['label']))
    # Build integer tensors directly rather than round-tripping through float32.
    edge_idx = torch.tensor([sources, targets], dtype=torch.int64)
    edge_type = torch.tensor(relation_ids, dtype=torch.int64)

    if padding and num_nodes < truncate_size:
        # Pad in one allocation instead of one torch.cat per missing row.
        # Width follows the embedded nodes; 150 is used when there are none.
        feature_dim = nodes_features.shape[1] if num_nodes else 150
        pad_rows = torch.zeros(
            (truncate_size - num_nodes, feature_dim), dtype=nodes_features.dtype
        )
        if num_nodes:
            nodes_features = torch.cat((nodes_features, pad_rows), 0)
        else:
            nodes_features = pad_rows

    return nodes_features, edge_idx, edge_type
|
||
if __name__ == "__main__": | ||
|
||
truncate_size = 2000 | ||
|
||
class RGCNDataset(Dataset):
    """Dataset wrapper over four parallel per-graph lists.

    Item ``idx`` is the tuple
    ``(edge_index, edge_types, node_features, labels)`` taken from the same
    position of each list.
    """

    def __init__(self, edge_index_list=None, edge_types_list=None,
                 node_features_list=None, labels_list=None):
        """Store the per-graph lists.

        Each argument defaults to a fresh empty list; ``None`` is used as the
        default so instances never share one mutable default list.
        """
        super().__init__()
        self.edge_index_list = [] if edge_index_list is None else edge_index_list
        self.node_features_list = [] if node_features_list is None else node_features_list
        self.edge_types_list = [] if edge_types_list is None else edge_types_list
        self.labels_list = [] if labels_list is None else labels_list

    def __len__(self):
        """Return the number of graphs (length of the edge-index list)."""
        return len(self.edge_index_list)

    def __getitem__(self, idx):
        """Return ``(edge_index, edge_types, node_features, labels)`` for ``idx``."""
        return (
            self.edge_index_list[idx],
            self.edge_types_list[idx],
            self.node_features_list[idx],
            self.labels_list[idx],
        )
|
||
# RGCN layer sizes: input feature width, hidden width and output width.
in_channels, hidden_channels, out_channels = 150, 100, 50
# Number of edge relation ids passed to the RGCN layers.
num_relations = 21
# Node count every graph is truncated/padded to.
truncate_size = 2000

# Four independent 500-slot placeholder lists, filled by graph index later on.
edge_index_list, edge_types_list, node_features_list, labels_list = (
    [0] * 500 for _ in range(4)
)

# Table that supplies the 'target' label of each graph.
test_df = pd.read_csv("../Documents/cs21b059/MiFeMoDEP/balanced_500_test_source_code.csv")
|
||
def get_node_and_edge_encodings():
    """Encode every graph under ``./preprocess_graphs_500/`` and pickle the results.

    Each graph file is expected to live in a sub-directory whose name is the
    integer index of the graph. For every such file the graph is read with
    ``read_dot``, converted with ``get_graph_data`` and stored, together with
    ``test_df['target'][index]``, at that index of the module-level result
    lists. The four lists are then written to ``test_*.pkl`` files and the
    number of processed files is printed.

    Files that are not inside an integer-named directory (for example files
    lying directly in the root directory) are reported and skipped.
    """
    root_dir_graphs = "./preprocess_graphs_500/"
    count = 0
    for root, _dirs, files in os.walk(root_dir_graphs):
        if not files:
            continue

        # The directory name relative to the root is the graph index; it is
        # the same for every file in this directory, so resolve it once.
        name = os.path.relpath(root, root_dir_graphs)
        try:
            i = int(name)
        except ValueError:
            print("Skipping files outside an index-named directory: ", root)
            continue

        for gfile in files:
            start = time.time()
            path = os.path.join(root, gfile)
            graph = read_dot(path)
            node_features, edge_index, edge_types = get_graph_data(
                graph, truncate_size, padding=True
            )
            label = test_df['target'][i]
            edge_index_list[i] = edge_index
            node_features_list[i] = node_features
            edge_types_list[i] = edge_types
            labels_list[i] = label
            end = time.time()
            print("Time taken for one graph: ", end-start, "sec")
            count += 1

    with open('test_edge_index.pkl', 'wb') as f:
        pickle.dump(edge_index_list, f)

    with open('test_edge_types_list.pkl', 'wb') as f:
        pickle.dump(edge_types_list, f)

    with open('test_node_features_list.pkl', 'wb') as f:
        pickle.dump(node_features_list, f)

    with open('test_labels_list.pkl', 'wb') as f:
        pickle.dump(labels_list, f)

    print(count)
|
||
def train_RGCN():
    """Train the RGCN graph encoder with a small binary classifier head.

    Loads the pickled per-graph edge indices, edge types, node features and
    labels from the current directory, then trains ``NNClassifier`` (the
    RGCN encoder followed by two linear layers and a sigmoid) one graph at a
    time with binary cross-entropy and Adam. The full model and the encoder
    alone are checkpointed every 100 graphs and at the end of every epoch.
    """
    def _load_pickle(path):
        # Close the file handle as soon as the object is loaded.
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(path, 'rb') as f:
            return pickle.load(f)

    edge_index_list = _load_pickle('./edge_index.pkl')
    edge_types_list = _load_pickle('./edge_types_list.pkl')
    node_features_list = _load_pickle('./node_features_list.pkl')
    labels_list = _load_pickle('./labels_list.pkl')

    class rgcn_2000_nodes(torch.nn.Module):
        """RGCN -> SAG pooling -> RGCN -> linear projection to 128 values."""

        def __init__(self, in_channels, hidden_channels, out_channels, num_relations):
            super().__init__()
            self.in_channels = in_channels
            self.conv1 = RGCNConv(in_channels, hidden_channels, num_relations)
            self.sag_pool = SAGPooling(hidden_channels, ratio=0.8)
            self.conv2 = RGCNConv(hidden_channels, out_channels, num_relations)
            # presumably 1600 = 0.8 * 2000 nodes left after pooling -- this ties
            # the model to graphs padded to 2000 nodes; confirm before changing.
            self.lin = nn.Linear(out_channels*1600, 128)

        def forward(self, x, edge_index, edge_type):
            x = self.conv1(x, edge_index, edge_type)
            # The edge types travel through the pooling layer as edge attributes.
            x, edge_index, edge_type, _, _, _ = self.sag_pool(x, edge_index, edge_type)
            x = self.conv2(x, edge_index, edge_type)
            # Flatten all node embeddings into one vector for the linear layer.
            x = x.view(x.size(0)*x.size(1))
            x = self.lin(x)
            return x

    class NNClassifier(nn.Module):
        """Encoder plus a 128 -> 64 -> 1 head ending in a sigmoid."""

        def __init__(self):
            super().__init__()

            self.l1 = nn.Linear(128, 64)
            self.l2 = nn.Linear(64, 1)
            self.leakyrelu = nn.LeakyReLU()
            self.sigmoid = nn.Sigmoid()
            # Hyper-parameters come from the enclosing module scope.
            self.rgcn_model = rgcn_2000_nodes(in_channels, hidden_channels, out_channels, num_relations)

        def forward(self, node_features, edge_index, edge_types):
            x = self.rgcn_model(node_features, edge_index, edge_types)
            x = self.l1(x)
            x = self.leakyrelu(x)
            x = self.l2(x)
            pred = self.sigmoid(x)
            return pred

    learning_rate = 0.001
    num_epochs = 100

    rgcn_dataset = RGCNDataset(edge_index_list, edge_types_list, node_features_list, labels_list)
    model = NNClassifier()
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # NOTE(review): the single-input pipeline loads
    # './MiFeMoDEP_SourceCode_CPG_Enc.pt' while the encoder is saved here as
    # './MiFeMoDEP_SourceCode_PDG_Enc.pt' -- confirm which name is intended.
    for epoch in range(num_epochs):
        start = time.time()
        for i in range(len(rgcn_dataset)):
            pred = model(
                node_features_list[i],
                edge_index_list[i].to(torch.int64),
                edge_types_list[i].to(torch.int64),
            )
            loss = loss_fn(pred, torch.Tensor([labels_list[i]]))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Periodic checkpoint so a long epoch can be resumed or inspected.
            if i % 100 == 0:
                print(i)
                torch.save(model.state_dict(), "./MiFeMoDEP_SourceCode.pt")
                torch.save(model.rgcn_model.state_dict(), './MiFeMoDEP_SourceCode_PDG_Enc.pt')

        print(f"Epoch {epoch} --> {time.time()-start}")
        torch.save(model.state_dict(), "./MiFeMoDEP_SourceCode.pt")
        torch.save(model.rgcn_model.state_dict(), './MiFeMoDEP_SourceCode_PDG_Enc.pt')