Skip to content

Commit

Permalink
Added graph data creation and doc2vec training
Browse files Browse the repository at this point in the history
  • Loading branch information
CS21B008 committed Apr 26, 2024
1 parent 7e4e44c commit 69897df
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 0 deletions.
22 changes: 22 additions & 0 deletions MiFeMoDEP/SourceCode/doc2vec_training_for_nodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import pandas as pd
import time
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Location of the node-attribute CSV produced by node_data_creation.py.
path_to_node_data = "./"
node_df = pd.read_csv(path_to_node_data+"nodes_all_graphs.csv")

# Build the tagged corpus for Doc2Vec.
# NOTE(review): only the first `num_docs` rows are used -- looks like a
# debugging limit; raise it to len(node_df) for a full training run.
num_docs = 42
start = time.time()
# BUG FIX: each `dictionary` cell is a string; it must be tokenized into a
# list of words, otherwise gensim treats every single character as a token.
tagged_data = [
    TaggedDocument(words=_d.split(), tags=[i])
    for i, _d in enumerate(node_df['dictionary'][:num_docs])
]
end = time.time()

print(f'Created tagged data in {end-start} seconds')

# Train a Doc2Vec model over the node-attribute documents.
epochs = 100

d2v_model = Doc2Vec(vector_size=150, window=5, min_count=2, epochs=epochs)
d2v_model.build_vocab(tagged_data)
d2v_model.train(tagged_data, total_examples=d2v_model.corpus_count, epochs=epochs)
# NOTE(review): "nodel" looks like a typo for "model"; kept as-is because
# downstream loaders may reference this exact path.
d2v_model.save("./doc2vec_nodel_nodes_from_graphs.bin")
32 changes: 32 additions & 0 deletions MiFeMoDEP/SourceCode/node_data_creation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pandas as pd
import time
import networkx as nx
from networkx.drawing.nx_pydot import read_dot,write_dot

import os

# Root folder holding one Joern graph export per preprocessed source file.
path_to = "./"
graph_root_dir = "Preprocessing_joern/preprocess_graphs_10000/"

# Column accumulators for the output dataframe: one row per graph node.
file_path_list = []
node_num_list = []
dictionary_list = []

def write_to_df(path):
    """Append every node of the DOT graph under *path* to the accumulators.

    Reads <path>/export.dot and records, per node, the export directory,
    the node identifier, and the stringified attribute dictionary.
    """
    graph = read_dot(path+"/export.dot")
    for node_id, attrs in graph.nodes(data=True):
        file_path_list.append(path)
        node_num_list.append(node_id)
        dictionary_list.append(str(attrs))

# Walk every exported graph directory and collect its nodes.
# BUG FIX: the original invoked write_to_df once per *file* found under a
# directory (via os.path.dirname), so a directory containing several files
# had its export.dot ingested multiple times, duplicating every node row.
# Process each export.dot exactly once instead.
for root, dirs, files in os.walk(path_to+graph_root_dir):
    if "export.dot" in files:
        start = time.time()
        write_to_df(root)
        end = time.time()
        print("time: ", end-start)

# One row per node across all graphs; written next to this script.
df = pd.DataFrame({'file_path': file_path_list, 'node_num': node_num_list, 'dictionary': dictionary_list})
df.to_csv('nodes_all_graphs.csv')
52 changes: 52 additions & 0 deletions MiFeMoDEP/SourceCode/preprocessing_joern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import subprocess
import pandas as pd

# #loading train dataframe
# Paths for intermediate artifacts: raw source files and Joern graph exports.
path_to_line_random = '.'
code_files_root = "./Preprocessing_joern_test/preprocess_files_500/"
graph_files_root = "./Preprocessing_joern_test/preprocess_graphs_500/"

# Kept for provenance: the one-off sampling that produced the balanced CSV
# loaded below.
# train_df = pd.read_parquet(f'{path_to_line_random}/train.parquet.gzip')
# train_df = train_df.reset_index(drop=True)

# train_df['target'] = train_df['lines'].apply(lambda line : 0 if len(line) == 0 else 1)

# train_df_1 = train_df[train_df['target'] == 1].sample(600, random_state=42)
# train_df_0 = train_df[train_df['target'] == 0].sample(600, random_state=42)
# print(len(train_df_0))
# print(len(train_df_1))

# # Combine the DataFrames
# train_df = pd.concat([train_df_1, train_df_0], ignore_index=True)
# train_df['target'] = train_df['lines'].apply(lambda line : 0 if len(line) == 0 else 1)
# content_list = train_df['content']
# train_df.drop('content',axis=1,inplace=True)
# train_df.to_csv("random_1200_balanced.csv")
# train_df['content'] = content_list.apply(lambda x : '' if x is None else x.decode("latin-1"))

train_df = pd.read_csv("balanced_500_test_source_code.csv")
# Normalize `content` to a string: read_csv yields NaN (a float) for empty
# cells. This single apply replaces two chained .apply calls that each
# handled one case (None, then any float) with identical behavior.
train_df["content"] = train_df["content"].apply(
    lambda x: '' if x is None or isinstance(x, float) else x
)

# NOTE(review): debugging leftover -- prints one hard-coded row.
print(train_df["content"][327])

def extract_graph_from_file(filename, output_name):
    """Run joern-parse on *filename* and, on success, export the graph.

    The export is written under graph_files_root/<output_name>. Requires the
    `joern-parse` / `joern-export` CLIs on PATH; joern-export reads the
    cpg.bin that joern-parse leaves in the current working directory.
    (Removed an unused decode of stdout and a needless `global` statement --
    reading a module-level name needs no global declaration.)
    """
    result = subprocess.run(["joern-parse", filename], capture_output=True)
    # Only export when parsing succeeded.
    if result.returncode == 0:
        subprocess.run(["joern-export", "--repr=all", "--out", graph_files_root+output_name])

def write_to_file(row, index):
    """Dump a sample's source `content` to <code_files_root>/<index>.py."""
    filename = str(index)
    content = row['content']
    # The with-statement closes the file; the original also called
    # f.close() redundantly inside the block.
    with open(code_files_root+filename+".py", "w") as f:
        f.write(content)

# Pass 1: materialize every sample's source text on disk.
for idx, sample in train_df.iterrows():
    write_to_file(sample, idx)

# Pass 2: run Joern over each file that was just written.
for idx in train_df.index:
    stem = str(idx)
    extract_graph_from_file(code_files_root+stem+".py", stem)

0 comments on commit 69897df

Please sign in to comment.