Added graph data creation and doc2vec training
Showing 3 changed files with 106 additions and 0 deletions.
@@ -0,0 +1,22 @@
import pandas as pd
import time
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

path_to_node_data = "./"
node_df = pd.read_csv(path_to_node_data + "nodes_all_graphs.csv")

# Build the tagged corpus for Doc2Vec: tokenize each node's stringified
# attribute dict (a raw string would be iterated character by character)
# and tag it with its row index
num_docs = 42
start = time.time()
tagged_data = [TaggedDocument(words=_d.split(), tags=[i]) for i, _d in enumerate(node_df['dictionary'][:num_docs])]
end = time.time()

print(f'Created tagged data in {end - start} seconds')

# Train a Doc2Vec model
epochs = 100

d2v_model = Doc2Vec(vector_size=150, window=5, min_count=2, epochs=epochs)
d2v_model.build_vocab(tagged_data)
d2v_model.train(tagged_data, total_examples=d2v_model.corpus_count, epochs=epochs)
d2v_model.save("./doc2vec_nodel_nodes_from_graphs.bin")
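Not part of the commit, but as a quick sanity check the saved model can be loaded back and queried; the node-attribute string below is a made-up example and assumes the same whitespace tokenization used during training:

from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load("./doc2vec_nodel_nodes_from_graphs.bin")

# Infer an embedding for an unseen node-attribute string (hypothetical input)
tokens = "{'label': 'METHOD', 'CODE': 'main'}".split()
vector = model.infer_vector(tokens)
print(vector.shape)  # (150,), matching vector_size above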
@@ -0,0 +1,32 @@
import pandas as pd
import time
import networkx as nx
from networkx.drawing.nx_pydot import read_dot, write_dot

import os

path_to = "./"
graph_root_dir = "Preprocessing_joern/preprocess_graphs_10000/"
file_path_list = []
node_num_list = []
dictionary_list = []

def write_to_df(path):
    # Collect every node of the exported graph: read_dot yields
    # (node_id, attribute_dict) pairs, stored here as flat columns
    G = read_dot(path + "/export.dot")
    nodes = G.nodes(data=True)
    for node in nodes:
        num, dic = node
        file_path_list.append(path)
        node_num_list.append(num)
        dictionary_list.append(str(dic))

for root, dirs, files in os.walk(path_to + graph_root_dir):
    for file in files:
        # Process each graph directory exactly once, via its export.dot;
        # walking all files would re-read the same graph once per sibling file
        if file != "export.dot":
            continue
        path = os.path.join(root, file)
        start = time.time()
        write_to_df(os.path.dirname(path))
        end = time.time()
        print("time: ", end - start)

df = pd.DataFrame({'file_path': file_path_list, 'node_num': node_num_list, 'dictionary': dictionary_list})
df.to_csv('nodes_all_graphs.csv')
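For reference, each row of nodes_all_graphs.csv comes from one (node_id, attribute_dict) pair. A quick way to see the raw shape of that data (the graph directory below is hypothetical, and read_dot requires pydot to be installed):

from networkx.drawing.nx_pydot import read_dot

G = read_dot("Preprocessing_joern/preprocess_graphs_10000/0/export.dot")  # hypothetical graph dir
for node_id, attrs in list(G.nodes(data=True))[:3]:
    # attrs is the dict that gets stringified into the 'dictionary' column
    print(node_id, attrs)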
@@ -0,0 +1,52 @@
import subprocess
import pandas as pd

# Paths to the source files written below and to the joern graph output
path_to_line_random = '.'
code_files_root = "./Preprocessing_joern_test/preprocess_files_500/"
graph_files_root = "./Preprocessing_joern_test/preprocess_graphs_500/"

# Kept for reference: how the balanced training sample was originally built
# train_df = pd.read_parquet(f'{path_to_line_random}/train.parquet.gzip')
# train_df = train_df.reset_index(drop=True)

# train_df['target'] = train_df['lines'].apply(lambda line: 0 if len(line) == 0 else 1)

# train_df_1 = train_df[train_df['target'] == 1].sample(600, random_state=42)
# train_df_0 = train_df[train_df['target'] == 0].sample(600, random_state=42)
# print(len(train_df_0))
# print(len(train_df_1))

# # Combine the DataFrames
# train_df = pd.concat([train_df_1, train_df_0], ignore_index=True)
# train_df['target'] = train_df['lines'].apply(lambda line: 0 if len(line) == 0 else 1)
# content_list = train_df['content']
# train_df.drop('content', axis=1, inplace=True)
# train_df.to_csv("random_1200_balanced.csv")
# train_df['content'] = content_list.apply(lambda x: '' if x is None else x.decode("latin-1"))

train_df = pd.read_csv("balanced_500_test_source_code.csv")
# Normalize missing content (None, or NaN floats from the CSV round-trip) to empty strings
train_df['content'] = train_df['content'].apply(lambda x: '' if x is None else x)
train_df['content'] = train_df['content'].apply(lambda x: '' if type(x) == float else x)

# Spot-check one row
print(train_df['content'][327])

def extract_graph_from_file(filename, output_name):
    # joern-parse writes cpg.bin to the working directory by default,
    # which joern-export then reads and dumps as .dot files
    result = subprocess.run(["joern-parse", filename], capture_output=True)
    if result.returncode == 0:
        subprocess.run(["joern-export", "--repr=all", "--out", graph_files_root + output_name])

def write_to_file(row, index):
    filename = str(index)
    content = row['content']
    with open(code_files_root + filename + ".py", "w") as f:
        f.write(content)

# First materialize every row as a .py file, then run joern on each one
for index, row in train_df.iterrows():
    write_to_file(row, index)

for index, row in train_df.iterrows():
    name = str(index)
    extract_graph_from_file(code_files_root + name + ".py", name)
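A minimal single-file smoke test before looping over the full dataframe; this assumes the joern binaries are on PATH, the flags mirror the script above, and the index-0 file path is just an example:

import subprocess

result = subprocess.run(["joern-parse", "./Preprocessing_joern_test/preprocess_files_500/0.py"],
                        capture_output=True)
print(result.returncode, result.stderr.decode()[:200])
if result.returncode == 0:
    subprocess.run(["joern-export", "--repr=all", "--out",
                    "./Preprocessing_joern_test/preprocess_graphs_500/0"])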