Added graph data creation and doc2vec training
Showing 3 changed files with 106 additions and 0 deletions.
@@ -0,0 +1,22 @@
import pandas as pd
import time
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

path_to_node_data = "./"
node_df = pd.read_csv(path_to_node_data + "nodes_all_graphs.csv")

# Build the tagged corpus for Doc2Vec: tokenize each node's stringified
# attribute dict (a raw string would be iterated character by character)
# and tag it with its row index
num_docs = 42
start = time.time()
tagged_data = [TaggedDocument(words=_d.split(), tags=[i]) for i, _d in enumerate(node_df['dictionary'][:num_docs])]
end = time.time()

print(f'Created tagged data in {end - start} seconds')

# Train a Doc2Vec model
epochs = 100

d2v_model = Doc2Vec(vector_size=150, window=5, min_count=2, epochs=epochs)
d2v_model.build_vocab(tagged_data)
d2v_model.train(tagged_data, total_examples=d2v_model.corpus_count, epochs=epochs)
d2v_model.save("./doc2vec_nodel_nodes_from_graphs.bin")
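Not part of the commit, but as a quick sanity check the saved model can be loaded back and queried; the node-attribute string below is a made-up example and assumes the same whitespace tokenization used during training:

from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load("./doc2vec_nodel_nodes_from_graphs.bin")

# Infer an embedding for an unseen node-attribute string (hypothetical input)
tokens = "{'label': 'METHOD', 'CODE': 'main'}".split()
vector = model.infer_vector(tokens)
print(vector.shape)  # (150,), matching vector_size above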
@@ -0,0 +1,32 @@
import pandas as pd
import time
import networkx as nx
from networkx.drawing.nx_pydot import read_dot, write_dot

import os

path_to = "./"
graph_root_dir = "Preprocessing_joern/preprocess_graphs_10000/"
file_path_list = []
node_num_list = []
dictionary_list = []

def write_to_df(path):
    # Collect every node of the exported graph: read_dot yields
    # (node_id, attribute_dict) pairs, stored here as flat columns
    G = read_dot(path + "/export.dot")
    nodes = G.nodes(data=True)
    for node in nodes:
        num, dic = node
        file_path_list.append(path)
        node_num_list.append(num)
        dictionary_list.append(str(dic))

for root, dirs, files in os.walk(path_to + graph_root_dir):
    for file in files:
        # Process each graph directory exactly once, via its export.dot;
        # walking all files would re-read the same graph once per sibling file
        if file != "export.dot":
            continue
        path = os.path.join(root, file)
        start = time.time()
        write_to_df(os.path.dirname(path))
        end = time.time()
        print("time: ", end - start)

df = pd.DataFrame({'file_path': file_path_list, 'node_num': node_num_list, 'dictionary': dictionary_list})
df.to_csv('nodes_all_graphs.csv')
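For reference, each row of nodes_all_graphs.csv comes from one (node_id, attribute_dict) pair. A quick way to see the raw shape of that data (the graph directory below is hypothetical, and read_dot requires pydot to be installed):

from networkx.drawing.nx_pydot import read_dot

G = read_dot("Preprocessing_joern/preprocess_graphs_10000/0/export.dot")  # hypothetical graph dir
for node_id, attrs in list(G.nodes(data=True))[:3]:
    # attrs is the dict that gets stringified into the 'dictionary' column
    print(node_id, attrs)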
@@ -0,0 +1,52 @@
import subprocess
import pandas as pd

# Paths to the source files written below and to the joern graph output
path_to_line_random = '.'
code_files_root = "./Preprocessing_joern_test/preprocess_files_500/"
graph_files_root = "./Preprocessing_joern_test/preprocess_graphs_500/"

# Kept for reference: how the balanced training sample was originally built
# train_df = pd.read_parquet(f'{path_to_line_random}/train.parquet.gzip')
# train_df = train_df.reset_index(drop=True)

# train_df['target'] = train_df['lines'].apply(lambda line: 0 if len(line) == 0 else 1)

# train_df_1 = train_df[train_df['target'] == 1].sample(600, random_state=42)
# train_df_0 = train_df[train_df['target'] == 0].sample(600, random_state=42)
# print(len(train_df_0))
# print(len(train_df_1))

# # Combine the DataFrames
# train_df = pd.concat([train_df_1, train_df_0], ignore_index=True)
# train_df['target'] = train_df['lines'].apply(lambda line: 0 if len(line) == 0 else 1)
# content_list = train_df['content']
# train_df.drop('content', axis=1, inplace=True)
# train_df.to_csv("random_1200_balanced.csv")
# train_df['content'] = content_list.apply(lambda x: '' if x is None else x.decode("latin-1"))

train_df = pd.read_csv("balanced_500_test_source_code.csv")
# Normalize missing content (None, or NaN floats from the CSV round-trip) to empty strings
train_df['content'] = train_df['content'].apply(lambda x: '' if x is None else x)
train_df['content'] = train_df['content'].apply(lambda x: '' if type(x) == float else x)

# Spot-check one row
print(train_df['content'][327])

def extract_graph_from_file(filename, output_name):
    # joern-parse writes cpg.bin to the working directory by default,
    # which joern-export then reads and dumps as .dot files
    result = subprocess.run(["joern-parse", filename], capture_output=True)
    if result.returncode == 0:
        subprocess.run(["joern-export", "--repr=all", "--out", graph_files_root + output_name])

def write_to_file(row, index):
    filename = str(index)
    content = row['content']
    with open(code_files_root + filename + ".py", "w") as f:
        f.write(content)

# First materialize every row as a .py file, then run joern on each one
for index, row in train_df.iterrows():
    write_to_file(row, index)

for index, row in train_df.iterrows():
    name = str(index)
    extract_graph_from_file(code_files_root + name + ".py", name)
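A minimal single-file smoke test before looping over the full dataframe; this assumes the joern binaries are on PATH, the flags mirror the script above, and the index-0 file path is just an example:

import subprocess

result = subprocess.run(["joern-parse", "./Preprocessing_joern_test/preprocess_files_500/0.py"],
                        capture_output=True)
print(result.returncode, result.stderr.decode()[:200])
if result.returncode == 0:
    subprocess.run(["joern-export", "--repr=all", "--out",
                    "./Preprocessing_joern_test/preprocess_graphs_500/0"])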