forked from CC2Vec/CC2Vec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjit_padding.py
137 lines (127 loc) · 5.03 KB
/
jit_padding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import numpy as np
def padding_commit_file(data, max_file, max_line, max_length):
    """Force every commit in ``data`` to contain exactly ``max_file`` files.

    A commit with too many files is cut down to its first ``max_file``
    entries.  A commit with too few files is extended with placeholder
    files, each made of ``max_line`` lines that consist of ``max_length``
    space-separated ``<NULL>`` tokens.  A commit that already has
    ``max_file`` files is passed through as the same object.

    Returns a new outer list; the commits in ``data`` are not modified.
    """
    padded_commits = []
    for commit in data:
        missing = max_file - len(commit)
        if missing > 0:
            # Each placeholder file is its own list object.
            filler = [
                [('<NULL> ' * max_length).strip() for _ in range(max_line)]
                for _ in range(missing)
            ]
            padded_commits.append(commit + filler)
        elif missing < 0:
            padded_commits.append(commit[:max_file])
        else:
            padded_commits.append(commit)
    return padded_commits
def padding_commit_code_line(data, max_line, max_length):
    """Force every file of every commit in ``data`` to have ``max_line`` lines.

    A file with more than ``max_line`` lines is truncated to its first
    ``max_line`` lines.  A file with fewer lines is extended with
    placeholder lines made of ``max_length`` space-separated ``<NULL>``
    tokens.  A file that already has ``max_line`` lines is passed through
    as the same object.

    The input is never mutated: padding produces a new list rather than
    appending to the caller's file list.

    Returns a list of commits, each a list of files, each a sequence of
    line strings.
    """
    new_commits = []
    for commit in data:
        new_files = []
        for file in commit:
            missing = max_line - len(file)
            if missing > 0:
                null_line = ('<NULL> ' * max_length).strip()
                # Build a fresh list so the caller's data stays untouched.
                new_file = list(file) + [null_line] * missing
            elif missing < 0:
                new_file = file[:max_line]
            else:
                new_file = file
            new_files.append(new_file)
        new_commits.append(new_files)
    return new_commits
def padding_commit_code_length(data, max_length):
    """Apply ``padding_length`` to every line of every file of every commit.

    ``data`` is iterated as commits -> files -> lines.  The nesting is
    preserved in the returned list of lists of lists; only the individual
    line strings are replaced by their padded/truncated versions.
    """
    return [
        [
            [padding_length(code_line, max_length=max_length) for code_line in code_file]
            for code_file in commit
        ]
        for commit in data
    ]
def padding_length(line, max_length):
    """Return ``line`` padded or truncated to exactly ``max_length`` tokens.

    Tokens are obtained with ``str.split()`` (any whitespace run is a
    separator).  Extra tokens beyond ``max_length`` are dropped; missing
    tokens are filled with ``<NULL>``.  The result is always joined with
    single spaces, so splitting it on ``' '`` yields exactly ``max_length``
    items even when the input contains tabs, repeated spaces or
    leading/trailing whitespace.
    """
    # Clamp at zero so a negative max_length gives '' instead of slicing
    # from the end of the token list.
    tokens = line.split()[:max(max_length, 0)]
    tokens.extend(['<NULL>'] * (max_length - len(tokens)))
    return ' '.join(tokens)
def convert_msg_to_label(pad_msg, dict_msg):
    """Build one multi-hot vector of length ``len(dict_msg)`` per row of ``pad_msg``.

    ``pad_msg`` must expose a two-element ``shape``.  Each distinct value
    found in a row is used as an index into that row's vector and set to 1;
    every other position stays 0.
    NOTE(review): presumably ``pad_msg`` holds integer token ids produced
    from ``dict_msg`` -- confirm against the caller.

    Returns the per-row vectors wrapped in a single ``np.array``.
    """
    n_rows, _ = pad_msg.shape
    vocab_size = len(dict_msg)
    multi_hot_rows = []
    for row_idx in range(n_rows):
        multi_hot = np.zeros(vocab_size)
        # De-duplicate first so each index is written once.
        for token_id in set(pad_msg[row_idx, :]):
            multi_hot[token_id] = 1
        multi_hot_rows.append(multi_hot)
    return np.array(multi_hot_rows)
def mapping_dict_msg(pad_msg, dict_msg):
    """Translate each message in ``pad_msg`` into an array of dictionary values.

    Every message is split on single spaces.  A word is looked up in
    ``dict_msg`` by its lower-cased form; the literal token ``<NULL>`` and
    any word whose lower-cased form is not a key are mapped to
    ``dict_msg['<NULL>']`` instead.

    Returns an ``np.array`` built from one ``np.array`` per message.
    """
    def lookup(word):
        key = word.lower()
        if word == '<NULL>' or key not in dict_msg:
            return dict_msg['<NULL>']
        return dict_msg[key]

    encoded = []
    for message in pad_msg:
        encoded.append(np.array([lookup(word) for word in message.split(' ')]))
    return np.array(encoded)
def mapping_dict_code(pad_code, dict_code):
    """Translate padded code (commits -> files -> lines) into dictionary values.

    Each line is split on single spaces.  A token is looked up in
    ``dict_code`` by its lower-cased form; the literal token ``<NULL>`` and
    any token whose lower-cased form is not a key are mapped to
    ``dict_code['<NULL>']``.

    Every nesting level (line, file, commit, whole input) is wrapped in its
    own ``np.array`` before being placed in the enclosing level.
    """
    def encode(token):
        lowered = token.lower()
        if token != '<NULL>' and lowered in dict_code:
            return dict_code[lowered]
        return dict_code['<NULL>']

    encoded_commits = []
    for commit in pad_code:
        encoded_files = [
            np.array([
                np.array([encode(token) for token in code_line.split(' ')])
                for code_line in code_file
            ])
            for code_file in commit
        ]
        encoded_commits.append(np.array(encoded_files))
    return np.array(encoded_commits)
def padding_commit_code(data, max_file, max_line, max_length):
    """Run the three padding stages over ``data`` and return the result.

    Stages, in order: token count per line (``padding_commit_code_length``),
    line count per file (``padding_commit_code_line``), then file count per
    commit (``padding_commit_file``).
    """
    length_padded = padding_commit_code_length(data=data, max_length=max_length)
    line_padded = padding_commit_code_line(
        length_padded, max_line=max_line, max_length=max_length
    )
    return padding_commit_file(
        data=line_padded, max_file=max_file, max_line=max_line, max_length=max_length
    )
def _clean_code_lines(lines):
    """Drop blank lines and collapse every whitespace run in the rest to one space."""
    return [" ".join(line.split()) for line in lines if line.strip()]


def clean_and_reformat_code(data):
    """Split diffs into cleaned added-code and removed-code structures.

    ``data`` is iterated as diffs -> files, where each file is indexed with
    the keys ``'added_code'`` and ``'removed_code'`` to obtain its lines.
    Lines that are empty or whitespace-only are removed; in the remaining
    lines every run of whitespace (including newlines) becomes one space.

    Returns a tuple ``(added, removed)``; each element is a list of diffs,
    each diff a list of files, each file a list of cleaned line strings.

    Raises ``KeyError`` if a file lacks one of the two keys.
    """
    new_diff_added_code, new_diff_removed_code = [], []
    # One pass over ``data`` so that one-shot iterables (e.g. generators)
    # populate both outputs instead of being exhausted by the first one.
    for diff in data:
        added_files, removed_files = [], []
        for file in diff:
            added_files.append(_clean_code_lines(file['added_code']))
            removed_files.append(_clean_code_lines(file['removed_code']))
        new_diff_added_code.append(added_files)
        new_diff_removed_code.append(removed_files)
    return (new_diff_added_code, new_diff_removed_code)
def padding_message(data, max_length):
    """Normalise and pad every message in ``data`` to ``max_length`` tokens.

    Each message first has its whitespace runs collapsed to single spaces,
    then is handed to ``padding_length``.  Returns the results as a new list
    in the same order.
    """
    return [
        padding_length(line=" ".join(message.split()), max_length=max_length)
        for message in data
    ]