-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_matrix_feature_of_UF_matrices.py
executable file
·201 lines (158 loc) · 7.51 KB
/
get_matrix_feature_of_UF_matrices.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import math
import os
import pickle
import shutil
import subprocess
import sys
# Directory holding the downloaded ``<name>.tar.gz`` matrix archives.
UF_DIR = "/home/duzhen/matrix_suite"
# Only directory entries carrying this suffix are treated as matrix archives.
ARCHIVE_SUFFIX = ".tar.gz"
# Matrices whose size line declares at least this many non-zeros are skipped.
MAX_NNZ = 60000000

# Columns of one output line:
# matrix,rows,cols,nnz,density,avr_nnz_row,min_nnz_row,max_nnz_row,
# var_nnz_row,ell_padding_ratio,empty_rows


def log(message):
    """Print a progress message, flushed so it stays ordered with ``tar`` output."""
    print(message, flush=True)


def remove_extracted_dirs(base_dir):
    """Delete every sub-directory of *base_dir*; plain files (the archives) are kept.

    Symbolic links to directories are not followed and not removed.
    """
    # Collect first so the directory is not modified while it is being scanned.
    with os.scandir(base_dir) as entries:
        sub_dirs = [
            entry.path for entry in entries
            if entry.is_dir(follow_symlinks=False)
        ]
    for sub_dir in sub_dirs:
        shutil.rmtree(sub_dir, ignore_errors=True)


def list_archive_names(base_dir):
    """Return the set of archive base names (``.tar.gz`` stripped) found in *base_dir*.

    Entries that do not end in ``.tar.gz`` are ignored instead of having their
    last seven characters chopped off.
    """
    return {
        file_name[:-len(ARCHIVE_SUFFIX)]
        for file_name in os.listdir(base_dir)
        if file_name.endswith(ARCHIVE_SUFFIX)
    }


def load_processed_names(output_file_name):
    """Return the matrix names already recorded in the feature file.

    The name is the first comma-separated field of every non-blank line.
    An absent file yields an empty set.
    """
    processed = set()
    if not os.path.exists(output_file_name):
        return processed
    log(output_file_name + " is existing")
    with open(output_file_name, "r") as feature_file:
        for line in feature_file:
            line = line.strip()
            if line:
                processed.add(line.split(",")[0])
    return processed


def compute_feature_line(matrix_label, lines, max_nnz=MAX_NNZ):
    """Build the CSV feature line for one matrix from the lines of its ``.mtx`` file.

    Lines starting with ``%`` and blank lines are skipped.  The first remaining
    line is read as ``rows cols nnz``; every later line is read as an entry
    whose first two fields are the 1-based row and column index.

    Args:
        matrix_label: Name written into the first CSV column and log messages.
        lines: Iterable of text lines (an open file works).
        max_nnz: Matrices declaring at least this many non-zeros are skipped.

    Returns:
        The comma-separated feature line (without trailing newline), or
        ``None`` when the matrix is skipped; the reason is logged.
    """
    row_num = col_num = nnz = 0
    row_lengths = None  # becomes a per-row counter list once the size line is read

    for raw_line in lines:
        fields = raw_line.split()
        if not fields or raw_line.startswith("%"):
            continue

        if row_lengths is None:
            # Size line.  int() is used instead of eval(): file content must
            # never be executed as code.
            try:
                row_num, col_num, nnz = (int(token) for token in fields[:3])
            except ValueError:
                log(matrix_label + ":malformed size line error, line:" + raw_line.strip())
                return None
            log(matrix_label + " nnz:" + str(nnz))
            if nnz >= max_nnz:
                log(matrix_label + ":too large to handle, nnz:" + str(nnz))
                return None
            if row_num <= 0 or col_num <= 0 or nnz <= 0:
                # The ratios below would divide by zero.
                log(matrix_label + ":empty matrix error, rows:" + str(row_num)
                    + ", cols:" + str(col_num) + ", nnz:" + str(nnz))
                return None
            row_lengths = [0] * row_num
            continue

        try:
            row = int(fields[0])
            col = int(fields[1])
        except (ValueError, IndexError):
            log(matrix_label + ":malformed entry line error, line:" + raw_line.strip())
            return None
        if row < 1:
            log(matrix_label + ":row index too small error, row:" + str(row))
            return None
        if col < 1:
            log(matrix_label + ":col index too small error, col:" + str(col))
            return None
        if row > row_num:
            log(matrix_label + ":row index too large error, row:" + str(row)
                + ", row_num_of_matrix:" + str(row_num))
            return None
        if col > col_num:
            log(matrix_label + ":col index too large error, col:" + str(col)
                + ", col_num_of_matrix:" + str(col_num))
            return None
        row_lengths[row - 1] += 1

    if row_lengths is None:
        log(matrix_label + ":no size line found error")
        return None

    # Density: non-zeros / (rows * cols); average row length: non-zeros / rows.
    matrix_density = nnz / (row_num * col_num)
    avg_row_length = nnz / row_num
    min_row_length = min(row_lengths)
    max_row_length = max(row_lengths)
    # Plain sequential accumulation (not sum()) keeps the floating-point result
    # identical across Python versions.
    squared_deviation = 0
    for row_len in row_lengths:
        squared_deviation = squared_deviation + math.pow(row_len - avg_row_length, 2)
    row_length_variance = squared_deviation / row_num
    ell_padding_rate = max_row_length * row_num / nnz
    # A row is empty exactly when no entry incremented its counter.
    empty_line_num = row_lengths.count(0)

    columns = [
        matrix_label, row_num, col_num, nnz,
        matrix_density, avg_row_length, min_row_length, max_row_length,
        row_length_variance, ell_padding_rate, empty_line_num,
    ]
    return ",".join(str(value) for value in columns)


def process_matrix(matrix_file_name, name_map, output_file_name, base_dir=UF_DIR):
    """Extract one archive, append its feature line to the output file, clean up.

    Args:
        matrix_file_name: Archive base name (without ``.tar.gz``).
        name_map: Mapping from archive base name to the real matrix name, for
            archives whose contained directory/file is named differently.
        output_file_name: Feature file to append to.
        base_dir: Directory containing the archive.
    """
    # List form, no shell: the file name is passed to tar verbatim.
    subprocess.run(
        ["tar", "-zxvf", matrix_file_name + ARCHIVE_SUFFIX],
        cwd=base_dir,
        check=False,
    )
    log("finish tar")

    # The matrix inside the archive may be named differently from the archive.
    matrix_name = name_map.get(matrix_file_name, matrix_file_name)
    mtx_path = os.path.join(base_dir, matrix_name, matrix_name + ".mtx")
    try:
        try:
            with open(mtx_path, "r") as mtx_file:
                feature_line = compute_feature_line(matrix_file_name, mtx_file)
        except OSError as err:
            # Missing/unreadable matrix file: skip this matrix, keep the batch going.
            log(matrix_file_name + ":cannot read matrix file error, " + str(err))
            feature_line = None
        if feature_line is not None:
            # Append immediately so an interrupted run can resume where it stopped.
            with open(output_file_name, "a") as feature_file:
                feature_file.write(feature_line + "\n")
    finally:
        # Remove the extracted data even when the matrix was skipped.
        remove_extracted_dirs(base_dir)


def main(argv=None):
    """Compute features for every archive in ``UF_DIR`` not yet in the output file.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.  Exactly one argument
            (the output feature file) is required.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 2:
        sys.exit("usage: " + args[0] + " <output_feature_file>")
    output_file_name = args[1]

    # Remove directories left over from an earlier run.
    remove_extracted_dirs(UF_DIR)

    # Mapping from archive file name to the real matrix name, for the few
    # matrices where the two differ.
    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # run this with a "repeat_matrix_name" file you created yourself.
    with open(os.path.join(os.getcwd(), "repeat_matrix_name"), "rb") as f:
        file_2_matrix_name_map = pickle.load(f)
    print(file_2_matrix_name_map)

    existing_matrix_file_name_set = list_archive_names(UF_DIR)
    print(len(existing_matrix_file_name_set))

    # Matrices already present in the feature file are not analysed again.
    processed_names = load_processed_names(output_file_name)

    # Sorted so the processing order is reproducible between runs.
    for matrix_file_name in sorted(existing_matrix_file_name_set):
        if matrix_file_name in processed_names:
            log(matrix_file_name + " is existing")
            continue
        process_matrix(matrix_file_name, file_2_matrix_name_map, output_file_name)


if __name__ == "__main__":
    main()