diff --git a/src/Native/test/benchmark_test/benchmark_ntt.py b/src/Native/test/benchmark_test/benchmark_ntt.py
index 36aa4e183..eb710489a 100644
--- a/src/Native/test/benchmark_test/benchmark_ntt.py
+++ b/src/Native/test/benchmark_test/benchmark_ntt.py
@@ -12,7 +12,6 @@
def kpu_targets():
return os.getenv('KPU_TARGETS', "").split(',')
-
def nuc_ip():
return os.getenv('NUC_PROXY_IP')
@@ -20,32 +19,31 @@ def nuc_ip():
def nuc_port():
return os.getenv('NUC_PROXY_PORT')
-
def ntt_report_file(default: str):
return os.getenv('BENCHMARK_NTT_REPORT_FILE', default)
+def ntt_matmul_x86_64_report_file(default: str):
+ return os.getenv('BENCHMARK_NTT_MATMUL_X86_64_REPORT_FILE', default)
-def ntt_matmul_report_file(default: str):
- return os.getenv('BENCHMARK_NTT_MATMUL_REPORT_FILE', default)
-
+def ntt_matmul_riscv64_report_file(default: str):
+ return os.getenv('BENCHMARK_NTT_MATMUL_RISCV64_REPORT_FILE', default)
-def generate_benchmark_ntt_md(benchmark_list: list, md_file: str):
+def generate_benchmark_ntt_md(benchmark_list: list, key: str, md_file: str):
# generate dict after sorting
- benchmark_list = sorted(benchmark_list, key=lambda d: (d['kind'], d['op']))
dict = {}
for e in benchmark_list:
- kind = e['kind']
- if kind not in dict:
- dict[kind] = []
- dict[kind].append(e)
+ k = e[key]
+ if k not in dict:
+ dict[k] = []
+ dict[k].append(e)
# generate html table
md = '
\n'
# table head
md += '\t\n'
- for key in benchmark_list[0]:
- md += f'\t\t{key} | \n'
+ for k in benchmark_list[0]:
+ md += f'\t\t{k} | \n'
md += '\t
\n'
# table row
@@ -55,13 +53,13 @@ def generate_benchmark_ntt_md(benchmark_list: list, md_file: str):
md += '\t\n'
if i == 0:
for k, v in value[i].items():
- if k == 'kind':
+ if k == key:
md += f'\t\t{v} | \n'
else:
md += f'\t\t{v} | \n'
else:
for k, v in value[i].items():
- if k != 'kind':
+ if k != key:
md += f'\t\t{v} | \n'
md += '\t
\n'
@@ -70,60 +68,12 @@ def generate_benchmark_ntt_md(benchmark_list: list, md_file: str):
with open(md_file, 'w') as f:
f.write(md)
-
-def generate_benchmark_ntt_matmul_md(execute_path: str, md_file: str):
-
- cmd_status, cmd_result = subprocess.getstatusoutput(
- f'{execute_path}' + '/benchmark_ntt_primitive_size')
-
- lines = cmd_result.strip().split('\n')
-
- # Start of the HTML table
-
- md = '\n'
-
- # Table header
- md += '\t\n'
- md += '\t\tPackMode | \n'
- md += '\t\tM | \n'
- md += '\t\tK | \n'
- md += '\t\tN | \n'
- md += '\t\tCycles | \n'
- md += '\t\tGFLOPS | \n'
- md += '\t
\n'
-
- # Table rows
- for line in lines:
- parts = line.split(',')
- packmode = parts[0].strip()
- M = parts[1].split(':')[1].strip()
- K = parts[2].split(':')[1].strip()
- N = parts[3].split(':')[1].strip()
- cycles = parts[4].split(':')[1].strip()
- gflops = parts[5].split(':')[1].strip()
-
- md += '\t\n'
- md += f'\t\t{packmode} | \n'
- md += f'\t\t{M} | \n'
- md += f'\t\t{K} | \n'
- md += f'\t\t{N} | \n'
- md += f'\t\t{cycles} | \n'
- md += f'\t\t{gflops} | \n'
- md += '\t
\n'
-
- # End of the HTML table
- md += '
\n'
-
- with open(md_file, 'w') as f:
- f.write(md)
-
-
-class BenchmarkNTT():
- def __init__(self, arch: str, target: str, bin_path: str):
+class Benchmark():
+ def __init__(self, arch: str, target: str, bin_path: str, bin_prefix: str):
self.arch = arch
self.target = target
self.bin_path = bin_path
- self.bin_prefix = 'benchmark_ntt_'
+ self.bin_prefix = bin_prefix
self.bin_list = self.traverse_dir()
self.benchmark_list = []
@@ -133,6 +83,85 @@ def traverse_dir(self):
file_list.append(bin)
return file_list
+class Benchmark_riscv64():
+ def send_msg(self, sock, msg):
+ # Prefix each message with a 4-byte length (network byte order)
+ msg = struct.pack('>I', len(msg)) + msg
+ sock.sendall(msg)
+
+ def recv_msg(self, sock):
+ # Read message length and unpack it into an integer
+ raw_msglen = self.recvall(sock, 4)
+ if not raw_msglen:
+ return None
+ msglen = struct.unpack('>I', raw_msglen)[0]
+ # Read the message data
+ return self.recvall(sock, msglen)
+
+ def recvall(self, sock, n):
+ # Helper function to recv n bytes or return None if EOF is hit
+ data = bytearray()
+ while len(data) < n:
+ packet = sock.recv(n - len(data))
+ if not packet:
+ return None
+ data.extend(packet)
+ return data
+
+ def run_evb(self, bin):
+ # connect server
+ ip = nuc_ip()
+ port = nuc_port()
+ client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ client_socket.connect((ip, int(port)))
+
+ # send target
+ dummy = self.recv_msg(client_socket)
+ target_dict = {}
+ target_dict['target'] = self.target
+ self.send_msg(client_socket, json.dumps(target_dict).encode())
+
+ # send header
+ dummy = self.recv_msg(client_socket)
+ header_dict = {}
+ header_dict['case'] = os.path.basename(bin)
+ header_dict['app'] = 1
+ header_dict['kmodel'] = 0
+ header_dict['inputs'] = 0
+ header_dict['description'] = 0
+ header_dict['outputs'] = 0
+ header_dict['cfg_cmds'] = []
+ self.send_msg(client_socket, json.dumps(header_dict).encode())
+
+ # send app
+ dummy = self.recv_msg(client_socket)
+ file_dict = {}
+ file_dict['file_name'] = os.path.basename(bin)
+ file_dict['file_size'] = os.path.getsize(bin)
+ self.send_msg(client_socket, json.dumps(file_dict).encode())
+ dummy = self.recv_msg(client_socket)
+ with open(bin, 'rb') as f:
+ client_socket.sendall(f.read())
+
+ # get result
+ header_dict = {}
+ ret = self.recv_msg(client_socket)
+ header_dict = json.loads(ret.decode())
+ msg = header_dict['msg']
+ if header_dict['type'].find('finish') != -1:
+ self.send_msg(client_socket, f"recved msg".encode())
+ client_socket.close()
+ return 0, msg[0]
+ else:
+ client_socket.close()
+ raise Exception(msg)
+ return -1, msg
+
+
+class BenchmarkNTT(Benchmark):
+ def __init__(self, arch: str, target: str, bin_path: str):
+ Benchmark.__init__(self, arch, target, bin_path, 'benchmark_ntt_')
+
def parse_result(self, result: str):
lines = result.split('\n')
for line in lines:
@@ -145,7 +174,6 @@ def parse_result(self, result: str):
dict[f'{self.arch}_ratio'] = round(
float(dict[f'{self.arch}_roofline']) / (float(dict[f'{self.arch}_actual']) + 0.01), 3)
self.benchmark_list.append(dict)
-
def run():
pass
@@ -234,7 +262,7 @@ def run(self):
self.parse_result(cmd_result)
-class BenchmarkNTT_riscv64(BenchmarkNTT):
+class BenchmarkNTT_riscv64(BenchmarkNTT, Benchmark_riscv64):
def __init__(self, target: str, bin_path: str):
BenchmarkNTT.__init__(self, 'riscv64', target, bin_path)
self.roofline_dict = {'binary': {'add': '10.3',
@@ -311,78 +339,52 @@ def __init__(self, target: str, bin_path: str):
},
}
- def send_msg(self, sock, msg):
- # Prefix each message with a 4-byte length (network byte order)
- msg = struct.pack('>I', len(msg)) + msg
- sock.sendall(msg)
+ def run(self):
+ if self.target not in kpu_targets():
+ return
- def recv_msg(self, sock):
- # Read message length and unpack it into an integer
- raw_msglen = self.recvall(sock, 4)
- if not raw_msglen:
- return None
- msglen = struct.unpack('>I', raw_msglen)[0]
- # Read the message data
- return self.recvall(sock, msglen)
+ for bin in self.bin_list:
+ cmd_status, cmd_result = self.run_evb(bin)
+ assert (cmd_status == 0)
+ lines = cmd_result.split('\r\n')
+ new_lines = lines[1:-1]
+ new_cmd_result = '\n'.join(new_lines)
+ self.parse_result(new_cmd_result)
- def recvall(self, sock, n):
- # Helper function to recv n bytes or return None if EOF is hit
- data = bytearray()
- while len(data) < n:
- packet = sock.recv(n - len(data))
- if not packet:
- return None
- data.extend(packet)
- return data
+class BenchmarkNTTMatmul(Benchmark):
+ def __init__(self, arch: str, target: str, bin_path: str):
+ Benchmark.__init__(self, arch, target, bin_path, 'benchmark_ntt_matmul_primitive_size')
- def run_evb(self, bin):
- # connect server
- ip = nuc_ip()
- port = nuc_port()
- client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- client_socket.connect((ip, int(port)))
+ def parse_result(self, result: str):
+ lines = result.strip().split('\n')
+ for line in lines:
+ dict = {}
+ items = line.split(',')
+ dict['pack_mode'] = items[0].strip()
+ dict['M'] = items[1].split(':')[1].strip()
+ dict['K'] = items[2].split(':')[1].strip()
+ dict['N'] = items[3].split(':')[1].strip()
+ dict[f'{self.arch}_cycles'] = items[4].split(':')[1].strip()
+ dict[f'{self.arch}_gflops'] = items[5].split(':')[1].strip()
+ self.benchmark_list.append(dict)
- # send target
- dummy = self.recv_msg(client_socket)
- target_dict = {}
- target_dict['target'] = self.target
- self.send_msg(client_socket, json.dumps(target_dict).encode())
+ def run():
+ pass
- # send header
- dummy = self.recv_msg(client_socket)
- header_dict = {}
- header_dict['case'] = os.path.basename(bin)
- header_dict['app'] = 1
- header_dict['kmodel'] = 0
- header_dict['inputs'] = 0
- header_dict['description'] = 0
- header_dict['outputs'] = 0
- header_dict['cfg_cmds'] = []
- self.send_msg(client_socket, json.dumps(header_dict).encode())
- # send app
- dummy = self.recv_msg(client_socket)
- file_dict = {}
- file_dict['file_name'] = os.path.basename(bin)
- file_dict['file_size'] = os.path.getsize(bin)
- self.send_msg(client_socket, json.dumps(file_dict).encode())
- dummy = self.recv_msg(client_socket)
- with open(bin, 'rb') as f:
- client_socket.sendall(f.read())
+class BenchmarkNTTMatmul_x86_64(BenchmarkNTTMatmul):
+ def __init__(self, target: str, bin_path: str):
+ BenchmarkNTTMatmul.__init__(self, 'x86_64', target, bin_path)
- # get result
- header_dict = {}
- ret = self.recv_msg(client_socket)
- header_dict = json.loads(ret.decode())
- msg = header_dict['msg']
- if header_dict['type'].find('finish') != -1:
- self.send_msg(client_socket, f"recved msg".encode())
- client_socket.close()
- return 0, msg[0]
- else:
- client_socket.close()
- raise Exception(msg)
- return -1, msg
+ def run(self):
+ for bin in self.bin_list:
+ cmd_status, cmd_result = subprocess.getstatusoutput(f'{bin}')
+ assert (cmd_status == 0)
+ self.parse_result(cmd_result)
+
+class BenchmarkNTTMatmul_riscv64(BenchmarkNTTMatmul, Benchmark_riscv64):
+ def __init__(self, target: str, bin_path: str):
+ BenchmarkNTTMatmul.__init__(self, 'riscv64', target, bin_path)
def run(self):
if self.target not in kpu_targets():
@@ -392,11 +394,10 @@ def run(self):
cmd_status, cmd_result = self.run_evb(bin)
assert (cmd_status == 0)
lines = cmd_result.split('\r\n')
- new_lines = lines[1:-1]
+ new_lines = lines[2:-1]
new_cmd_result = '\n'.join(new_lines)
self.parse_result(new_cmd_result)
-
if __name__ == '__main__':
# parse
parser = argparse.ArgumentParser(prog="benchmark_ntt")
@@ -410,22 +411,38 @@ def run(self):
type=str, default='riscv64_build/bin')
args = parser.parse_args()
- # x86_64
+ # 1. benchmark ntt
+ # 1.1 x86_64
ntt_x86_64 = BenchmarkNTT_x86_64(args.x86_64_target, args.x86_64_path)
ntt_x86_64.run()
- # riscv64
+ # 1.2 riscv64
ntt_riscv64 = BenchmarkNTT_riscv64(args.riscv64_target, args.riscv64_path)
ntt_riscv64.run()
- # merge benchmark list
+ # 1.3 merge benchmark list
benchmark_list = []
for i in range(len(ntt_x86_64.benchmark_list)):
item = {**ntt_x86_64.benchmark_list[i], **ntt_riscv64.benchmark_list[i]}
benchmark_list.append(item)
- # generate md
- ntt_md = ntt_report_file('benchmark_ntt.md')
- ntt_matmul_md = ntt_matmul_report_file('benchmark_ntt_matmul.md')
- generate_benchmark_ntt_md(benchmark_list, ntt_md)
- generate_benchmark_ntt_matmul_md(ntt_x86_64.bin_path, ntt_matmul_md)
+ # 1.4 generate md
+ md_file = ntt_report_file('benchmark_ntt.md')
+ benchmark_list = sorted(benchmark_list, key=lambda d: (d['kind'], d['op']))
+ generate_benchmark_ntt_md(benchmark_list, 'kind', md_file)
+
+ # 2. benchmark ntt matmul
+ # 2.1 x86_64
+ benchmark_list = []
+ ntt_matmul_x86_64 = BenchmarkNTTMatmul_x86_64(args.x86_64_target, args.x86_64_path)
+ ntt_matmul_x86_64.run()
+ benchmark_list = sorted(ntt_matmul_x86_64.benchmark_list, key=lambda d: (d['pack_mode']))
+ md_file = ntt_matmul_x86_64_report_file('benchmark_ntt_matmul_x86_64.md')
+ generate_benchmark_ntt_md(benchmark_list, 'pack_mode', md_file)
+
+ # 2.2 riscv64
+ ntt_matmul_riscv64 = BenchmarkNTTMatmul_riscv64(args.riscv64_target, args.riscv64_path)
+ ntt_matmul_riscv64.run()
+ benchmark_list = sorted(ntt_matmul_riscv64.benchmark_list, key=lambda d: (d['pack_mode']))
+ md_file = ntt_matmul_riscv64_report_file('benchmark_ntt_matmul_riscv64.md')
+ generate_benchmark_ntt_md(benchmark_list, 'pack_mode', md_file)
\ No newline at end of file
diff --git a/src/Native/test/benchmark_test/benchmark_ntt_primitive_size.cpp b/src/Native/test/benchmark_test/benchmark_ntt_matmul_primitive_size.cpp
similarity index 98%
rename from src/Native/test/benchmark_test/benchmark_ntt_primitive_size.cpp
rename to src/Native/test/benchmark_test/benchmark_ntt_matmul_primitive_size.cpp
index 3af82354a..07a99eab8 100644
--- a/src/Native/test/benchmark_test/benchmark_ntt_primitive_size.cpp
+++ b/src/Native/test/benchmark_test/benchmark_ntt_matmul_primitive_size.cpp
@@ -31,13 +31,12 @@ template void benchmark_ntt_matmul_pack_NONE() {
asm volatile("" ::"g"(tc));
auto ops = M * N * (2 * K - 1);
- auto freq = 3.2 * 1e9;
std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_"))
<< std::setprecision(0) << std::fixed << ", M:" << M
<< ", K:" << K << ", N:" << N
<< ", Cycles:" << static_cast(t2 - t1) / run_num
<< ", GFLOPS:"
- << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9
+ << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3
<< std::endl;
}
@@ -74,13 +73,12 @@ template void benchmark_ntt_matmul_pack_K() {
asm volatile("" ::"g"(tc));
auto ops = M * N * (2 * K - 1);
- auto freq = 3.2 * 1e9;
std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_"))
<< std::setprecision(0) << std::fixed << ", M:" << M
<< ", K:" << K / P << ", N:" << N
<< ", Cycles:" << static_cast(t2 - t1) / run_num
<< ", GFLOPS:"
- << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9
+ << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3
<< std::endl;
}
@@ -115,13 +113,12 @@ template void benchmark_ntt_matmul_pack_M() {
asm volatile("" ::"g"(pc));
auto ops = M * N * (2 * K - 1);
- auto freq = 3.2 * 1e9;
std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_"))
<< std::setprecision(0) << std::fixed << ", M:" << M / P
<< ", K:" << K << ", N:" << N
<< ", Cycles:" << static_cast(t2 - t1) / run_num
<< ", GFLOPS:"
- << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9
+ << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3
<< std::endl;
}
@@ -156,13 +153,12 @@ template void benchmark_ntt_matmul_pack_N() {
asm volatile("" ::"g"(pc));
auto ops = M * N * (2 * K - 1);
- auto freq = 3.2 * 1e9;
std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_"))
<< std::setprecision(0) << std::fixed << ", M:" << M
<< ", K:" << K << ", N:" << N / P
<< ", Cycles:" << static_cast(t2 - t1) / run_num
<< ", GFLOPS:"
- << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9
+ << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3
<< std::endl;
}
@@ -201,13 +197,12 @@ template void benchmark_ntt_matmul_pack_M_N() {
asm volatile("" ::"g"(pc));
auto ops = M * N * (2 * K - 1);
- auto freq = 3.2 * 1e9;
std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_"))
<< std::setprecision(0) << std::fixed << ", M:" << M / P
<< ", K:" << K << ", N:" << N / P
<< ", Cycles:" << static_cast(t2 - t1) / run_num
<< ", GFLOPS:"
- << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9
+ << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3
<< std::endl;
}
@@ -246,13 +241,12 @@ template void benchmark_ntt_matmul_pack_M_K() {
asm volatile("" ::"g"(pc));
auto ops = M * N * (2 * K - 1);
- auto freq = 3.2 * 1e9;
std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_"))
<< std::setprecision(0) << std::fixed << ", M:" << M / P
<< ", K:" << K / P << ", N:" << N
<< ", Cycles:" << static_cast(t2 - t1) / run_num
<< ", GFLOPS:"
- << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9
+ << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3
<< std::endl;
}
@@ -291,13 +285,12 @@ template void benchmark_ntt_matmul_pack_K_N() {
asm volatile("" ::"g"(pc));
auto ops = M * N * (2 * K - 1);
- auto freq = 3.2 * 1e9;
std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_"))
<< std::setprecision(0) << std::fixed << ", M:" << M
<< ", K:" << K / P << ", N:" << N / P
<< ", Cycles:" << static_cast(t2 - t1) / run_num
<< ", GFLOPS:"
- << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9
+ << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3
<< std::endl;
}
@@ -338,13 +331,12 @@ template void benchmark_ntt_matmul_pack_M_K_N() {
asm volatile("" ::"g"(pc));
auto ops = M * N * (2 * K - 1);
- auto freq = 3.2 * 1e9;
std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_"))
<< std::setprecision(0) << std::fixed << ", M:" << M / P
<< ", K:" << K / P << ", N:" << N / P
<< ", Cycles:" << static_cast(t2 - t1) / run_num
<< ", GFLOPS:"
- << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9
+ << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3
<< std::endl;
}
@@ -505,8 +497,6 @@ void matmul_primitive_analysis() {
int main() {
-#if __x86_64__
-
{
const auto PackMode = nncase::ntt::ukernels::mamtul_pack_kind::pack_m;
matmul_primitive_analysis();
@@ -542,7 +532,5 @@ int main() {
matmul_primitive_analysis();
}
-#endif
-
return 0;
}
\ No newline at end of file
diff --git a/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp b/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp
index 5a6d4b061..6eb3bbf25 100644
--- a/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp
+++ b/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp
@@ -43,6 +43,7 @@ std::string benchmark_ntt_reduce_add_reduceN_noPack() {
ntt::reduce_sum>(ta, tb[warmup_num + i]);
}
auto t2 = NttTest::get_cpu_cycle();
+ asm volatile("" ::"g"(tb));
std::ostringstream oss;
oss << module << "_" << reduce_mode << "_" << reduce_direction << "_"
diff --git a/src/Native/test/ntt_test.h b/src/Native/test/ntt_test.h
index 4eaccf9fb..845a0e975 100644
--- a/src/Native/test/ntt_test.h
+++ b/src/Native/test/ntt_test.h
@@ -35,6 +35,12 @@
#define ULP_SIZE 10000
#endif
+#if defined __x86_64
+#ifndef CPU_FREQUENCY_MHZ
+#define CPU_FREQUENCY_MHZ 4100
+#endif
+
+#elif defined __riscv
#ifndef CPU_FREQUENCY_MHZ
#define CPU_FREQUENCY_MHZ 1600
#endif
@@ -42,6 +48,7 @@
#ifndef CLOCK_SOURCE_FREQUENCY_MHZ
#define CLOCK_SOURCE_FREQUENCY_MHZ 27
#endif
+#endif
namespace nncase {
namespace NttTest {