diff --git a/src/Native/test/benchmark_test/benchmark_ntt.py b/src/Native/test/benchmark_test/benchmark_ntt.py
index 36aa4e183..eb710489a 100644
--- a/src/Native/test/benchmark_test/benchmark_ntt.py
+++ b/src/Native/test/benchmark_test/benchmark_ntt.py
@@ -12,7 +12,6 @@ def kpu_targets():
     return os.getenv('KPU_TARGETS', "").split(',')
 
 
-
 def nuc_ip():
     return os.getenv('NUC_PROXY_IP')
 
@@ -20,32 +19,31 @@ def nuc_ip():
 def nuc_port():
     return os.getenv('NUC_PROXY_PORT')
 
-
 def ntt_report_file(default: str):
     return os.getenv('BENCHMARK_NTT_REPORT_FILE', default)
 
+def ntt_matmul_x86_64_report_file(default: str):
+    return os.getenv('BENCHMARK_NTT_MATMUL_X86_64_REPORT_FILE', default)
 
-def ntt_matmul_report_file(default: str):
-    return os.getenv('BENCHMARK_NTT_MATMUL_REPORT_FILE', default)
-
+def ntt_matmul_riscv64_report_file(default: str):
+    return os.getenv('BENCHMARK_NTT_MATMUL_RISCV64_REPORT_FILE', default)
 
-def generate_benchmark_ntt_md(benchmark_list: list, md_file: str):
+def generate_benchmark_ntt_md(benchmark_list: list, key: str, md_file: str):
     # generate dict after sorting
-    benchmark_list = sorted(benchmark_list, key=lambda d: (d['kind'], d['op']))
     dict = {}
     for e in benchmark_list:
-        kind = e['kind']
-        if kind not in dict:
-            dict[kind] = []
-        dict[kind].append(e)
+        k = e[key]
+        if k not in dict:
+            dict[k] = []
+        dict[k].append(e)
 
     # generate html table
     md = '<table>\n'
 
     # table head
     md += '\t<tr>\n'
-    for key in benchmark_list[0]:
-        md += f'\t\t<th>{key}</th>\n'
+    for k in benchmark_list[0]:
+        md += f'\t\t<th>{k}</th>\n'
     md += '\t</tr>\n'
 
     # table row
@@ -55,13 +53,13 @@ def generate_benchmark_ntt_md(benchmark_list: list, md_file: str):
             md += '\t<tr>\n'
             if i == 0:
                 for k, v in value[i].items():
-                    if k == 'kind':
+                    if k == key:
                         md += f'\t\t<td rowspan="{length}">{v}</td>\n'
                     else:
                         md += f'\t\t<td>{v}</td>\n'
             else:
                 for k, v in value[i].items():
-                    if k != 'kind':
+                    if k != key:
                         md += f'\t\t<td>{v}</td>\n'
             md += '\t</tr>\n'
 
@@ -70,60 +68,12 @@ def generate_benchmark_ntt_md(benchmark_list: list, md_file: str):
     with open(md_file, 'w') as f:
         f.write(md)
 
-
-def generate_benchmark_ntt_matmul_md(execute_path: str, md_file: str):
-
-    cmd_status, cmd_result = subprocess.getstatusoutput(
-        f'{execute_path}' + '/benchmark_ntt_primitive_size')
-
-    lines = cmd_result.strip().split('\n')
-
-    # Start of the HTML table
-
-    md = '<table>\n'
-
-    # Table header
-    md += '\t<tr>\n'
-    md += '\t\t<th>PackMode</th>\n'
-    md += '\t\t<th>M</th>\n'
-    md += '\t\t<th>K</th>\n'
-    md += '\t\t<th>N</th>\n'
-    md += '\t\t<th>Cycles</th>\n'
-    md += '\t\t<th>GFLOPS</th>\n'
-    md += '\t</tr>\n'
-
-    # Table rows
-    for line in lines:
-        parts = line.split(',')
-        packmode = parts[0].strip()
-        M = parts[1].split(':')[1].strip()
-        K = parts[2].split(':')[1].strip()
-        N = parts[3].split(':')[1].strip()
-        cycles = parts[4].split(':')[1].strip()
-        gflops = parts[5].split(':')[1].strip()
-
-        md += '\t<tr>\n'
-        md += f'\t\t<td>{packmode}</td>\n'
-        md += f'\t\t<td>{M}</td>\n'
-        md += f'\t\t<td>{K}</td>\n'
-        md += f'\t\t<td>{N}</td>\n'
-        md += f'\t\t<td>{cycles}</td>\n'
-        md += f'\t\t<td>{gflops}</td>\n'
-        md += '\t</tr>\n'
-
-    # End of the HTML table
-    md += '</table>
\n' - - with open(md_file, 'w') as f: - f.write(md) - - -class BenchmarkNTT(): - def __init__(self, arch: str, target: str, bin_path: str): +class Benchmark(): + def __init__(self, arch: str, target: str, bin_path: str, bin_prefix: str): self.arch = arch self.target = target self.bin_path = bin_path - self.bin_prefix = 'benchmark_ntt_' + self.bin_prefix = bin_prefix self.bin_list = self.traverse_dir() self.benchmark_list = [] @@ -133,6 +83,85 @@ def traverse_dir(self): file_list.append(bin) return file_list +class Benchmark_riscv64(): + def send_msg(self, sock, msg): + # Prefix each message with a 4-byte length (network byte order) + msg = struct.pack('>I', len(msg)) + msg + sock.sendall(msg) + + def recv_msg(self, sock): + # Read message length and unpack it into an integer + raw_msglen = self.recvall(sock, 4) + if not raw_msglen: + return None + msglen = struct.unpack('>I', raw_msglen)[0] + # Read the message data + return self.recvall(sock, msglen) + + def recvall(self, sock, n): + # Helper function to recv n bytes or return None if EOF is hit + data = bytearray() + while len(data) < n: + packet = sock.recv(n - len(data)) + if not packet: + return None + data.extend(packet) + return data + + def run_evb(self, bin): + # connect server + ip = nuc_ip() + port = nuc_port() + client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + client_socket.connect((ip, int(port))) + + # send target + dummy = self.recv_msg(client_socket) + target_dict = {} + target_dict['target'] = self.target + self.send_msg(client_socket, json.dumps(target_dict).encode()) + + # send header + dummy = self.recv_msg(client_socket) + header_dict = {} + header_dict['case'] = os.path.basename(bin) + header_dict['app'] = 1 + header_dict['kmodel'] = 0 + header_dict['inputs'] = 0 + header_dict['description'] = 0 + header_dict['outputs'] = 0 + header_dict['cfg_cmds'] = [] + self.send_msg(client_socket, json.dumps(header_dict).encode()) + + # send app + dummy = self.recv_msg(client_socket) + file_dict = {} + file_dict['file_name'] = os.path.basename(bin) + file_dict['file_size'] = os.path.getsize(bin) + self.send_msg(client_socket, json.dumps(file_dict).encode()) + dummy = self.recv_msg(client_socket) + with open(bin, 'rb') as f: + client_socket.sendall(f.read()) + + # get result + header_dict = {} + ret = self.recv_msg(client_socket) + header_dict = json.loads(ret.decode()) + msg = header_dict['msg'] + if header_dict['type'].find('finish') != -1: + self.send_msg(client_socket, f"recved msg".encode()) + client_socket.close() + return 0, msg[0] + else: + client_socket.close() + raise Exception(msg) + return -1, msg + + +class BenchmarkNTT(Benchmark): + def __init__(self, arch: str, target: str, bin_path: str): + Benchmark.__init__(self, arch, target, bin_path, 'benchmark_ntt_') + def parse_result(self, result: str): lines = result.split('\n') for line in lines: @@ -145,7 +174,6 @@ def parse_result(self, result: str): dict[f'{self.arch}_ratio'] = round( float(dict[f'{self.arch}_roofline']) / (float(dict[f'{self.arch}_actual']) + 0.01), 3) self.benchmark_list.append(dict) - def run(): pass @@ -234,7 +262,7 @@ def run(self): self.parse_result(cmd_result) -class BenchmarkNTT_riscv64(BenchmarkNTT): +class BenchmarkNTT_riscv64(BenchmarkNTT, Benchmark_riscv64): def __init__(self, target: str, bin_path: str): BenchmarkNTT.__init__(self, 'riscv64', target, bin_path) self.roofline_dict = {'binary': {'add': '10.3', @@ -311,78 +339,52 @@ def __init__(self, target: str, bin_path: str): }, } - def send_msg(self, sock, msg): - # 
Prefix each message with a 4-byte length (network byte order) - msg = struct.pack('>I', len(msg)) + msg - sock.sendall(msg) + def run(self): + if self.target not in kpu_targets(): + return - def recv_msg(self, sock): - # Read message length and unpack it into an integer - raw_msglen = self.recvall(sock, 4) - if not raw_msglen: - return None - msglen = struct.unpack('>I', raw_msglen)[0] - # Read the message data - return self.recvall(sock, msglen) + for bin in self.bin_list: + cmd_status, cmd_result = self.run_evb(bin) + assert (cmd_status == 0) + lines = cmd_result.split('\r\n') + new_lines = lines[1:-1] + new_cmd_result = '\n'.join(new_lines) + self.parse_result(new_cmd_result) - def recvall(self, sock, n): - # Helper function to recv n bytes or return None if EOF is hit - data = bytearray() - while len(data) < n: - packet = sock.recv(n - len(data)) - if not packet: - return None - data.extend(packet) - return data +class BenchmarkNTTMatmul(Benchmark): + def __init__(self, arch: str, target: str, bin_path: str): + Benchmark.__init__(self, arch, target, bin_path, 'benchmark_ntt_matmul_primitive_size') - def run_evb(self, bin): - # connect server - ip = nuc_ip() - port = nuc_port() - client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - client_socket.connect((ip, int(port))) + def parse_result(self, result: str): + lines = result.strip().split('\n') + for line in lines: + dict = {} + items = line.split(',') + dict['pack_mode'] = items[0].strip() + dict['M'] = items[1].split(':')[1].strip() + dict['K'] = items[2].split(':')[1].strip() + dict['N'] = items[3].split(':')[1].strip() + dict[f'{self.arch}_cycles'] = items[4].split(':')[1].strip() + dict[f'{self.arch}_gflops'] = items[5].split(':')[1].strip() + self.benchmark_list.append(dict) - # send target - dummy = self.recv_msg(client_socket) - target_dict = {} - target_dict['target'] = self.target - self.send_msg(client_socket, json.dumps(target_dict).encode()) + def run(): + pass - # send header - dummy = self.recv_msg(client_socket) - header_dict = {} - header_dict['case'] = os.path.basename(bin) - header_dict['app'] = 1 - header_dict['kmodel'] = 0 - header_dict['inputs'] = 0 - header_dict['description'] = 0 - header_dict['outputs'] = 0 - header_dict['cfg_cmds'] = [] - self.send_msg(client_socket, json.dumps(header_dict).encode()) - # send app - dummy = self.recv_msg(client_socket) - file_dict = {} - file_dict['file_name'] = os.path.basename(bin) - file_dict['file_size'] = os.path.getsize(bin) - self.send_msg(client_socket, json.dumps(file_dict).encode()) - dummy = self.recv_msg(client_socket) - with open(bin, 'rb') as f: - client_socket.sendall(f.read()) +class BenchmarkNTTMatmul_x86_64(BenchmarkNTTMatmul): + def __init__(self, target: str, bin_path: str): + BenchmarkNTTMatmul.__init__(self, 'x86_64', target, bin_path) - # get result - header_dict = {} - ret = self.recv_msg(client_socket) - header_dict = json.loads(ret.decode()) - msg = header_dict['msg'] - if header_dict['type'].find('finish') != -1: - self.send_msg(client_socket, f"recved msg".encode()) - client_socket.close() - return 0, msg[0] - else: - client_socket.close() - raise Exception(msg) - return -1, msg + def run(self): + for bin in self.bin_list: + cmd_status, cmd_result = subprocess.getstatusoutput(f'{bin}') + assert (cmd_status == 0) + self.parse_result(cmd_result) + +class BenchmarkNTTMatmul_riscv64(BenchmarkNTTMatmul, Benchmark_riscv64): + def __init__(self, target: str, bin_path: str): + BenchmarkNTTMatmul.__init__(self, 'riscv64', target, bin_path) def 
run(self): if self.target not in kpu_targets(): @@ -392,11 +394,10 @@ def run(self): cmd_status, cmd_result = self.run_evb(bin) assert (cmd_status == 0) lines = cmd_result.split('\r\n') - new_lines = lines[1:-1] + new_lines = lines[2:-1] new_cmd_result = '\n'.join(new_lines) self.parse_result(new_cmd_result) - if __name__ == '__main__': # parse parser = argparse.ArgumentParser(prog="benchmark_ntt") @@ -410,22 +411,38 @@ def run(self): type=str, default='riscv64_build/bin') args = parser.parse_args() - # x86_64 + # 1. benchmark ntt + # 1.1 x86_64 ntt_x86_64 = BenchmarkNTT_x86_64(args.x86_64_target, args.x86_64_path) ntt_x86_64.run() - # riscv64 + # 1.2 riscv64 ntt_riscv64 = BenchmarkNTT_riscv64(args.riscv64_target, args.riscv64_path) ntt_riscv64.run() - # merge benchmark list + # 1.3 merge benchmark list benchmark_list = [] for i in range(len(ntt_x86_64.benchmark_list)): item = {**ntt_x86_64.benchmark_list[i], **ntt_riscv64.benchmark_list[i]} benchmark_list.append(item) - # generate md - ntt_md = ntt_report_file('benchmark_ntt.md') - ntt_matmul_md = ntt_matmul_report_file('benchmark_ntt_matmul.md') - generate_benchmark_ntt_md(benchmark_list, ntt_md) - generate_benchmark_ntt_matmul_md(ntt_x86_64.bin_path, ntt_matmul_md) + # 1.4 generate md + md_file = ntt_report_file('benchmark_ntt.md') + benchmark_list = sorted(benchmark_list, key=lambda d: (d['kind'], d['op'])) + generate_benchmark_ntt_md(benchmark_list, 'kind', md_file) + + # 2. benchmark ntt matmul + # 2.1 x86_64 + benchmark_list = [] + ntt_matmul_x86_64 = BenchmarkNTTMatmul_x86_64(args.x86_64_target, args.x86_64_path) + ntt_matmul_x86_64.run() + benchmark_list = sorted(ntt_matmul_x86_64.benchmark_list, key=lambda d: (d['pack_mode'])) + md_file = ntt_matmul_x86_64_report_file('benchmark_ntt_matmul_x86_64.md') + generate_benchmark_ntt_md(benchmark_list, 'pack_mode', md_file) + + # 2.2 riscv64 + ntt_matmul_riscv64 = BenchmarkNTTMatmul_riscv64(args.riscv64_target, args.riscv64_path) + ntt_matmul_riscv64.run() + benchmark_list = sorted(ntt_matmul_riscv64.benchmark_list, key=lambda d: (d['pack_mode'])) + md_file = ntt_matmul_riscv64_report_file('benchmark_ntt_matmul_riscv64.md') + generate_benchmark_ntt_md(benchmark_list, 'pack_mode', md_file) \ No newline at end of file diff --git a/src/Native/test/benchmark_test/benchmark_ntt_primitive_size.cpp b/src/Native/test/benchmark_test/benchmark_ntt_matmul_primitive_size.cpp similarity index 98% rename from src/Native/test/benchmark_test/benchmark_ntt_primitive_size.cpp rename to src/Native/test/benchmark_test/benchmark_ntt_matmul_primitive_size.cpp index 3af82354a..07a99eab8 100644 --- a/src/Native/test/benchmark_test/benchmark_ntt_primitive_size.cpp +++ b/src/Native/test/benchmark_test/benchmark_ntt_matmul_primitive_size.cpp @@ -31,13 +31,12 @@ template void benchmark_ntt_matmul_pack_NONE() { asm volatile("" ::"g"(tc)); auto ops = M * N * (2 * K - 1); - auto freq = 3.2 * 1e9; std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_")) << std::setprecision(0) << std::fixed << ", M:" << M << ", K:" << K << ", N:" << N << ", Cycles:" << static_cast(t2 - t1) / run_num << ", GFLOPS:" - << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9 + << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3 << std::endl; } @@ -74,13 +73,12 @@ template void benchmark_ntt_matmul_pack_K() { asm volatile("" ::"g"(tc)); auto ops = M * N * (2 * K - 1); - auto freq = 3.2 * 1e9; std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_")) << std::setprecision(0) << std::fixed << ", 
M:" << M << ", K:" << K / P << ", N:" << N << ", Cycles:" << static_cast(t2 - t1) / run_num << ", GFLOPS:" - << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9 + << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3 << std::endl; } @@ -115,13 +113,12 @@ template void benchmark_ntt_matmul_pack_M() { asm volatile("" ::"g"(pc)); auto ops = M * N * (2 * K - 1); - auto freq = 3.2 * 1e9; std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_")) << std::setprecision(0) << std::fixed << ", M:" << M / P << ", K:" << K << ", N:" << N << ", Cycles:" << static_cast(t2 - t1) / run_num << ", GFLOPS:" - << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9 + << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3 << std::endl; } @@ -156,13 +153,12 @@ template void benchmark_ntt_matmul_pack_N() { asm volatile("" ::"g"(pc)); auto ops = M * N * (2 * K - 1); - auto freq = 3.2 * 1e9; std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_")) << std::setprecision(0) << std::fixed << ", M:" << M << ", K:" << K << ", N:" << N / P << ", Cycles:" << static_cast(t2 - t1) / run_num << ", GFLOPS:" - << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9 + << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3 << std::endl; } @@ -201,13 +197,12 @@ template void benchmark_ntt_matmul_pack_M_N() { asm volatile("" ::"g"(pc)); auto ops = M * N * (2 * K - 1); - auto freq = 3.2 * 1e9; std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_")) << std::setprecision(0) << std::fixed << ", M:" << M / P << ", K:" << K << ", N:" << N / P << ", Cycles:" << static_cast(t2 - t1) / run_num << ", GFLOPS:" - << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9 + << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3 << std::endl; } @@ -246,13 +241,12 @@ template void benchmark_ntt_matmul_pack_M_K() { asm volatile("" ::"g"(pc)); auto ops = M * N * (2 * K - 1); - auto freq = 3.2 * 1e9; std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_")) << std::setprecision(0) << std::fixed << ", M:" << M / P << ", K:" << K / P << ", N:" << N << ", Cycles:" << static_cast(t2 - t1) / run_num << ", GFLOPS:" - << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9 + << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3 << std::endl; } @@ -291,13 +285,12 @@ template void benchmark_ntt_matmul_pack_K_N() { asm volatile("" ::"g"(pc)); auto ops = M * N * (2 * K - 1); - auto freq = 3.2 * 1e9; std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_")) << std::setprecision(0) << std::fixed << ", M:" << M << ", K:" << K / P << ", N:" << N / P << ", Cycles:" << static_cast(t2 - t1) / run_num << ", GFLOPS:" - << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9 + << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3 << std::endl; } @@ -338,13 +331,12 @@ template void benchmark_ntt_matmul_pack_M_K_N() { asm volatile("" ::"g"(pc)); auto ops = M * N * (2 * K - 1); - auto freq = 3.2 * 1e9; std::cout << (__FUNCTION__ + std::strlen("benchmark_ntt_matmul_pack_")) << std::setprecision(0) << std::fixed << ", M:" << M / P << ", K:" << K / P << ", N:" << N / P << ", Cycles:" << static_cast(t2 - t1) / run_num << ", GFLOPS:" - << ops / (static_cast(t2 - t1) / run_num / freq) * 1e-9 + << ops / (static_cast(t2 - t1) / run_num / CPU_FREQUENCY_MHZ) * 1e-3 << std::endl; } @@ -505,8 +497,6 @@ void matmul_primitive_analysis() { int main() { -#if __x86_64__ - { const auto PackMode = 
nncase::ntt::ukernels::mamtul_pack_kind::pack_m; matmul_primitive_analysis(); @@ -542,7 +532,5 @@ int main() { matmul_primitive_analysis(); } -#endif - return 0; } \ No newline at end of file diff --git a/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp b/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp index 5a6d4b061..6eb3bbf25 100644 --- a/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp +++ b/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp @@ -43,6 +43,7 @@ std::string benchmark_ntt_reduce_add_reduceN_noPack() { ntt::reduce_sum>(ta, tb[warmup_num + i]); } auto t2 = NttTest::get_cpu_cycle(); + asm volatile("" ::"g"(tb)); std::ostringstream oss; oss << module << "_" << reduce_mode << "_" << reduce_direction << "_" diff --git a/src/Native/test/ntt_test.h b/src/Native/test/ntt_test.h index 4eaccf9fb..845a0e975 100644 --- a/src/Native/test/ntt_test.h +++ b/src/Native/test/ntt_test.h @@ -35,6 +35,12 @@ #define ULP_SIZE 10000 #endif +#if defined __x86_64 +#ifndef CPU_FREQUENCY_MHZ +#define CPU_FREQUENCY_MHZ 4100 +#endif + +#elif defined __riscv #ifndef CPU_FREQUENCY_MHZ #define CPU_FREQUENCY_MHZ 1600 #endif @@ -42,6 +48,7 @@ #ifndef CLOCK_SOURCE_FREQUENCY_MHZ #define CLOCK_SOURCE_FREQUENCY_MHZ 27 #endif +#endif namespace nncase { namespace NttTest {
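
Note on the GFLOPS change in benchmark_ntt_matmul_primitive_size.cpp: the hard-coded frequency (auto freq = 3.2 * 1e9) is replaced by the per-architecture CPU_FREQUENCY_MHZ macro from ntt_test.h (4100 on x86_64, 1600 on riscv64), so the scale factor drops from 1e-9 to 1e-3. Below is a minimal Python sketch of the unit algebra only; the helper name, the 32x32x32 sizes, and the cycle count are invented for illustration.

# Unit check only: names and numbers below are made up for illustration.
def gflops_from_cycles(ops: int, cycles_per_run: float, freq_mhz: float) -> float:
    # cycles_per_run / (freq_mhz * 1e6) is the run time in seconds, so
    # ops / seconds * 1e-9  ==  ops / (cycles_per_run / freq_mhz) * 1e-3
    return ops / (cycles_per_run / freq_mhz) * 1e-3

M, K, N = 32, 32, 32                          # made-up sizes, just for the check
ops = M * N * (2 * K - 1)                     # FLOP count used by the benchmark
cycles = 1000.0                               # made-up per-run cycle count
old = ops / (cycles / (4100 * 1e6)) * 1e-9    # old style: frequency in Hz, 1e-9 scale
new = gflops_from_cycles(ops, cycles, 4100)   # new style: CPU_FREQUENCY_MHZ, 1e-3 scale
assert abs(old - new) < 1e-6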
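
The new Benchmark_riscv64 helper talks to the NUC proxy with 4-byte, big-endian length-prefixed messages (send_msg / recv_msg / recvall). The loopback sketch below illustrates that framing and assumes nothing about the real proxy beyond the prefix format; the echo server exists only for the demo.

# Loopback demo of the 4-byte big-endian length prefix; illustrative only.
import socket
import struct
import threading

def send_msg(sock, msg: bytes):
    sock.sendall(struct.pack('>I', len(msg)) + msg)    # length header, then payload

def recvall(sock, n: int) -> bytes:
    data = bytearray()
    while len(data) < n:                               # keep reading until n bytes arrive
        packet = sock.recv(n - len(data))
        if not packet:
            raise ConnectionError('socket closed before the full message arrived')
        data.extend(packet)
    return bytes(data)

def recv_msg(sock) -> bytes:
    (length,) = struct.unpack('>I', recvall(sock, 4))  # read the length header
    return recvall(sock, length)                       # then the payload

def echo_once(server):
    conn, _ = server.accept()
    with conn:
        send_msg(conn, recv_msg(conn))                 # echo one framed message back

server = socket.create_server(('127.0.0.1', 0))
threading.Thread(target=echo_once, args=(server,), daemon=True).start()
client = socket.create_connection(server.getsockname())
send_msg(client, b'{"target": "demo"}')
assert recv_msg(client) == b'{"target": "demo"}'
client.close()
server.close()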
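
The matmul report now flows through the same generate_benchmark_ntt_md helper as the ntt report, keyed on pack_mode instead of kind. A rough sketch of how one output line from benchmark_ntt_matmul_primitive_size becomes a table row; the sample line and its numbers are invented.

# The sample line is invented; real lines come from the renamed
# benchmark_ntt_matmul_primitive_size binary on the device under test.
sample = 'NONE, M:32, K:32, N:32, Cycles:12345, GFLOPS:42'

items = sample.split(',')
entry = {
    'pack_mode': items[0].strip(),
    'M': items[1].split(':')[1].strip(),
    'K': items[2].split(':')[1].strip(),
    'N': items[3].split(':')[1].strip(),
    'x86_64_cycles': items[4].split(':')[1].strip(),
    'x86_64_gflops': items[5].split(':')[1].strip(),
}

# generate_benchmark_ntt_md then groups rows by 'pack_mode', the same way it
# groups the ntt report by 'kind':
benchmark_list = sorted([entry], key=lambda d: d['pack_mode'])
# generate_benchmark_ntt_md(benchmark_list, 'pack_mode', 'benchmark_ntt_matmul_x86_64.md')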