diff --git a/code/.gitignore b/code/.gitignore
index 2cc89621..72da920f 100644
--- a/code/.gitignore
+++ b/code/.gitignore
@@ -6,3 +6,4 @@
 !*.sh
 !*.py
 !*.html
+!*.csv
diff --git a/code/Makefile b/code/Makefile
index 925d0493..39cb42ef 100644
--- a/code/Makefile
+++ b/code/Makefile
@@ -17,4 +17,10 @@ clean:
 %: %.cpp %.h common.h
 	$(CXX) $(CFLAGS) $< -mlsx -mlasx -o $@
 
+measure: measure.cpp measure.h
+	$(CXX) -O2 $< -o $@
+
+run-measure: measure
+	./measure -p
+
 .SUFFIXES:
diff --git a/code/measure.cpp b/code/measure.cpp
new file mode 100644
index 00000000..d59675a2
--- /dev/null
+++ b/code/measure.cpp
@@ -0,0 +1,303 @@
+#include <assert.h>
+#include <linux/perf_event.h>
+#include <map>
+#include <sched.h>
+#include <set>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+#include <vector>
+
+// learned from lmbench lat_mem_rd
+#define FIVE(X) X X X X X
+#define TEN(X) FIVE(X) FIVE(X)
+#define FIFTY(X) TEN(X) TEN(X) TEN(X) TEN(X) TEN(X)
+#define HUNDRED(X) FIFTY(X) FIFTY(X)
+#define THOUSAND(X) HUNDRED(TEN(X))
+
+// get time or cycles
+// unit: ns or cycle
+uint64_t get_time();
+// prioritize cycle over time
+void setup_time_or_cycles();
+uint64_t get_time_or_cycles();
+
+void bind_to_core();
+
+// perf related
+void setup_perf_cycles();
+uint64_t perf_read_cycles();
+void setup_perf_instructions();
+uint64_t perf_read_instructions();
+void setup_perf_llc_misses();
+uint64_t perf_read_llc_misses();
+void setup_perf_llc_loads();
+uint64_t perf_read_llc_loads();
+
+// outer-loop iteration count of every test, overridable with -n
+int N = 100000;
+
+// first pass over measure.h: define test_NAME(n), which runs INST
+// 1000 * n times; the variadic arguments become the asm clobber list
+#define INSTR_TEST(NAME, INST, ...)                                            \
+  void test_##NAME(int n) {                                                    \
+    for (int i = 0; i < n; i++) {                                              \
+      asm volatile(".align 4\n" THOUSAND(INST) : : : __VA_ARGS__);             \
+    }                                                                          \
+  }
+
+#include "measure.h"
+
+#undef INSTR_TEST
+
+// one entry of the test table built from measure.h
+struct InstrTest {
+  const char *name;
+  const char *inst;
+  void (*test)(int);
+};
+
+// second pass over measure.h: list every test in the `tests` table
+#define INSTR_TEST(NAME, INST, ...)                                            \
+  InstrTest{.name = #NAME, .inst = #INST, .test = test_##NAME},
+
+std::vector<InstrTest> tests = {
+#include "measure.h"
+};
+
+#undef INSTR_TEST
+
+// results per instruction, keyed by test name without its _N / _tp suffix
+struct InstrInfo {
+  // distinct rounded latencies, one test per dependent source operand
+  std::set<double> latency;
+  // 1.0 / cycles of the _tp test; 0 until one has been measured
+  double throughput = 0.0;
+};
+
+// wall-clock time in ns (gettimeofday, so microsecond granularity)
+uint64_t get_time() {
+  struct timeval tv = {};
+  gettimeofday(&tv, nullptr);
+  return (uint64_t)tv.tv_sec * 1000000000 + (uint64_t)tv.tv_usec * 1000;
+}
+
+// read the current 64-bit value of the perf counter behind fd
+uint64_t perf_read_common(int fd) {
+  uint64_t counter;
+  int res = read(fd, &counter, sizeof(counter));
+  assert(res == sizeof(counter));
+  return counter;
+}
+
+// open a user-space-only perf counter for the calling thread on any cpu;
+// returns its file descriptor and exits the process on failure
+int setup_perf_common(uint32_t type, uint64_t config) {
+  // the attributes are only read during the syscall, so they can live on
+  // the stack instead of in a heap block that is never freed
+  struct perf_event_attr attr;
+  memset(&attr, 0, sizeof(attr));
+  attr.type = type;
+  attr.size = sizeof(attr);
+  attr.config = config;
+  attr.disabled = 0;
+  attr.pinned = 1;
+  attr.inherit = 1;
+  attr.exclude_kernel = 1;
+  int fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
+  if (fd < 0) {
+    perror("perf_event_open");
+    fprintf(stderr, "try: sudo sysctl kernel.perf_event_paranoid=2\n");
+    exit(1);
+  }
+  return fd;
+}
+
+int perf_fd_cycles = -1;
+
+uint64_t perf_read_cycles() { return perf_read_common(perf_fd_cycles); }
+
+void setup_perf_cycles() {
+  perf_fd_cycles =
+      setup_perf_common(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES);
+  if (perf_fd_cycles >= 0) {
+    printf("Using PMU to count cycles\n");
+  }
+}
+
+int perf_fd_instructions = -1;
+
+uint64_t perf_read_instructions() {
+  return perf_read_common(perf_fd_instructions);
+}
+
+void setup_perf_instructions() {
+  perf_fd_instructions =
+      setup_perf_common(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS);
+}
+
+int perf_fd_llc_misses = -1;
+int perf_fd_llc_loads = -1;
+
+uint64_t perf_read_llc_misses() { return perf_read_common(perf_fd_llc_misses); }
+
+uint64_t perf_read_llc_loads() { return perf_read_common(perf_fd_llc_loads); }
+
+void setup_perf_llc_misses() {
+  perf_fd_llc_misses = setup_perf_common(
+      PERF_TYPE_HW_CACHE, (PERF_COUNT_HW_CACHE_LL) |
+                              (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+                              (PERF_COUNT_HW_CACHE_RESULT_MISS << 16));
+}
+
+void setup_perf_llc_loads() {
+  perf_fd_llc_loads = setup_perf_common(
+      PERF_TYPE_HW_CACHE, (PERF_COUNT_HW_CACHE_LL) |
+                              (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+                              (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16));
+}
+
+void setup_time_or_cycles() { setup_perf_cycles(); }
+
+uint64_t get_time_or_cycles() {
+  if (perf_fd_cycles >= 0) {
+    // cycle
+    return perf_read_cycles();
+  } else {
+    // time
+    return get_time();
+  }
+}
+
+// bind to core 0
+void bind_to_core() {
+  cpu_set_t set;
+  CPU_ZERO(&set);
+  CPU_SET(0, &set);
+  int res = sched_setaffinity(0, sizeof(set), &set);
+  if (res == 0) {
+    printf("Pinned to cpu 0\n");
+  }
+}
+
+// run every test from measure.h, print latency / throughput relative to the
+// `unit` test and write the per-instruction summary to measure.csv
+//   -n count  outer-loop iterations per test (default 100000)
+//   -p        count cycles with the PMU instead of wall-clock time
+int main(int argc, char *argv[]) {
+  bool perf = false;
+
+  int opt;
+  while ((opt = getopt(argc, argv, "n:p")) != -1) {
+    switch (opt) {
+    case 'n':
+      // a malformed or non-positive count would make every ratio below
+      // meaningless, so refuse it
+      if (sscanf(optarg, "%d", &N) != 1 || N <= 0) {
+        fprintf(stderr, "Invalid iteration count: %s\n", optarg);
+        exit(EXIT_FAILURE);
+      }
+      break;
+    case 'p':
+      perf = true;
+      break;
+    default:
+      fprintf(stderr, "Usage: %s [-n count] [-p]\n", argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  bind_to_core();
+  if (perf) {
+    setup_time_or_cycles();
+  }
+
+  // calibrate unit cycle time
+  uint64_t begin = get_time_or_cycles();
+  test_unit(N);
+  uint64_t unit_elapsed = get_time_or_cycles() - begin;
+  if (unit_elapsed == 0) {
+    // every result is divided by this value
+    fprintf(stderr, "Calibration run too short, try a larger -n\n");
+    exit(EXIT_FAILURE);
+  }
+
+  std::map<std::string, InstrInfo> info;
+
+  for (const auto &it : tests) {
+    std::string name = it.name;
+    if (name == "unit") {
+      continue;
+    }
+
+    begin = get_time_or_cycles();
+    it.test(N);
+    uint64_t elapsed = get_time_or_cycles() - begin;
+    // elapsed relative to the calibration run of the `unit` test
+    double cycles = (double)elapsed / unit_elapsed;
+
+    std::string base_name;
+    size_t tp_index = name.find("_tp");
+    if (tp_index != std::string::npos) {
+      base_name = name.substr(0, tp_index);
+      printf("%s: throughput 1/%.2lf=%.2lf instructions\n", it.name, cycles,
+             1.0 / cycles);
+      info[base_name].throughput = 1.0 / cycles;
+    } else {
+      base_name = name;
+
+      size_t last_underscore_index = name.find_last_of("_");
+      if (last_underscore_index != std::string::npos) {
+        char *end = NULL;
+        const char *start = &name.c_str()[last_underscore_index + 1];
+        strtol(start, &end, 10);
+
+        if (start != end) {
+          // strip suffix _2 etc
+          base_name = name.substr(0, last_underscore_index);
+        }
+      }
+
+      // round to 0.01
+      cycles = (double)(long)(cycles * 100 + 0.5) / 100.0;
+      printf("%s: latency %.2lf cycles\n", it.name, cycles);
+      info[base_name].latency.insert(cycles);
+    }
+  }
+
+  FILE *fp = fopen("measure.csv", "w");
+  if (fp == nullptr) {
+    perror("measure.csv");
+    exit(EXIT_FAILURE);
+  }
+  // NOTE: the last column holds 1.0 / cycles as stored above
+  fprintf(fp, "name,latency,throughput(cpi)\n");
+  for (const auto &pair : info) {
+    // join all distinct latencies of the instruction with '/'
+    std::string latency;
+    const InstrInfo &entry = pair.second;
+    for (double lat : entry.latency) {
+      char buffer[32];
+      snprintf(buffer, sizeof(buffer), "%.2lf", lat);
+      if (!latency.empty()) {
+        latency += "/";
+      }
+      latency += buffer;
+    }
+    fprintf(fp, "%s,%s,%.2lf\n", pair.first.c_str(), latency.c_str(),
+            entry.throughput);
+  }
+  // close before reporting success so buffered rows are flushed to the file
+  if (fclose(fp) != 0) {
+    perror("measure.csv");
+    exit(EXIT_FAILURE);
+  }
+  printf("Result written to measure.csv\n");
+  return 0;
+}
diff --git a/code/measure.csv b/code/measure.csv
new file mode 100644
index 00000000..02dc5330
--- /dev/null
+++ b/code/measure.csv
@@ -0,0 +1,9 @@
+name,latency,throughput(cpi)
+vfmadd_s,5.00,2.00
+vfmsub_s,5.00,2.00
+vfnmadd_s,5.00,2.00
+vfnmsub_s,5.00,2.00
+xvfmadd_s,5.00,2.00
+xvfmsub_s,5.00,2.00
+xvfnmadd_s,5.00,2.00
+xvfnmsub_s,5.00,2.00
diff --git a/code/measure.h b/code/measure.h
new file mode 100644
index 00000000..35d2f61f
--- /dev/null
+++ b/code/measure.h
@@ -0,0 +1,33 @@
+INSTR_TEST(unit, "add.w $r12, $r1, $r12\n", "r12")
+INSTR_TEST(vfmadd_s_1, "vfmadd.s $vr0, $vr0, $vr2, $vr3\n")
+INSTR_TEST(vfmadd_s_2, "vfmadd.s $vr0, $vr1, $vr0, $vr3\n")
+INSTR_TEST(vfmadd_s_3, "vfmadd.s $vr0, $vr1, $vr2, $vr0\n")
+INSTR_TEST(vfmadd_s_tp, "vfmadd.s $vr0, $vr1, $vr2, $vr3\n")
+INSTR_TEST(vfmsub_s_1, "vfmsub.s $vr0, $vr0, $vr2, $vr3\n")
+INSTR_TEST(vfmsub_s_2, "vfmsub.s $vr0, $vr1, $vr0, $vr3\n")
+INSTR_TEST(vfmsub_s_3, "vfmsub.s $vr0, $vr1, $vr2, $vr0\n")
+INSTR_TEST(vfmsub_s_tp, "vfmsub.s $vr0, $vr1, $vr2, $vr3\n")
+INSTR_TEST(vfnmadd_s_1, "vfnmadd.s $vr0, $vr0, $vr2, $vr3\n")
+INSTR_TEST(vfnmadd_s_2, "vfnmadd.s $vr0, $vr1, $vr0, $vr3\n")
+INSTR_TEST(vfnmadd_s_3, "vfnmadd.s $vr0, $vr1, $vr2, $vr0\n")
+INSTR_TEST(vfnmadd_s_tp, "vfnmadd.s $vr0, $vr1, $vr2, $vr3\n")
+INSTR_TEST(vfnmsub_s_1, "vfnmsub.s $vr0, $vr0, $vr2, $vr3\n")
+INSTR_TEST(vfnmsub_s_2, "vfnmsub.s $vr0, $vr1, $vr0, $vr3\n")
+INSTR_TEST(vfnmsub_s_3, "vfnmsub.s $vr0, $vr1, $vr2, $vr0\n")
+INSTR_TEST(vfnmsub_s_tp, "vfnmsub.s $vr0, $vr1, $vr2, $vr3\n")
+INSTR_TEST(xvfmadd_s_1, "xvfmadd.s $xr0, $xr0, $xr2, $xr3\n")
+INSTR_TEST(xvfmadd_s_2, "xvfmadd.s $xr0, $xr1, $xr0, $xr3\n")
+INSTR_TEST(xvfmadd_s_3, "xvfmadd.s $xr0, $xr1, $xr2, $xr0\n")
+INSTR_TEST(xvfmadd_s_tp, "xvfmadd.s $xr0, $xr1, $xr2, $xr3\n")
+INSTR_TEST(xvfmsub_s_1, "xvfmsub.s $xr0, $xr0, $xr2, $xr3\n")
+INSTR_TEST(xvfmsub_s_2, "xvfmsub.s $xr0, $xr1, $xr0, $xr3\n")
+INSTR_TEST(xvfmsub_s_3, "xvfmsub.s $xr0, $xr1, $xr2, $xr0\n")
+INSTR_TEST(xvfmsub_s_tp, "xvfmsub.s $xr0, $xr1, $xr2, $xr3\n")
+INSTR_TEST(xvfnmadd_s_1, "xvfnmadd.s $xr0, $xr0, $xr2, $xr3\n")
+INSTR_TEST(xvfnmadd_s_2, "xvfnmadd.s $xr0, $xr1, $xr0, $xr3\n")
+INSTR_TEST(xvfnmadd_s_3, "xvfnmadd.s $xr0, $xr1, $xr2, $xr0\n")
+INSTR_TEST(xvfnmadd_s_tp, "xvfnmadd.s $xr0, $xr1, $xr2, $xr3\n")
+INSTR_TEST(xvfnmsub_s_1, "xvfnmsub.s $xr0, $xr0, $xr2, $xr3\n")
+INSTR_TEST(xvfnmsub_s_2, "xvfnmsub.s $xr0, $xr1, $xr0, $xr3\n")
+INSTR_TEST(xvfnmsub_s_3, "xvfnmsub.s $xr0, $xr1, $xr2, $xr0\n")
+INSTR_TEST(xvfnmsub_s_tp, "xvfnmsub.s $xr0, $xr1, $xr2, $xr3\n")
diff --git a/code/vsetallnez_b.cpp b/code/vsetallnez_b.cpp
index 3a4b5807..45b0a28c 100644
--- a/code/vsetallnez_b.cpp
+++ b/code/vsetallnez_b.cpp
@@ -4,15 +4,15 @@ void test() {
   __m128i a = {0, 0};
   assert(!__lsx_bnz_b(a));
 
-  a = __m128i {1, 0};
+  a = __m128i{1, 0};
   assert(!__lsx_bnz_b(a));
 
-  a = __m128i {0, 1};
+  a = __m128i{0, 1};
   assert(!__lsx_bnz_b(a));
 
-  a = __m128i {0x1111111111111111, 0x1111111111111111};
+  a = __m128i{0x1111111111111111, 0x1111111111111111};
   assert(__lsx_bnz_b(a));
 
-  a = __m128i {0x1111111111111100, 0x1111111111111111};
+  a = __m128i{0x1111111111111100, 0x1111111111111111};
   assert(!__lsx_bnz_b(a));
 }
\ No newline at end of file
diff --git a/code/vsetallnez_d.cpp b/code/vsetallnez_d.cpp
index 17af4b4b..82f623f7 100644
--- a/code/vsetallnez_d.cpp
+++ b/code/vsetallnez_d.cpp
@@ -4,15 +4,15 @@ void test() {
   __m128i a = {0, 0};
   assert(!__lsx_bnz_d(a));
 
-  a = __m128i {1, 0};
+  a = __m128i{1, 0};
   assert(!__lsx_bnz_d(a));
 
-  a = __m128i {0, 1};
+  a = __m128i{0, 1};
   assert(!__lsx_bnz_d(a));
 
-  a = __m128i {0x1111111111111111, 0x1111111111111111};
+  a = __m128i{0x1111111111111111, 0x1111111111111111};
   assert(__lsx_bnz_d(a));
 
-  a = __m128i {0x0, 0x1111111111111111};
+  a = __m128i{0x0, 0x1111111111111111};
   assert(!__lsx_bnz_d(a));
 }
\ No newline at end of file
diff --git a/code/vsetallnez_h.cpp b/code/vsetallnez_h.cpp
index d1524475..68cd64c1 100644
--- a/code/vsetallnez_h.cpp
+++ b/code/vsetallnez_h.cpp
@@ -4,15 +4,15 @@ void test() {
   __m128i a = {0, 0};
   assert(!__lsx_bnz_h(a));
 
-  a = __m128i {1, 0};
+  a = __m128i{1, 0};
   assert(!__lsx_bnz_h(a));
 
-  a = __m128i {0, 1};
+  a = __m128i{0, 1};
   assert(!__lsx_bnz_h(a));
 
-  a = __m128i {0x1111111111111111, 0x1111111111111111};
+  a = __m128i{0x1111111111111111, 0x1111111111111111};
   assert(__lsx_bnz_h(a));
 
-  a = __m128i {0x1111111111110000, 0x1111111111111111};
+  a = __m128i{0x1111111111110000, 0x1111111111111111};
   assert(!__lsx_bnz_h(a));
 }
\ No newline at end of file
diff --git a/code/vsetallnez_w.cpp b/code/vsetallnez_w.cpp
index 80e38c62..54fed88a 100644
--- a/code/vsetallnez_w.cpp
+++ b/code/vsetallnez_w.cpp
@@ -4,15 +4,15 @@ void test() {
   __m128i a = {0, 0};
   assert(!__lsx_bnz_w(a));
 
-  a = __m128i {1, 0};
+  a = __m128i{1, 0};
   assert(!__lsx_bnz_w(a));
 
-  a = __m128i {0, 1};
+  a = __m128i{0, 1};
   assert(!__lsx_bnz_w(a));
 
-  a = __m128i {0x1111111111111111, 0x1111111111111111};
+  a = __m128i{0x1111111111111111, 0x1111111111111111};
   assert(__lsx_bnz_w(a));
 
-  a = __m128i {0x1111111100000000, 0x1111111111111111};
+  a = __m128i{0x1111111100000000, 0x1111111111111111};
   assert(!__lsx_bnz_w(a));
 }
\ No newline at end of file
diff --git a/code/vsetanyeqz_b.cpp b/code/vsetanyeqz_b.cpp
index df2dd2c1..1523ae3a 100644
--- a/code/vsetanyeqz_b.cpp
+++ b/code/vsetanyeqz_b.cpp
@@ -4,15 +4,15 @@ void test() {
   __m128i a = {0, 0};
   assert(__lsx_bz_b(a));
 
-  a = __m128i {1, 0};
+  a = __m128i{1, 0};
   assert(__lsx_bz_b(a));
 
-  a = __m128i {0, 1};
+  a = __m128i{0, 1};
   assert(__lsx_bz_b(a));
 
-  a = __m128i {0x1111111111111111, 0x1111111111111111};
+  a = __m128i{0x1111111111111111, 0x1111111111111111};
   assert(!__lsx_bz_b(a));
 
-  a = __m128i {0x1111111111111100, 0x1111111111111111};
+  a = __m128i{0x1111111111111100, 0x1111111111111111};
   assert(__lsx_bz_b(a));
 }
\ No newline at end of file
diff --git a/code/vsetanyeqz_d.cpp b/code/vsetanyeqz_d.cpp
index efdf3c37..3b1eaae7 100644
--- a/code/vsetanyeqz_d.cpp
+++ b/code/vsetanyeqz_d.cpp
@@ -4,15 +4,15 @@ void test() {
   __m128i a = {0, 0};
   assert(__lsx_bz_d(a));
 
-  a = __m128i {1, 0};
+  a = __m128i{1, 0};
   assert(__lsx_bz_d(a));
 
-  a = __m128i {0, 1};
+  a = __m128i{0, 1};
   assert(__lsx_bz_d(a));
 
-  a = __m128i {0x1111111111111111, 0x1111111111111111};
+  a = __m128i{0x1111111111111111, 0x1111111111111111};
   assert(!__lsx_bz_d(a));
 
-  a = __m128i {0x0, 0x1111111111111111};
+  a = __m128i{0x0, 0x1111111111111111};
   assert(__lsx_bz_d(a));
 }
\ No newline at end of file
diff --git a/code/vsetanyeqz_h.cpp b/code/vsetanyeqz_h.cpp
index 883e00d7..5e772e64 100644
--- a/code/vsetanyeqz_h.cpp
+++ b/code/vsetanyeqz_h.cpp
@@ -4,15 +4,15 @@ void test() {
   __m128i a = {0, 0};
   assert(__lsx_bz_h(a));
 
-  a = __m128i {1, 0};
+  a = __m128i{1, 0};
   assert(__lsx_bz_h(a));
 
-  a = __m128i {0, 1};
+  a = __m128i{0, 1};
   assert(__lsx_bz_h(a));
 
-  a = __m128i {0x1111111111111111, 0x1111111111111111};
+  a = __m128i{0x1111111111111111, 0x1111111111111111};
   assert(!__lsx_bz_h(a));
 
-  a = __m128i {0x1111111111110000, 0x1111111111111111};
+  a = __m128i{0x1111111111110000, 0x1111111111111111};
   assert(__lsx_bz_h(a));
 }
\ No newline at end of file
diff --git a/code/vsetanyeqz_w.cpp b/code/vsetanyeqz_w.cpp
index 444f1e8e..de1d136b 100644
--- a/code/vsetanyeqz_w.cpp
+++ b/code/vsetanyeqz_w.cpp
@@ -4,15 +4,15 @@ void test() {
   __m128i a = {0, 0};
   assert(__lsx_bz_w(a));
 
-  a = __m128i {1, 0};
+  a = __m128i{1, 0};
   assert(__lsx_bz_w(a));
 
-  a = __m128i {0, 1};
+  a = __m128i{0, 1};
   assert(__lsx_bz_b(a));
 
-  a = __m128i {0x1111111111111111, 0x1111111111111111};
+  a = __m128i{0x1111111111111111, 0x1111111111111111};
   assert(!__lsx_bz_w(a));
 
-  a = __m128i {0x1111111100000000, 0x1111111111111111};
+  a = __m128i{0x1111111100000000, 0x1111111111111111};
   assert(__lsx_bz_w(a));
 }
\ No newline at end of file
diff --git a/code/vseteqz_v.cpp b/code/vseteqz_v.cpp
index 43d25367..f6ec8b88 100644
--- a/code/vseteqz_v.cpp
+++ b/code/vseteqz_v.cpp
@@ -4,9 +4,9 @@ void test() {
   __m128i a = {0, 0};
   assert(__lsx_bz_v(a));
 
-  a = __m128i {1, 0};
+  a = __m128i{1, 0};
   assert(!__lsx_bz_v(a));
 
-  a = __m128i {0, 1};
+  a = __m128i{0, 1};
   assert(!__lsx_bz_v(a));
 }
\ No newline at end of file
diff --git a/code/vsetnez_v.cpp b/code/vsetnez_v.cpp
index 6aa0bdc0..3eb88701 100644
--- a/code/vsetnez_v.cpp
+++ b/code/vsetnez_v.cpp
@@ -4,9 +4,9 @@ void test() {
   __m128i a = {0, 0};
   assert(!__lsx_bnz_v(a));
 
-  a = __m128i {1, 0};
+  a = __m128i{1, 0};
   assert(__lsx_bnz_v(a));
 
-  a = __m128i {0, 1};
+  a = __m128i{0, 1};
   assert(__lsx_bnz_v(a));
 }
\ No newline at end of file
diff --git a/gen_measure.py b/gen_measure.py
new file mode 100644
index 00000000..98f2a22c
--- /dev/null
+++ b/gen_measure.py
@@ -0,0 +1,50 @@
+import glob
+
+# find known insts from binutils-gdb
+with open('code/measure.h', 'w') as f:
+    # measure unit time
+    print('INSTR_TEST(unit, "add.w $r12, $r1, $r12\\n", "r12")', file=f)
+
+    for line in open("../binutils-gdb/opcodes/loongarch-opc.c"):
+        line = line.strip()
+        if line.startswith("{") and line.endswith("},"):
+            parts = line.split(",")
+            if '"' in parts[2]:
+                name = parts[2].split('"')[1]
+                fmt = line.split('"')[3]
+
+                if name.startswith("v") or name.startswith("xv"):
+                    print("Processing", name, fmt)
+
+                    # latency test
+                    # at least one op depends on vd
+                    fmt_parts = fmt.split(",")
+                    for depend_i in range(1, len(fmt_parts)):
+                        ops = []
+                        for i, part in enumerate(fmt_parts):
+                            if part.startswith("v"):
+                                if i == depend_i:
+                                    # vd is always vr0
+                                    ops.append("$vr0")
+                                else:
+                                    ops.append(f"$vr{i}")
+                            elif part.startswith("x"):
+                                if i == depend_i:
+                                    # xd is always xr0
+                                    ops.append("$xr0")
+                                else:
+                                    ops.append(f"$xr{i}")
+                        print(f'INSTR_TEST({name.replace(".", "_")}_{depend_i}, "{name} {", ".join(ops)}\\n")', file=f)
+
+                    # throughput test
+                    # no dependency
+                    ops = []
+                    for i, part in enumerate(fmt_parts):
+                        if part.startswith("v"):
+                            ops.append(f"$vr{i}")
+                        elif part.startswith("x"):
+                            ops.append(f"$xr{i}")
+                    print(f'INSTR_TEST({name.replace(".", "_")}_tp, "{name} {", ".join(ops)}\\n")', file=f)
+
+                if "fcmp" in name:
+                    break
\ No newline at end of file