Skip to content

Commit

Permalink
Add initial latency and throughput measurement
Browse files Browse the repository at this point in the history
  • Loading branch information
jiegec committed Dec 14, 2023
1 parent 9168287 commit 443a076
Show file tree
Hide file tree
Showing 16 changed files with 399 additions and 36 deletions.
1 change: 1 addition & 0 deletions code/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
!*.sh
!*.py
!*.html
!*.csv
6 changes: 6 additions & 0 deletions code/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,10 @@ clean:
# Generic rule: build a test binary from its same-named .cpp/.h pair plus the
# shared common.h, using $(CFLAGS) and passing -mlsx -mlasx to the compiler.
%: %.cpp %.h common.h
$(CXX) $(CFLAGS) $< -mlsx -mlasx -o $@

# The measurement tool has its own explicit rule: it depends only on
# measure.cpp and measure.h and is compiled with -O2 rather than $(CFLAGS).
measure: measure.cpp measure.h
$(CXX) -O2 $< -o $@

# Build the measurement tool if needed, then run it with the -p option.
run-measure: measure
./measure -p

.SUFFIXES:
264 changes: 264 additions & 0 deletions code/measure.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
#include <assert.h>
#include <linux/perf_event.h>
#include <map>
#include <sched.h>
#include <set>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <sys/syscall.h>
#include <sys/time.h>
#include <unistd.h>
#include <utility>
#include <vector>

// Token-repetition helpers (technique borrowed from lmbench's lat_mem_rd).
// NAME(X) expands to X written out that many times in a row, so adjacent
// string literals concatenate and statements simply repeat.
#define FIVE(X) X X X X X
#define TEN(X) FIVE(X X)
#define FIFTY(X) FIVE(TEN(X))
#define HUNDRED(X) FIFTY(X X)
#define THOUSAND(X) TEN(HUNDRED(X))

// Time source helpers.
// get_time() returns the current time of day in nanoseconds.
uint64_t get_time();
// setup_time_or_cycles() tries to open the PMU cycle counter so that cycles
// are preferred over wall-clock time.
void setup_time_or_cycles();
// get_time_or_cycles() returns the cycle counter when it has been opened,
// otherwise the time in nanoseconds; only differences between two readings
// taken in the same mode are meaningful.
uint64_t get_time_or_cycles();

// Pins the process to CPU 0.
void bind_to_core();

// perf_event based counters: each setup_*() opens the counter and each
// perf_read_*() returns its current value.
void setup_perf_cycles();
uint64_t perf_read_cycles();
void setup_perf_instructions();
uint64_t perf_read_instructions();
void setup_perf_llc_misses();
uint64_t perf_read_llc_misses();
void setup_perf_llc_loads();
uint64_t perf_read_llc_loads();

// Number of loop iterations per test (each iteration executes 1000 copies of
// the instruction under test); can be overridden with the -n option.
int N = 100000;

// First pass over the X-macro list in measure.h: every
// INSTR_TEST(NAME, INST, clobbers...) entry becomes a function test_NAME(n)
// that loops n times over one inline-asm statement holding 1000 consecutive
// copies of INST (INST must be a string literal so the copies concatenate).
// Any extra macro arguments are forwarded unchanged as the asm clobber list.
#define INSTR_TEST(NAME, INST, ...) \
void test_##NAME(int n) { \
for (int i = 0; i < n; i++) { \
asm volatile(".align 4\n" THOUSAND(INST) : : : __VA_ARGS__); \
} \
}

#include "measure.h"

#undef INSTR_TEST

// Descriptor for one generated benchmark.
struct InstrTest {
  // Signature shared by all generated test_<NAME> functions: the argument is
  // the number of loop iterations to run.
  using TestFn = void (*)(int);

  const char *name; // test identifier (the NAME macro argument, stringified)
  const char *inst; // stringified INST macro argument
  TestFn test;      // function that executes the benchmark
};

// Second pass over the X-macro list in measure.h: every INSTR_TEST entry
// becomes one InstrTest initializer (followed by a comma), so including the
// header inside the braces below fills the table with all tests in list order.
// The clobber arguments are not needed here and are ignored.
#define INSTR_TEST(NAME, INST, ...) \
InstrTest{.name = #NAME, .inst = #INST, .test = test_##NAME},

// Table of all benchmarks declared in measure.h (including "unit").
std::vector<InstrTest> tests = {
#include "measure.h"
};

#undef INSTR_TEST

// Aggregated results for one instruction (all test variants sharing a base
// name).
struct InstrInfo {
  // Distinct latency values observed across the latency variants, kept
  // sorted in ascending order by the set.
  std::set<double> latency;
  // Measured throughput; initialized to zero so an instruction without a
  // throughput test never exposes an indeterminate value.
  double throughput = 0.0;
};

// Returns the current time of day in nanoseconds. The value is derived from
// gettimeofday(), so it advances in steps of whole microseconds.
uint64_t get_time() {
  struct timeval now = {};
  gettimeofday(&now, nullptr);

  const uint64_t seconds_as_ns = static_cast<uint64_t>(now.tv_sec) * 1000000000;
  const uint64_t micros_as_ns = static_cast<uint64_t>(now.tv_usec) * 1000;
  return seconds_as_ns + micros_as_ns;
}

// Reads one 64-bit counter value from the descriptor `fd`.
//
// Returns the value read. If the read fails or yields fewer bytes than a full
// counter, a diagnostic is printed to stderr and the process aborts; this
// check is performed unconditionally (it does not rely on assert, so it is
// still active when NDEBUG is defined).
uint64_t perf_read_common(int fd) {
  uint64_t counter = 0;
  // read() returns ssize_t; keep the full width instead of truncating to int.
  const ssize_t res = read(fd, &counter, sizeof(counter));
  if (res != static_cast<ssize_t>(sizeof(counter))) {
    if (res < 0) {
      perror("read perf counter");
    } else {
      fprintf(stderr, "read perf counter: short read (%zd of %zu bytes)\n", res,
              sizeof(counter));
    }
    abort();
  }
  return counter;
}

// Opens a perf event counter via the perf_event_open system call.
//
// type:   value for perf_event_attr.type (e.g. PERF_TYPE_HARDWARE)
// config: value for perf_event_attr.config selecting the event
//
// Returns the file descriptor of the opened counter. On failure the error is
// reported on stderr together with a hint, and the process exits with
// status 1, so this function never returns a negative descriptor.
int setup_perf_common(uint32_t type, uint64_t config) {
  // The attribute block is only needed for the duration of the syscall, so it
  // lives on the stack instead of being heap-allocated (and leaked).
  struct perf_event_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.type = type;
  attr.size = sizeof(attr);
  attr.config = config;
  attr.disabled = 0;
  attr.pinned = 1;
  attr.inherit = 1;
  attr.exclude_kernel = 1;

  // Arguments after the attribute block: pid = 0, cpu = -1, group_fd = -1,
  // flags = 0.
  const int fd =
      static_cast<int>(syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0));
  if (fd < 0) {
    perror("perf_event_open");
    fprintf(stderr, "try: sudo sysctl kernel.perf_event_paranoid=2\n");
    exit(1);
  }
  return fd;
}

// Descriptor of the CPU cycle counter; -1 while the counter is not open.
int perf_fd_cycles = -1;

// Opens the hardware CPU cycle counter and announces it on stdout once a
// valid descriptor has been obtained.
void setup_perf_cycles() {
  const int fd = setup_perf_common(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES);
  perf_fd_cycles = fd;
  if (fd < 0) {
    return;
  }
  printf("Using PMU to count cycles\n");
}

// Returns the current value of the CPU cycle counter.
uint64_t perf_read_cycles() {
  const uint64_t cycles = perf_read_common(perf_fd_cycles);
  return cycles;
}

// Descriptor of the retired-instruction counter; -1 while it is not open.
int perf_fd_instructions = -1;

// Opens the hardware instruction counter.
void setup_perf_instructions() {
  const int fd =
      setup_perf_common(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS);
  perf_fd_instructions = fd;
}

// Returns the current value of the instruction counter.
uint64_t perf_read_instructions() {
  const uint64_t instructions = perf_read_common(perf_fd_instructions);
  return instructions;
}

// Descriptors of the last-level-cache counters; -1 while they are not open.
int perf_fd_llc_misses = -1;
int perf_fd_llc_loads = -1;

// Opens the counter for last-level-cache read misses. The cache event config
// is assembled as: cache id | (operation << 8) | (result << 16).
void setup_perf_llc_misses() {
  const uint64_t config = (PERF_COUNT_HW_CACHE_LL) |
                          (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                          (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
  perf_fd_llc_misses = setup_perf_common(PERF_TYPE_HW_CACHE, config);
}

// Opens the counter for last-level-cache read accesses, using the same config
// layout as the miss counter.
void setup_perf_llc_loads() {
  const uint64_t config = (PERF_COUNT_HW_CACHE_LL) |
                          (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                          (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16);
  perf_fd_llc_loads = setup_perf_common(PERF_TYPE_HW_CACHE, config);
}

// Returns the current value of the last-level-cache read miss counter.
uint64_t perf_read_llc_misses() {
  const uint64_t misses = perf_read_common(perf_fd_llc_misses);
  return misses;
}

// Returns the current value of the last-level-cache read access counter.
uint64_t perf_read_llc_loads() {
  const uint64_t loads = perf_read_common(perf_fd_llc_loads);
  return loads;
}

// Prepares the preferred measurement source by opening the cycle counter.
void setup_time_or_cycles() {
  setup_perf_cycles();
}

// Returns the cycle counter value when the counter has been opened (its
// descriptor is non-negative); otherwise falls back to the time in
// nanoseconds.
uint64_t get_time_or_cycles() {
  const bool have_cycle_counter = perf_fd_cycles >= 0;
  return have_cycle_counter ? perf_read_cycles() : get_time();
}

// Restricts the caller's CPU affinity to CPU 0 so that consecutive counter
// readings are taken on the same core.
//
// Pinning is best effort: on success a confirmation is printed to stdout; on
// failure the reason is reported on stderr and execution continues unpinned,
// so the user can tell that the results may be less reliable.
void bind_to_core() {
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(0, &set);
  if (sched_setaffinity(0, sizeof(set), &set) != 0) {
    perror("sched_setaffinity");
    return;
  }
  printf("Pinned to cpu 0\n");
}

int main(int argc, char *argv[]) {
bool perf = false;

int opt;
while ((opt = getopt(argc, argv, "n:p")) != -1) {
switch (opt) {
case 'n':
sscanf(optarg, "%d", &N);
break;
case 'p':
perf = true;
break;
default:
fprintf(stderr, "Usage: %s [-p]\n", argv[0]);
exit(EXIT_FAILURE);
}
}

bind_to_core();
if (perf) {
setup_time_or_cycles();
}

// calibrate unit cycle time
uint64_t begin = get_time_or_cycles();
test_unit(N);
uint64_t unit_elapsed = get_time_or_cycles() - begin;

std::map<std::string, InstrInfo> info;

for (auto it : tests) {
std::string name = it.name;
if (name == "unit") {
continue;
}

begin = get_time_or_cycles();
it.test(N);
uint64_t elapsed = get_time_or_cycles() - begin;
double cycles = (double)elapsed / unit_elapsed;

std::string base_name;
size_t tp_index = name.find("_tp");
if (tp_index != std::string::npos) {
base_name = name.substr(0, tp_index);
printf("%s: throughput 1/%.2lf=%.2lf instructions\n", it.name, cycles,
1.0 / cycles);
info[base_name].throughput = 1.0 / cycles;
} else {
base_name = name;

size_t last_underscore_index = name.find_last_of("_");
if (last_underscore_index != std::string::npos) {
char *end = NULL;
const char *start = &name.c_str()[last_underscore_index + 1];
strtol(start, &end, 10);

if (start != end) {
// strip suffix _2 etc
base_name = name.substr(0, last_underscore_index);
}
}

// round to 0.01
cycles = (double)(long)(cycles * 100 + 0.5) / 100.0;
printf("%s: latency %.2lf cycles\n", it.name, cycles);
info[base_name].latency.insert(cycles);
}
}

FILE *fp = fopen("measure.csv", "w");
assert(fp);
fprintf(fp, "name,latency,throughput(cpi)\n");
for (auto pair : info) {
std::string latency;
auto entry = pair.second;
for (auto lat : entry.latency) {
char buffer[32];
std::sprintf(buffer, "%.2lf", lat);
if (!latency.empty()) {
latency += "/";
}
latency += buffer;
}
fprintf(fp, "%s,%s,%.2lf\n", pair.first.c_str(), latency.c_str(),
entry.throughput);
}
printf("Result written to measure.csv\n");
}
9 changes: 9 additions & 0 deletions code/measure.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
name,latency,throughput(cpi)
vfmadd_s,5.00,2.00
vfmsub_s,5.00,2.00
vfnmadd_s,5.00,2.00
vfnmsub_s,5.00,2.00
xvfmadd_s,5.00,2.00
xvfmsub_s,5.00,2.00
xvfnmadd_s,5.00,2.00
xvfnmsub_s,5.00,2.00
33 changes: 33 additions & 0 deletions code/measure.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// X-macro list of benchmarks. measure.cpp includes this file twice with
// different INSTR_TEST definitions: once to generate a test_<NAME>() function
// per entry and once to build the table of all tests.
//
// INSTR_TEST(NAME, INST, clobbers...)
//   NAME      test identifier. measure.cpp reports names containing "_tp" as
//             throughput and every other name as latency, and groups results
//             by the name with the "_tp" or trailing "_<number>" suffix
//             removed.
//   INST      string literal with the assembly text of one instruction; it is
//             repeated back-to-back inside the generated function.
//   clobbers  optional asm clobber list.
//
// "unit" is the calibration entry: its elapsed time is the divisor for every
// other measurement. Its destination register is also one of its sources.
//
// For each fused multiply-add instruction below, the _1/_2/_3 variants reuse
// the destination register as the first/second/third source operand, while
// the _tp variant uses a destination that differs from all three sources.
INSTR_TEST(unit, "add.w $r12, $r1, $r12\n", "r12")
INSTR_TEST(vfmadd_s_1, "vfmadd.s $vr0, $vr0, $vr2, $vr3\n")
INSTR_TEST(vfmadd_s_2, "vfmadd.s $vr0, $vr1, $vr0, $vr3\n")
INSTR_TEST(vfmadd_s_3, "vfmadd.s $vr0, $vr1, $vr2, $vr0\n")
INSTR_TEST(vfmadd_s_tp, "vfmadd.s $vr0, $vr1, $vr2, $vr3\n")
INSTR_TEST(vfmsub_s_1, "vfmsub.s $vr0, $vr0, $vr2, $vr3\n")
INSTR_TEST(vfmsub_s_2, "vfmsub.s $vr0, $vr1, $vr0, $vr3\n")
INSTR_TEST(vfmsub_s_3, "vfmsub.s $vr0, $vr1, $vr2, $vr0\n")
INSTR_TEST(vfmsub_s_tp, "vfmsub.s $vr0, $vr1, $vr2, $vr3\n")
INSTR_TEST(vfnmadd_s_1, "vfnmadd.s $vr0, $vr0, $vr2, $vr3\n")
INSTR_TEST(vfnmadd_s_2, "vfnmadd.s $vr0, $vr1, $vr0, $vr3\n")
INSTR_TEST(vfnmadd_s_3, "vfnmadd.s $vr0, $vr1, $vr2, $vr0\n")
INSTR_TEST(vfnmadd_s_tp, "vfnmadd.s $vr0, $vr1, $vr2, $vr3\n")
INSTR_TEST(vfnmsub_s_1, "vfnmsub.s $vr0, $vr0, $vr2, $vr3\n")
INSTR_TEST(vfnmsub_s_2, "vfnmsub.s $vr0, $vr1, $vr0, $vr3\n")
INSTR_TEST(vfnmsub_s_3, "vfnmsub.s $vr0, $vr1, $vr2, $vr0\n")
INSTR_TEST(vfnmsub_s_tp, "vfnmsub.s $vr0, $vr1, $vr2, $vr3\n")
INSTR_TEST(xvfmadd_s_1, "xvfmadd.s $xr0, $xr0, $xr2, $xr3\n")
INSTR_TEST(xvfmadd_s_2, "xvfmadd.s $xr0, $xr1, $xr0, $xr3\n")
INSTR_TEST(xvfmadd_s_3, "xvfmadd.s $xr0, $xr1, $xr2, $xr0\n")
INSTR_TEST(xvfmadd_s_tp, "xvfmadd.s $xr0, $xr1, $xr2, $xr3\n")
INSTR_TEST(xvfmsub_s_1, "xvfmsub.s $xr0, $xr0, $xr2, $xr3\n")
INSTR_TEST(xvfmsub_s_2, "xvfmsub.s $xr0, $xr1, $xr0, $xr3\n")
INSTR_TEST(xvfmsub_s_3, "xvfmsub.s $xr0, $xr1, $xr2, $xr0\n")
INSTR_TEST(xvfmsub_s_tp, "xvfmsub.s $xr0, $xr1, $xr2, $xr3\n")
INSTR_TEST(xvfnmadd_s_1, "xvfnmadd.s $xr0, $xr0, $xr2, $xr3\n")
INSTR_TEST(xvfnmadd_s_2, "xvfnmadd.s $xr0, $xr1, $xr0, $xr3\n")
INSTR_TEST(xvfnmadd_s_3, "xvfnmadd.s $xr0, $xr1, $xr2, $xr0\n")
INSTR_TEST(xvfnmadd_s_tp, "xvfnmadd.s $xr0, $xr1, $xr2, $xr3\n")
INSTR_TEST(xvfnmsub_s_1, "xvfnmsub.s $xr0, $xr0, $xr2, $xr3\n")
INSTR_TEST(xvfnmsub_s_2, "xvfnmsub.s $xr0, $xr1, $xr0, $xr3\n")
INSTR_TEST(xvfnmsub_s_3, "xvfnmsub.s $xr0, $xr1, $xr2, $xr0\n")
INSTR_TEST(xvfnmsub_s_tp, "xvfnmsub.s $xr0, $xr1, $xr2, $xr3\n")
8 changes: 4 additions & 4 deletions code/vsetallnez_b.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ void test() {
__m128i a = {0, 0};
assert(!__lsx_bnz_b(a));

a = __m128i {1, 0};
a = __m128i{1, 0};
assert(!__lsx_bnz_b(a));

a = __m128i {0, 1};
a = __m128i{0, 1};
assert(!__lsx_bnz_b(a));

a = __m128i {0x1111111111111111, 0x1111111111111111};
a = __m128i{0x1111111111111111, 0x1111111111111111};
assert(__lsx_bnz_b(a));

a = __m128i {0x1111111111111100, 0x1111111111111111};
a = __m128i{0x1111111111111100, 0x1111111111111111};
assert(!__lsx_bnz_b(a));
}
8 changes: 4 additions & 4 deletions code/vsetallnez_d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ void test() {
__m128i a = {0, 0};
assert(!__lsx_bnz_d(a));

a = __m128i {1, 0};
a = __m128i{1, 0};
assert(!__lsx_bnz_d(a));

a = __m128i {0, 1};
a = __m128i{0, 1};
assert(!__lsx_bnz_d(a));

a = __m128i {0x1111111111111111, 0x1111111111111111};
a = __m128i{0x1111111111111111, 0x1111111111111111};
assert(__lsx_bnz_d(a));

a = __m128i {0x0, 0x1111111111111111};
a = __m128i{0x0, 0x1111111111111111};
assert(!__lsx_bnz_d(a));
}
8 changes: 4 additions & 4 deletions code/vsetallnez_h.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ void test() {
__m128i a = {0, 0};
assert(!__lsx_bnz_h(a));

a = __m128i {1, 0};
a = __m128i{1, 0};
assert(!__lsx_bnz_h(a));

a = __m128i {0, 1};
a = __m128i{0, 1};
assert(!__lsx_bnz_h(a));

a = __m128i {0x1111111111111111, 0x1111111111111111};
a = __m128i{0x1111111111111111, 0x1111111111111111};
assert(__lsx_bnz_h(a));

a = __m128i {0x1111111111110000, 0x1111111111111111};
a = __m128i{0x1111111111110000, 0x1111111111111111};
assert(!__lsx_bnz_h(a));
}
Loading

0 comments on commit 443a076

Please sign in to comment.