-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement a series of micro workloads to experience the functionality of xsave/xrestore in user space. These micro benchmarks are valuable for performance tuning and debugging purposes. Signed-off-by: Dongcheng Yan <[email protected]> Signed-off-by: Yi Sun <[email protected]>
- Loading branch information
Showing
28 changed files
with
2,436 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
yogini | ||
*.o | ||
*.S | ||
result |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# SPDX-License-Identifier: GPL-2.0-only | ||
# | ||
# Copyright (c) 2022 Intel Corporation. | ||
# Len Brown <[email protected]> | ||
# Yi Sun <[email protected]> | ||
# Dongcheng Yan <[email protected]> | ||
|
||
CC = $(CROSS_COMPILE)gcc | ||
BUILD_OUTPUT := $(CURDIR) | ||
PREFIX ?= /usr | ||
DESTDIR ?= | ||
DAY := $(shell date +%Y.%m.%d) | ||
DATE_TIME := $(shell date +%Y%m%d_%H%M%S) | ||
|
||
ifeq ("$(origin O)", "command line") | ||
BUILD_OUTPUT := $(O) | ||
endif | ||
|
||
PROGS= yogini | ||
SRC= yogini.c work_AMX.c work_AVX.c work_AVX2.c work_AVX512.c work_VNNI512.c work_VNNI.c work_DOTPROD.c work_PAUSE.c work_TPAUSE.c work_UMWAIT.c work_RDTSC.c work_SSE.c work_MEM.c work_memcpy.c run_common.c worker_init4.c worker_init_dotprod.c worker_init_amx.c yogini.h | ||
OBJS= yogini.o work_AMX.o work_AVX.o work_AVX2.o work_AVX512.o work_VNNI512.o $(GCC11_OBJS) work_DOTPROD.o work_PAUSE.o work_TPAUSE.o work_UMWAIT.o work_RDTSC.o work_SSE.o work_MEM.o work_memcpy.o | ||
ASMS= work_AMX.S work_AVX.S work_AVX2.S work_AVX512.S work_VNNI512.S work_VNNI.S work_DOTPROD.S work_PAUSE.S work_TPAUSE.S work_UMWAIT.S work_RDTSC.S work_SSE.S work_MEM.S work_memcpy.S | ||
GCC11_OBJS=work_VNNI.o | ||
|
||
yogini : $(OBJS) $(ASMS) | ||
ifeq ($(DEBUG), 1) | ||
override CFLAGS += -march=sapphirerapids -g | ||
else | ||
override CFLAGS += -march=native | ||
endif | ||
override CFLAGS += -D_FORTIFY_SOURCE=2 | ||
override CFLAGS += -Wall | ||
override CFLAGS += -O3 | ||
override CFLAGS += -mtune=skylake-avx512 | ||
#override CFLAGS += -mtune=alderlake | ||
override CFLAGS += -mavx512bf16 | ||
|
||
LDFLAGS += -lm | ||
LDFLAGS += -lpthread | ||
|
||
%: %.c %.h | ||
@mkdir -p $(BUILD_OUTPUT) | ||
$(CC) $(CFLAGS) $(OBJS) -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) | ||
|
||
%.S: %.c | ||
@mkdir -p $(BUILD_OUTPUT) | ||
$(CC) $(CFLAGS) -S $^ -o $(BUILD_OUTPUT)/$@ | ||
|
||
.PHONY : clean | ||
clean : | ||
@rm -f $(BUILD_OUTPUT)/yogini $(OBJS) $(ASMS) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# Intel SIMD Instruction Microbenchmark Suite | ||
|
||
The Intel SIMD Instruction Microbenchmark Suite is a collection of microbenchmarks designed to evaluate and debug various Intel SIMD instructions. The suite includes a total of 15 benchmarks covering instructions such as AMX, AVX, AVX2, AVX512, VNNI, VNNI512, SSE, RDTSC, PAUSE, and more. These benchmarks are primarily used for debugging xsave/xrestor related issues. | ||
|
||
## Features | ||
* Provides a comprehensive set of microbenchmarks for Intel SIMD instructions. | ||
* Covers a range of instruction sets, including AMX, AVX, AVX2, AVX512, VNNI, VNNI512, SSE, RDTSC, PAUSE, and more. | ||
* Assists in debugging xsave/xrestor related problems. | ||
* Lightweight and easy to use. | ||
* Open-source and freely available. | ||
|
||
## Getting Started | ||
These instructions will help you get a copy of the Intel SIMD Instruction Microbenchmark Suite up and running on your local machine for development and testing purposes. | ||
|
||
### Prerequisites | ||
* C/C++ compiler with support for the desired SIMD instruction sets. | ||
* Make installed. | ||
|
||
### Installation | ||
Clone the repository to your local machine: | ||
``` | ||
git clone https://github.com/intel-sandbox/workload-xsave.git | ||
``` | ||
Enter the project directory: | ||
``` | ||
cd workload-xsave | ||
``` | ||
Build the benchmarks using CMake: | ||
``` | ||
make | ||
``` | ||
Or, if your platform is not support all SIMD feature, you can try | ||
``` | ||
DEBUG=1 make | ||
``` | ||
Run the benchmarks: | ||
``` | ||
usage: ./yogini [OPTIONS] | ||
./yogini runs some simple micro workloads | ||
-w, --workload [AVX,AVX2,AVX512,AMX,MEM,memcpy,SSE,VNNI,VNNI512,UMWAIT,TPAUSE,PAUSE,RDTSC] | ||
-r, --repeat, each instance needs to be run | ||
-b, --break_reason, [yield/sleep/trap/signal/futex]Available workloads: AMX memcpy MEM SSE RDTSC PAUSE DOTPROD VNNI512 AVX512_BF16 AVX2 AVX | ||
``` | ||
|
||
## Contributing | ||
Contributions are welcome and encouraged! If you would like to contribute to the Intel SIMD Instruction Microbenchmark Suite, please follow these steps: | ||
|
||
* License | ||
This project is licensed under the GPL2.0. | ||
* Architecture/Workflow | ||
![2023-07-07_15-03](https://github.com/intel-sandbox/workload-xsave/assets/1448148/1485edf4-91f5-4f46-ab34-a4eed9ff77f5) | ||
|
||
## Acknowledgments | ||
Mention any contributors or references you have used for this project. | ||
Contact | ||
For any questions or suggestions regarding the Intel SIMD Instruction Microbenchmark Suite, please contact [Yi Sun <[email protected]>]. | ||
|
||
Replace [your-email-address] with an appropriate contact email or remove this section if not necessary. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
// SPDX-License-Identifier: GPL-2.0-only | ||
/* | ||
* generic worker code for re-use via inclusion | ||
* | ||
* Copyright (c) 2022 Intel Corporation. | ||
* Len Brown <[email protected]> | ||
* Yi Sun <[email protected]> | ||
* Dongcheng Yan <[email protected]> | ||
* | ||
*/ | ||
|
||
#include <time.h> | ||
#include <stdint.h> | ||
#include "yogini.h" | ||
|
||
void thread_break(int32_t reason, uint32_t thread_idx); | ||
/* | ||
* run() | ||
* complete work in chunks of "data_entries" operations | ||
* between each chunk, check the time | ||
* return when requested operations complete, or out of time | ||
* | ||
* return operationds completed | ||
*/ | ||
static unsigned long long run(struct work_instance *wi) | ||
{ | ||
unsigned int count; | ||
unsigned int operations = wi->repeat; | ||
struct thread_data *dp = wi->worker_data; | ||
|
||
if (operations == 0) | ||
operations = (~0U); | ||
|
||
for (count = 0; count < operations; count++) { | ||
thread_break(wi->break_reason, wi->thread_number); | ||
/* each invocation of work() does "entries" operations */ | ||
work(dp); | ||
} | ||
unsigned long long tsc_now = rdtsc(); | ||
return tsc_now; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
# SPDX-License-Identifier: GPL-2.0-only | ||
# | ||
# Copyright (c) 2022 Intel Corporation. | ||
# Yi Sun <[email protected]> | ||
# Dongcheng Yan <[email protected]> | ||
|
||
#!/bin/bash | ||
option="" | ||
result="" | ||
repeat="" | ||
num=$# | ||
|
||
# store test results to a specified folder | ||
script_dir=$(dirname "$0") | ||
result_dir="$script_dir/result" | ||
|
||
if [ ! -d "$result_dir" ]; then | ||
mkdir "$result_dir" | ||
fi | ||
|
||
# mode1: test workloads in specific break_reason | ||
test_single () { | ||
echo "trace-cmd record -e x86_fpu -F ./yogini -b $break_reason -r $repeat $option" | ||
trace-cmd record -e x86_fpu -F ./yogini -b $break_reason -r $repeat $option | ||
if [ $? -ne 0 ]; then | ||
echo "Failed to execute trace-cmd record." | ||
exit 1 | ||
fi | ||
trace-cmd report > "${result_dir}/${result}${break_reason}" | ||
} | ||
|
||
# mode2: test workloads in all break_reason | ||
test_all () { | ||
for ((i=1; i<=5; i++)) | ||
do | ||
break_reason=$i | ||
test_single | ||
done | ||
} | ||
|
||
usage() { | ||
GREEN='\033[0;32m' | ||
NC='\033[0m' | ||
echo -e "${GREEN}Usage:${NC}" | ||
echo "first param: break_reason" | ||
echo "second param: repeat_cnt" | ||
echo "remain params: workload" | ||
echo -e "${GREEN}Example:${NC}" | ||
echo "$0 2 100 AVX MEM VNNI" | ||
echo "$0 -1 100 AVX MEM VNNI" | ||
echo -e "${GREEN}You can test all break_reason if first param is -1.${NC} " | ||
} | ||
|
||
# main func | ||
if [ $num -lt 3 ]; then | ||
usage | ||
else | ||
break_reason=$1 | ||
repeat=$2 | ||
for ((i=3; i<=$#; i++)) | ||
do | ||
# add workload cmd | ||
arg="${!i}" | ||
option+="-w $arg " | ||
result+="${arg}_" | ||
done | ||
|
||
if [ "$1" == "-1" ]; then | ||
test_all "$@" | ||
else | ||
test_single "$@" | ||
fi | ||
fi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
// SPDX-License-Identifier: GPL-2.0-only | ||
/* | ||
* work_AVX.c - offer the "AVX" workload to yogini | ||
* | ||
* Copyright (c) 2022 Intel Corporation. | ||
* Yi Sun <[email protected]> | ||
* Dongcheng Yan <[email protected]> | ||
* | ||
*/ | ||
|
||
#define _GNU_SOURCE | ||
#include <stdio.h> /* printf(3) */ | ||
#include <stdlib.h> /* random(3) */ | ||
#include <sched.h> /* CPU_SET */ | ||
#include <sched.h> | ||
#include <xmmintrin.h> | ||
#include <immintrin.h> | ||
#include "yogini.h" | ||
#include <err.h> | ||
#include <stdint.h> | ||
#include <sys/syscall.h> | ||
#include <unistd.h> | ||
|
||
// #pragma GCC target("amx") | ||
#define WORKLOAD_NAME "AMX" | ||
#define XFEATURE_XTILEDATA 18 | ||
#define ARCH_REQ_XCOMP_PERM 0x1023 | ||
#define ROW_NUM 16 | ||
#define COL_NUM 64 | ||
#define BYTES_PER_VECTOR 1024 | ||
#define load_tile_reg(tmm_num, tile, stride) \ | ||
do { \ | ||
asm volatile("tileloadd\t(%0,%1,1), %%tmm" #tmm_num \ | ||
: : "r" ((void *)(tile)->buf), "r" ((long)stride) : "memory") \ | ||
} while (0) | ||
|
||
struct __tile_config { | ||
uint8_t palette_id; | ||
uint8_t start_row; | ||
uint8_t reserved_0[14]; | ||
uint16_t colsb[8]; | ||
uint16_t reserved_1[8]; | ||
uint8_t rows[8]; | ||
uint8_t reserved_2[8]; | ||
}; | ||
|
||
union __union_tile_config { | ||
struct __tile_config s; | ||
uint8_t a[64]; | ||
}; | ||
|
||
struct thread_data { | ||
int8_t *input_x; | ||
int8_t *input_y; | ||
int32_t *output; | ||
int data_entries; | ||
}; | ||
|
||
static void init_tile_config(union __union_tile_config *dst, uint8_t rows, uint8_t colsb) | ||
{ | ||
int32_t i; | ||
|
||
dst->s.palette_id = 1; | ||
dst->s.start_row = 0; | ||
|
||
for (i = 0; i < 14; i++) | ||
dst->s.reserved_0[i] = 0; | ||
|
||
for (i = 0; i < 8; i++) { | ||
dst->s.reserved_1[i] = 0; | ||
dst->s.reserved_2[i] = 0; | ||
} | ||
|
||
for (i = 0; i < 8; i++) { | ||
dst->s.colsb[i] = colsb; | ||
dst->s.rows[i] = rows; | ||
} | ||
|
||
_tile_loadconfig(dst->a); | ||
} | ||
|
||
/* Set_tiledata_use() - Invoke syscall to set ARCH_SET_STATE_USE */ | ||
static void set_tiledata_use(void) | ||
{ | ||
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) | ||
printf("Fail to do XFEATURE_XTILEDATA\n"); | ||
} | ||
|
||
static void work(void *arg) | ||
{ | ||
int i; | ||
struct thread_data *dp = (struct thread_data *)arg; | ||
int entries = dp->data_entries; | ||
|
||
set_tiledata_use(); | ||
|
||
for (i = 0; i < entries; ++i) { | ||
_tile_loadd(2, dp->input_x + BYTES_PER_VECTOR * i, COL_NUM); | ||
_tile_loadd(3, dp->input_y + BYTES_PER_VECTOR * i, COL_NUM); | ||
_tile_loadd(1, dp->output + BYTES_PER_VECTOR / 4 * i, COL_NUM); | ||
_tile_dpbssd(1, 2, 3); | ||
_tile_stored(1, dp->output + BYTES_PER_VECTOR / 4 * i, COL_NUM); | ||
} | ||
} | ||
|
||
#include "worker_init_amx.c" | ||
#include "run_common.c" | ||
|
||
static struct workload w = { | ||
"AMX", | ||
init, | ||
cleanup, | ||
run, | ||
}; | ||
|
||
struct workload *register_AMX(void) | ||
{ | ||
return &w; | ||
} |
Oops, something went wrong.