Skip to content

Commit

Permalink
Initialize workload of xsave.
Browse files Browse the repository at this point in the history
Implement a series of micro workloads to experience the functionality of
xsave/xrestore in user space. These micro benchmarks are valuable for
performance tuning and debugging purposes.

Signed-off-by: Dongcheng Yan <[email protected]>
Signed-off-by: Yi Sun <[email protected]>
  • Loading branch information
ysun committed Nov 16, 2023
1 parent a4e7884 commit 4f4a8e1
Show file tree
Hide file tree
Showing 28 changed files with 2,436 additions and 0 deletions.
4 changes: 4 additions & 0 deletions workload-xsave/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
yogini
*.o
*.S
result
51 changes: 51 additions & 0 deletions workload-xsave/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# Copyright (c) 2022 Intel Corporation.
# Len Brown <[email protected]>
# Yi Sun <[email protected]>
# Dongcheng Yan <[email protected]>

CC = $(CROSS_COMPILE)gcc
BUILD_OUTPUT := $(CURDIR)
PREFIX ?= /usr
DESTDIR ?=
DAY := $(shell date +%Y.%m.%d)
DATE_TIME := $(shell date +%Y%m%d_%H%M%S)

ifeq ("$(origin O)", "command line")
BUILD_OUTPUT := $(O)
endif

PROGS= yogini
SRC= yogini.c work_AMX.c work_AVX.c work_AVX2.c work_AVX512.c work_VNNI512.c work_VNNI.c work_DOTPROD.c work_PAUSE.c work_TPAUSE.c work_UMWAIT.c work_RDTSC.c work_SSE.c work_MEM.c work_memcpy.c run_common.c worker_init4.c worker_init_dotprod.c worker_init_amx.c yogini.h
OBJS= yogini.o work_AMX.o work_AVX.o work_AVX2.o work_AVX512.o work_VNNI512.o $(GCC11_OBJS) work_DOTPROD.o work_PAUSE.o work_TPAUSE.o work_UMWAIT.o work_RDTSC.o work_SSE.o work_MEM.o work_memcpy.o
ASMS= work_AMX.S work_AVX.S work_AVX2.S work_AVX512.S work_VNNI512.S work_VNNI.S work_DOTPROD.S work_PAUSE.S work_TPAUSE.S work_UMWAIT.S work_RDTSC.S work_SSE.S work_MEM.S work_memcpy.S
GCC11_OBJS=work_VNNI.o

yogini : $(OBJS) $(ASMS)
ifeq ($(DEBUG), 1)
override CFLAGS += -march=sapphirerapids -g
else
override CFLAGS += -march=native
endif
override CFLAGS += -D_FORTIFY_SOURCE=2
override CFLAGS += -Wall
override CFLAGS += -O3
override CFLAGS += -mtune=skylake-avx512
#override CFLAGS += -mtune=alderlake
override CFLAGS += -mavx512bf16

LDFLAGS += -lm
LDFLAGS += -lpthread

%: %.c %.h
@mkdir -p $(BUILD_OUTPUT)
$(CC) $(CFLAGS) $(OBJS) -o $(BUILD_OUTPUT)/$@ $(LDFLAGS)

%.S: %.c
@mkdir -p $(BUILD_OUTPUT)
$(CC) $(CFLAGS) -S $^ -o $(BUILD_OUTPUT)/$@

.PHONY : clean
clean :
@rm -f $(BUILD_OUTPUT)/yogini $(OBJS) $(ASMS)
60 changes: 60 additions & 0 deletions workload-xsave/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Intel SIMD Instruction Microbenchmark Suite

The Intel SIMD Instruction Microbenchmark Suite is a collection of microbenchmarks designed to evaluate and debug various Intel SIMD instructions. The suite includes a total of 15 benchmarks covering instructions such as AMX, AVX, AVX2, AVX512, VNNI, VNNI512, SSE, RDTSC, PAUSE, and more. These benchmarks are primarily used for debugging xsave/xrestor related issues.

## Features
* Provides a comprehensive set of microbenchmarks for Intel SIMD instructions.
* Covers a range of instruction sets, including AMX, AVX, AVX2, AVX512, VNNI, VNNI512, SSE, RDTSC, PAUSE, and more.
* Assists in debugging xsave/xrestor related problems.
* Lightweight and easy to use.
* Open-source and freely available.

## Getting Started
These instructions will help you get a copy of the Intel SIMD Instruction Microbenchmark Suite up and running on your local machine for development and testing purposes.

### Prerequisites
* C/C++ compiler with support for the desired SIMD instruction sets.
* Make installed.

### Installation
Clone the repository to your local machine:
```
git clone https://github.com/intel-sandbox/workload-xsave.git
```
Enter the project directory:
```
cd workload-xsave
```
Build the benchmarks using CMake:
```
make
```
Or, if your platform is not support all SIMD feature, you can try
```
DEBUG=1 make
```
Run the benchmarks:
```
usage: ./yogini [OPTIONS]
./yogini runs some simple micro workloads
-w, --workload [AVX,AVX2,AVX512,AMX,MEM,memcpy,SSE,VNNI,VNNI512,UMWAIT,TPAUSE,PAUSE,RDTSC]
-r, --repeat, each instance needs to be run
-b, --break_reason, [yield/sleep/trap/signal/futex]Available workloads: AMX memcpy MEM SSE RDTSC PAUSE DOTPROD VNNI512 AVX512_BF16 AVX2 AVX
```

## Contributing
Contributions are welcome and encouraged! If you would like to contribute to the Intel SIMD Instruction Microbenchmark Suite, please follow these steps:

* License
This project is licensed under the GPL2.0.
* Architecture/Workflow
![2023-07-07_15-03](https://github.com/intel-sandbox/workload-xsave/assets/1448148/1485edf4-91f5-4f46-ab34-a4eed9ff77f5)

## Acknowledgments
Mention any contributors or references you have used for this project.
Contact
For any questions or suggestions regarding the Intel SIMD Instruction Microbenchmark Suite, please contact [Yi Sun <[email protected]>].

Replace [your-email-address] with an appropriate contact email or remove this section if not necessary.
41 changes: 41 additions & 0 deletions workload-xsave/run_common.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* generic worker code for re-use via inclusion
*
* Copyright (c) 2022 Intel Corporation.
* Len Brown <[email protected]>
* Yi Sun <[email protected]>
* Dongcheng Yan <[email protected]>
*
*/

#include <time.h>
#include <stdint.h>
#include "yogini.h"

void thread_break(int32_t reason, uint32_t thread_idx);
/*
* run()
* complete work in chunks of "data_entries" operations
* between each chunk, check the time
* return when requested operations complete, or out of time
*
* return operationds completed
*/
static unsigned long long run(struct work_instance *wi)
{
unsigned int count;
unsigned int operations = wi->repeat;
struct thread_data *dp = wi->worker_data;

if (operations == 0)
operations = (~0U);

for (count = 0; count < operations; count++) {
thread_break(wi->break_reason, wi->thread_number);
/* each invocation of work() does "entries" operations */
work(dp);
}
unsigned long long tsc_now = rdtsc();
return tsc_now;
}
73 changes: 73 additions & 0 deletions workload-xsave/start_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# Copyright (c) 2022 Intel Corporation.
# Yi Sun <[email protected]>
# Dongcheng Yan <[email protected]>

#!/bin/bash
option=""
result=""
repeat=""
num=$#

# store test results to a specified folder
script_dir=$(dirname "$0")
result_dir="$script_dir/result"

if [ ! -d "$result_dir" ]; then
mkdir "$result_dir"
fi

# mode1: test workloads in specific break_reason
test_single () {
echo "trace-cmd record -e x86_fpu -F ./yogini -b $break_reason -r $repeat $option"
trace-cmd record -e x86_fpu -F ./yogini -b $break_reason -r $repeat $option
if [ $? -ne 0 ]; then
echo "Failed to execute trace-cmd record."
exit 1
fi
trace-cmd report > "${result_dir}/${result}${break_reason}"
}

# mode2: test workloads in all break_reason
test_all () {
for ((i=1; i<=5; i++))
do
break_reason=$i
test_single
done
}

usage() {
GREEN='\033[0;32m'
NC='\033[0m'
echo -e "${GREEN}Usage:${NC}"
echo "first param: break_reason"
echo "second param: repeat_cnt"
echo "remain params: workload"
echo -e "${GREEN}Example:${NC}"
echo "$0 2 100 AVX MEM VNNI"
echo "$0 -1 100 AVX MEM VNNI"
echo -e "${GREEN}You can test all break_reason if first param is -1.${NC} "
}

# main func
if [ $num -lt 3 ]; then
usage
else
break_reason=$1
repeat=$2
for ((i=3; i<=$#; i++))
do
# add workload cmd
arg="${!i}"
option+="-w $arg "
result+="${arg}_"
done

if [ "$1" == "-1" ]; then
test_all "$@"
else
test_single "$@"
fi
fi
119 changes: 119 additions & 0 deletions workload-xsave/work_AMX.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* work_AVX.c - offer the "AVX" workload to yogini
*
* Copyright (c) 2022 Intel Corporation.
* Yi Sun <[email protected]>
* Dongcheng Yan <[email protected]>
*
*/

#define _GNU_SOURCE
#include <stdio.h> /* printf(3) */
#include <stdlib.h> /* random(3) */
#include <sched.h> /* CPU_SET */
#include <sched.h>
#include <xmmintrin.h>
#include <immintrin.h>
#include "yogini.h"
#include <err.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

// #pragma GCC target("amx")
#define WORKLOAD_NAME "AMX"
#define XFEATURE_XTILEDATA 18
#define ARCH_REQ_XCOMP_PERM 0x1023
#define ROW_NUM 16
#define COL_NUM 64
#define BYTES_PER_VECTOR 1024
#define load_tile_reg(tmm_num, tile, stride) \
do { \
asm volatile("tileloadd\t(%0,%1,1), %%tmm" #tmm_num \
: : "r" ((void *)(tile)->buf), "r" ((long)stride) : "memory") \
} while (0)

struct __tile_config {
uint8_t palette_id;
uint8_t start_row;
uint8_t reserved_0[14];
uint16_t colsb[8];
uint16_t reserved_1[8];
uint8_t rows[8];
uint8_t reserved_2[8];
};

union __union_tile_config {
struct __tile_config s;
uint8_t a[64];
};

struct thread_data {
int8_t *input_x;
int8_t *input_y;
int32_t *output;
int data_entries;
};

static void init_tile_config(union __union_tile_config *dst, uint8_t rows, uint8_t colsb)
{
int32_t i;

dst->s.palette_id = 1;
dst->s.start_row = 0;

for (i = 0; i < 14; i++)
dst->s.reserved_0[i] = 0;

for (i = 0; i < 8; i++) {
dst->s.reserved_1[i] = 0;
dst->s.reserved_2[i] = 0;
}

for (i = 0; i < 8; i++) {
dst->s.colsb[i] = colsb;
dst->s.rows[i] = rows;
}

_tile_loadconfig(dst->a);
}

/* Set_tiledata_use() - Invoke syscall to set ARCH_SET_STATE_USE */
static void set_tiledata_use(void)
{
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA))
printf("Fail to do XFEATURE_XTILEDATA\n");
}

static void work(void *arg)
{
int i;
struct thread_data *dp = (struct thread_data *)arg;
int entries = dp->data_entries;

set_tiledata_use();

for (i = 0; i < entries; ++i) {
_tile_loadd(2, dp->input_x + BYTES_PER_VECTOR * i, COL_NUM);
_tile_loadd(3, dp->input_y + BYTES_PER_VECTOR * i, COL_NUM);
_tile_loadd(1, dp->output + BYTES_PER_VECTOR / 4 * i, COL_NUM);
_tile_dpbssd(1, 2, 3);
_tile_stored(1, dp->output + BYTES_PER_VECTOR / 4 * i, COL_NUM);
}
}

#include "worker_init_amx.c"
#include "run_common.c"

static struct workload w = {
"AMX",
init,
cleanup,
run,
};

struct workload *register_AMX(void)
{
return &w;
}
Loading

0 comments on commit 4f4a8e1

Please sign in to comment.