From ba8c79545555a4efe28198b2cd6b674c8972d843 Mon Sep 17 00:00:00 2001
From: Yi Sun
Date: Fri, 10 Nov 2023 16:27:28 +0800
Subject: [PATCH] Introduce xsave workload micro-benchmarks.

Implement a series of micro workloads to exercise xsave/xrstor
functionality in user space. These micro benchmarks are valuable for
performance tuning and debugging purposes.

Signed-off-by: Dongcheng Yan
Signed-off-by: Yi Sun
---
 workload-xsave/.gitignore            |   4 +
 workload-xsave/Makefile              |  51 +++
 workload-xsave/README.md             |  60 +++
 workload-xsave/run_common.c          |  41 ++
 workload-xsave/start_test.sh         |  73 ++++
 workload-xsave/work_AMX.c            | 119 ++++++
 workload-xsave/work_AVX.c            |  67 ++++
 workload-xsave/work_AVX2.c           |  68 ++++
 workload-xsave/work_AVX512.c         |  87 +++++
 workload-xsave/work_DOTPROD.c        |  77 ++++
 workload-xsave/work_GETCPU.c         |  63 ++++
 workload-xsave/work_MEM.c            | 130 +++++++
 workload-xsave/work_PAUSE.c          |  39 ++
 workload-xsave/work_RDTSC.c          |  37 ++
 workload-xsave/work_SSE.c            |  66 ++++
 workload-xsave/work_TPAUSE.c         |  58 +++
 workload-xsave/work_UMWAIT.c         |  53 +++
 workload-xsave/work_VNNI.c           |  83 +++++
 workload-xsave/work_VNNI512.c        |  80 ++++
 workload-xsave/work_memcpy.c         | 123 ++++++
 workload-xsave/worker_init4.c        |  70 ++++
 workload-xsave/worker_init_amx.c     |  77 ++++
 workload-xsave/worker_init_avx.c     |  61 +++
 workload-xsave/worker_init_avx2.c    |  61 +++
 workload-xsave/worker_init_dotprod.c |  78 ++++
 workload-xsave/worker_init_sse.c     |  61 +++
 workload-xsave/yogini.c              | 537 +++++++++++++++++++++++++++
 workload-xsave/yogini.h              | 112 ++++++
 28 files changed, 2436 insertions(+)
 create mode 100644 workload-xsave/.gitignore
 create mode 100644 workload-xsave/Makefile
 create mode 100644 workload-xsave/README.md
 create mode 100644 workload-xsave/run_common.c
 create mode 100755 workload-xsave/start_test.sh
 create mode 100644 workload-xsave/work_AMX.c
 create mode 100644 workload-xsave/work_AVX.c
 create mode 100644 workload-xsave/work_AVX2.c
 create mode 100644 workload-xsave/work_AVX512.c
 create mode 100644 workload-xsave/work_DOTPROD.c
 create mode 100644 workload-xsave/work_GETCPU.c
 create mode 100644 workload-xsave/work_MEM.c
 create mode 100644 workload-xsave/work_PAUSE.c
 create mode 100644 workload-xsave/work_RDTSC.c
 create mode 100644 workload-xsave/work_SSE.c
 create mode 100644 workload-xsave/work_TPAUSE.c
 create mode 100644 workload-xsave/work_UMWAIT.c
 create mode 100644 workload-xsave/work_VNNI.c
 create mode 100644 workload-xsave/work_VNNI512.c
 create mode 100644 workload-xsave/work_memcpy.c
 create mode 100644 workload-xsave/worker_init4.c
 create mode 100644 workload-xsave/worker_init_amx.c
 create mode 100644 workload-xsave/worker_init_avx.c
 create mode 100644 workload-xsave/worker_init_avx2.c
 create mode 100644 workload-xsave/worker_init_dotprod.c
 create mode 100644 workload-xsave/worker_init_sse.c
 create mode 100644 workload-xsave/yogini.c
 create mode 100644 workload-xsave/yogini.h

diff --git a/workload-xsave/.gitignore b/workload-xsave/.gitignore
new file mode 100644
index 0000000..144a88f
--- /dev/null
+++ b/workload-xsave/.gitignore
@@ -0,0 +1,4 @@
+yogini
+*.o
+*.S
+result
diff --git a/workload-xsave/Makefile b/workload-xsave/Makefile
new file mode 100644
index 0000000..8a1acfa
--- /dev/null
+++ b/workload-xsave/Makefile
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Copyright (c) 2022 Intel Corporation.
+# Len Brown
+# Yi Sun
+# Dongcheng Yan
+
+CC		= $(CROSS_COMPILE)gcc
+BUILD_OUTPUT	:= $(CURDIR)
+PREFIX		?= /usr
+DESTDIR		?=
+DAY		:= $(shell date +%Y.%m.%d)
+DATE_TIME	:= $(shell date +%Y%m%d_%H%M%S)
+
+ifeq ("$(origin O)", "command line")
+	BUILD_OUTPUT := $(O)
+endif
+
+PROGS= yogini
+SRC= yogini.c work_AMX.c work_AVX.c work_AVX2.c work_AVX512.c work_VNNI512.c work_VNNI.c work_DOTPROD.c work_PAUSE.c work_TPAUSE.c work_UMWAIT.c work_RDTSC.c work_SSE.c work_MEM.c work_memcpy.c run_common.c worker_init4.c worker_init_dotprod.c worker_init_amx.c yogini.h
+OBJS= yogini.o work_AMX.o work_AVX.o work_AVX2.o work_AVX512.o work_VNNI512.o $(GCC11_OBJS) work_DOTPROD.o work_PAUSE.o work_TPAUSE.o work_UMWAIT.o work_RDTSC.o work_SSE.o work_MEM.o work_memcpy.o
+ASMS= work_AMX.S work_AVX.S work_AVX2.S work_AVX512.S work_VNNI512.S work_VNNI.S work_DOTPROD.S work_PAUSE.S work_TPAUSE.S work_UMWAIT.S work_RDTSC.S work_SSE.S work_MEM.S work_memcpy.S
+GCC11_OBJS=work_VNNI.o
+
+yogini : $(OBJS) $(ASMS)
+ifeq ($(DEBUG), 1)
+override CFLAGS += -march=sapphirerapids -g
+else
+override CFLAGS += -march=native
+endif
+override CFLAGS += -D_FORTIFY_SOURCE=2
+override CFLAGS += -Wall
+override CFLAGS += -O3
+override CFLAGS += -mtune=skylake-avx512
+#override CFLAGS += -mtune=alderlake
+override CFLAGS += -mavx512bf16
+
+LDFLAGS += -lm
+LDFLAGS += -lpthread
+
+%: %.c %.h
+	@mkdir -p $(BUILD_OUTPUT)
+	$(CC) $(CFLAGS) $(OBJS) -o $(BUILD_OUTPUT)/$@ $(LDFLAGS)
+
+%.S: %.c
+	@mkdir -p $(BUILD_OUTPUT)
+	$(CC) $(CFLAGS) -S $^ -o $(BUILD_OUTPUT)/$@
+
+.PHONY : clean
+clean :
+	@rm -f $(BUILD_OUTPUT)/yogini $(OBJS) $(ASMS)
diff --git a/workload-xsave/README.md b/workload-xsave/README.md
new file mode 100644
index 0000000..6c167d6
--- /dev/null
+++ b/workload-xsave/README.md
@@ -0,0 +1,60 @@
+# Intel SIMD Instruction Microbenchmark Suite
+
+The Intel SIMD Instruction Microbenchmark Suite is a collection of microbenchmarks designed to evaluate and debug various Intel SIMD instructions. The suite includes a total of 15 benchmarks covering instructions such as AMX, AVX, AVX2, AVX512, VNNI, VNNI512, SSE, RDTSC, PAUSE, and more. These benchmarks are primarily used for debugging xsave/xrstor related issues.
+
+## Features
+* Provides a comprehensive set of microbenchmarks for Intel SIMD instructions.
+* Covers a range of instruction sets, including AMX, AVX, AVX2, AVX512, VNNI, VNNI512, SSE, RDTSC, PAUSE, and more.
+* Assists in debugging xsave/xrstor related problems.
+* Lightweight and easy to use.
+* Open-source and freely available.
+
+## Getting Started
+These instructions will help you get a copy of the Intel SIMD Instruction Microbenchmark Suite up and running on your local machine for development and testing purposes.
+
+### Prerequisites
+* C/C++ compiler with support for the desired SIMD instruction sets.
+* Make installed.
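+
+For example, on a Debian-style system (an assumption; package names vary by
+distribution), the toolchain plus trace-cmd, which start_test.sh uses to
+capture x86_fpu trace events, can be installed with:
+```
+sudo apt install build-essential trace-cmd
+```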
+
+### Installation
+Clone the repository to your local machine:
+```
+git clone https://github.com/intel-sandbox/workload-xsave.git
+```
+Enter the project directory:
+```
+cd workload-xsave
+```
+Build the benchmarks using Make:
+```
+make
+```
+Or, if your platform does not support all SIMD features, you can try:
+```
+DEBUG=1 make
+```
+Run the benchmarks:
+```
+usage: ./yogini [OPTIONS]
+
+./yogini runs some simple micro workloads
+ -w, --workload [AVX,AVX2,AVX512,AMX,MEM,memcpy,SSE,VNNI,VNNI512,UMWAIT,TPAUSE,PAUSE,RDTSC]
+ -r, --repeat, each instance needs to be run
+ -b, --break_reason, [yield/sleep/trap/signal/futex]
+
+Available workloads: AMX memcpy MEM SSE RDTSC PAUSE DOTPROD VNNI512 AVX512_BF16 AVX2 AVX
+```
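+
+For example, to run the AVX and MEM workloads 100 times each, scheduling the
+workers out via a futex wait between iterations (a sample invocation; pick
+workloads your CPU and compiler actually support):
+```
+./yogini -w AVX -w MEM -r 100 -b futex
+```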
+
+## Contributing
+Contributions are welcome and encouraged! If you would like to contribute to the Intel SIMD Instruction Microbenchmark Suite, please open an issue or submit a pull request.
+
+## License
+This project is licensed under GPL-2.0.
+
+## Architecture/Workflow
+![2023-07-07_15-03](https://github.com/intel-sandbox/workload-xsave/assets/1448148/1485edf4-91f5-4f46-ab34-a4eed9ff77f5)
+
+## Contact
+For any questions or suggestions regarding the Intel SIMD Instruction Microbenchmark Suite, please contact [Yi Sun ].
diff --git a/workload-xsave/run_common.c b/workload-xsave/run_common.c
new file mode 100644
index 0000000..3730738
--- /dev/null
+++ b/workload-xsave/run_common.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * generic worker code for re-use via inclusion
+ *
+ * Copyright (c) 2022 Intel Corporation.
+ * Len Brown
+ * Yi Sun
+ * Dongcheng Yan
+ *
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include "yogini.h"
+
+void thread_break(int32_t reason, uint32_t thread_idx);
+/*
+ * run()
+ * complete work in chunks of "data_entries" operations,
+ * calling thread_break() between chunks
+ * return the TSC timestamp when the requested operations completed
+ */
+static unsigned long long run(struct work_instance *wi)
+{
+	unsigned int count;
+	unsigned int operations = wi->repeat;
+	struct thread_data *dp = wi->worker_data;
+
+	if (operations == 0)
+		operations = (~0U);
+
+	for (count = 0; count < operations; count++) {
+		thread_break(wi->break_reason, wi->thread_number);
+		/* each invocation of work() does "entries" operations */
+		work(dp);
+	}
+	unsigned long long tsc_now = rdtsc();
+
+	return tsc_now;
+}
diff --git a/workload-xsave/start_test.sh b/workload-xsave/start_test.sh
new file mode 100755
index 0000000..6544f5f
--- /dev/null
+++ b/workload-xsave/start_test.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Copyright (c) 2022 Intel Corporation.
+# Yi Sun
+# Dongcheng Yan
+
+option=""
+result=""
+repeat=""
+num=$#
+
+# store test results in a dedicated folder
+script_dir=$(dirname "$0")
+result_dir="$script_dir/result"
+
+if [ ! -d "$result_dir" ]; then
+	mkdir "$result_dir"
+fi
+
+# mode 1: test workloads with a specific break_reason
+test_single () {
+	echo "trace-cmd record -e x86_fpu -F ./yogini -b $break_reason -r $repeat $option"
+	trace-cmd record -e x86_fpu -F ./yogini -b $break_reason -r $repeat $option
+	if [ $? -ne 0 ]; then
+		echo "Failed to execute trace-cmd record."
+		exit 1
+	fi
+	trace-cmd report > "${result_dir}/${result}${break_reason}"
+}
+
+# mode 2: test workloads with every break_reason
+test_all () {
+	for ((i=1; i<=5; i++))
+	do
+		break_reason=$i
+		test_single
+	done
+}
+
+usage() {
+	GREEN='\033[0;32m'
+	NC='\033[0m'
+	echo -e "${GREEN}Usage:${NC}"
+	echo "first param: break_reason"
+	echo "second param: repeat_cnt"
+	echo "remaining params: workloads"
+	echo -e "${GREEN}Example:${NC}"
+	echo "$0 2 100 AVX MEM VNNI"
+	echo "$0 -1 100 AVX MEM VNNI"
+	echo -e "${GREEN}You can test all break_reasons if the first param is -1.${NC}"
+}
+
+# main func
+if [ $num -lt 3 ]; then
+	usage
+else
+	break_reason=$1
+	repeat=$2
+	for ((i=3; i<=$#; i++))
+	do
+		# append workload option
+		arg="${!i}"
+		option+="-w $arg "
+		result+="${arg}_"
+	done
+
+	if [ "$1" == "-1" ]; then
+		test_all
+	else
+		test_single
+	fi
+fi
diff --git a/workload-xsave/work_AMX.c b/workload-xsave/work_AMX.c
new file mode 100644
index 0000000..2afc722
--- /dev/null
+++ b/workload-xsave/work_AMX.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * work_AMX.c - offer the "AMX" workload to yogini
+ *
+ * Copyright (c) 2022 Intel Corporation.
+ * Yi Sun
+ * Dongcheng Yan
+ *
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>	/* printf(3) */
+#include <stdlib.h>	/* random(3) */
+#include <sched.h>	/* CPU_SET */
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include "yogini.h"
+#include <immintrin.h>
+#include <err.h>
+
+// #pragma GCC target("amx")
+#define WORKLOAD_NAME "AMX"
+#define XFEATURE_XTILEDATA 18
+#define ARCH_REQ_XCOMP_PERM 0x1023
+#define ROW_NUM 16
+#define COL_NUM 64
+#define BYTES_PER_VECTOR 1024
+#define load_tile_reg(tmm_num, tile, stride) \
+do { \
+	asm volatile("tileloadd\t(%0,%1,1), %%tmm" #tmm_num \
+		     : : "r" ((void *)(tile)->buf), "r" ((long)stride) : "memory"); \
+} while (0)
+
+struct __tile_config {
+	uint8_t palette_id;
+	uint8_t start_row;
+	uint8_t reserved_0[14];
+	uint16_t colsb[8];
+	uint16_t reserved_1[8];
+	uint8_t rows[8];
+	uint8_t reserved_2[8];
+};
+
+union __union_tile_config {
+	struct __tile_config s;
+	uint8_t a[64];
+};
+
+struct thread_data {
+	int8_t *input_x;
+	int8_t *input_y;
+	int32_t *output;
+	int data_entries;
+};
+
+static void init_tile_config(union __union_tile_config *dst, uint8_t rows, uint8_t colsb)
+{
+	int32_t i;
+
+	dst->s.palette_id = 1;
+	dst->s.start_row = 0;
+
+	for (i = 0; i < 14; i++)
+		dst->s.reserved_0[i] = 0;
+
+	for (i = 0; i < 8; i++) {
+		dst->s.reserved_1[i] = 0;
+		dst->s.reserved_2[i] = 0;
+	}
+
+	for (i = 0; i < 8; i++) {
+		dst->s.colsb[i] = colsb;
+		dst->s.rows[i] = rows;
+	}
+
+	_tile_loadconfig(dst->a);
+}
+
+/* set_tiledata_use() - invoke arch_prctl(ARCH_REQ_XCOMP_PERM) to request AMX tile-data use */
+static void set_tiledata_use(void)
+{
+	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA))
+		printf("Failed to enable XFEATURE_XTILEDATA\n");
+}
+
+static void work(void *arg)
+{
+	int i;
+	struct thread_data *dp = (struct thread_data *)arg;
+	int entries = dp->data_entries;
+
+	set_tiledata_use();
+
+	for (i = 0; i < entries; ++i) {
+		_tile_loadd(2, dp->input_x + BYTES_PER_VECTOR * i, COL_NUM);
+		_tile_loadd(3, dp->input_y + BYTES_PER_VECTOR * i, COL_NUM);
+		_tile_loadd(1, dp->output + BYTES_PER_VECTOR / 4 * i, COL_NUM);
+		_tile_dpbssd(1, 2, 3);
+		_tile_stored(1, dp->output + BYTES_PER_VECTOR / 4 * i, COL_NUM);
+	}
+}
+
+#include "worker_init_amx.c"
+#include "run_common.c"
+
+static struct workload w = {
+	"AMX",
+	init,
+	cleanup,
+	run,
+};
+
+struct workload *register_AMX(void)
+{
+	return &w;
+}
diff --git a/workload-xsave/work_AVX.c b/workload-xsave/work_AVX.c
new
file mode 100644 index 0000000..b06db08 --- /dev/null +++ b/workload-xsave/work_AVX.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "AVX" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include +#include "yogini.h" +#include +#include + +#pragma GCC target("avx") +#pragma GCC optimize("unroll-loops") +#define WORKLOAD_NAME "AVX" +#define BITS_PER_VECTOR 256 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) +struct thread_data { + float *input_x; + float *input_y; + float *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries / sizeof(double); + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + __m256 vx, vy, voutput; + + vx = _mm256_loadu_ps(dp->input_x + i * DWORD_PER_VECTOR); + vy = _mm256_loadu_ps(dp->input_y + i * DWORD_PER_VECTOR); + voutput = _mm256_add_ps(vx, vy); + _mm256_storeu_ps(dp->output + i * DWORD_PER_VECTOR, voutput); + } +} + +#include "run_common.c" +#include "worker_init_avx.c" + +static struct workload w = { + "AVX", + init, + cleanup, + run, +}; + +struct workload *register_AVX(void) +{ + return &w; +} diff --git a/workload-xsave/work_AVX2.c b/workload-xsave/work_AVX2.c new file mode 100644 index 0000000..4f237d5 --- /dev/null +++ b/workload-xsave/work_AVX2.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "AVX2" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. 
+ * Len Brown
+ * Yi Sun
+ * Dongcheng Yan
+ *
+ */
+#define _GNU_SOURCE
+#include <stdio.h>	/* printf(3) */
+#include <stdlib.h>	/* random(3) */
+#include <sched.h>	/* CPU_SET */
+#include "yogini.h"
+#include <immintrin.h>
+#include <stdint.h>
+
+#pragma GCC target("avx2,fma")
+#define WORKLOAD_NAME "AVX2"
+#define BITS_PER_VECTOR 256
+#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8)
+#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16)
+#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32)
+
+#pragma GCC optimize("unroll-loops")
+
+struct thread_data {
+	u_int8_t *input_x;
+	int8_t *input_y;
+	int16_t *output;
+	int data_entries;
+};
+
+static void work(void *arg)
+{
+	int i;
+	struct thread_data *dp = (struct thread_data *)arg;
+	int entries = dp->data_entries / sizeof(double);
+
+	for (i = 0; i < entries; ++i) {
+		if (clfulsh) {
+			clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR);
+			clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR);
+		}
+		__m256i vx, vy, voutput;
+
+		vx = _mm256_loadu_si256((__m256i *)(dp->input_x + i * BYTES_PER_VECTOR));
+		vy = _mm256_loadu_si256((__m256i *)(dp->input_y + i * BYTES_PER_VECTOR));
+		voutput = _mm256_maddubs_epi16(vx, vy);
+		_mm256_storeu_si256((__m256i *)(dp->output + i * WORDS_PER_VECTOR), voutput);
+	}
+}
+
+#include "worker_init_avx2.c"
+#include "run_common.c"
+
+static struct workload w = {
+	"AVX2",
+	init,
+	cleanup,
+	run,
+};
+
+struct workload *register_AVX2(void)
+{
+	return &w;
+}
diff --git a/workload-xsave/work_AVX512.c b/workload-xsave/work_AVX512.c
new file mode 100644
index 0000000..63542e4
--- /dev/null
+++ b/workload-xsave/work_AVX512.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * offer the "AVX512" workload to yogini
+ *
+ * Copyright (c) 2022 Intel Corporation.
+ * Len Brown
+ * Yi Sun
+ * Dongcheng Yan
+ *
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>	/* printf(3) */
+#include <stdlib.h>	/* random(3) */
+#include <sched.h>	/* CPU_SET */
+#include "yogini.h"
+#include <immintrin.h>
+#include <stdint.h>
+
+#if __GNUC__ >= 11
+
+#pragma GCC target("avx512bf16")
+#define WORKLOAD_NAME "AVX512"
+#define BITS_PER_VECTOR 512
+#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8)
+#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16)
+#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32)
+
+#pragma GCC optimize("unroll-loops")
+
+struct thread_data {
+	u_int8_t *input_x;
+	int8_t *input_y;
+	int32_t *input_z;
+	int16_t *input_ones;
+	int32_t *output;
+	int data_entries;
+};
+
+static void work(void *arg)
+{
+	int i;
+	struct thread_data *dp = (struct thread_data *)arg;
+	int entries = dp->data_entries / sizeof(double);
+
+	for (i = 0; i < entries; ++i) {
+		if (clfulsh) {
+			clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR);
+			clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR);
+			clflush_range((void *)dp->input_z + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR);
+		}
+
+		__m512 vx, vy, vz, voutput;
+
+		vx = _mm512_loadu_ps((float *)(dp->input_x + i * BYTES_PER_VECTOR));
+		vy = _mm512_loadu_ps((float *)(dp->input_y + i * BYTES_PER_VECTOR));
+		vz = _mm512_loadu_ps((float *)(dp->input_z + i * DWORD_PER_VECTOR));
+		__m512bh bvx = _mm512_cvtne2ps_pbh(vx, _mm512_setzero_ps());
+		__m512bh bvy = _mm512_cvtne2ps_pbh(vy, _mm512_setzero_ps());
+
+		voutput = _mm512_dpbf16_ps(vz, bvx, bvy);
+		/* output is int32_t, so advance by DWORDs per vector, not bytes */
+		_mm512_storeu_si512((__m512i *)(dp->output + i * DWORD_PER_VECTOR), _mm512_castps_si512(voutput));
+	}
+}
+
+#include "worker_init_dotprod.c"
+#include "run_common.c"
+
+static struct workload w = {
+	"AVX512",
+	init,
+	cleanup,
+	run,
+};
+
+struct workload *register_AVX512(void)
+{
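+	/*
+	 * Registration is gated on the CPUID probe done at startup:
+	 * register_all_workloads() drops any workload whose register
+	 * routine returns NULL, so this workload simply disappears on
+	 * CPUs without AVX512F.
+	 */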
if (cpuid.avx512f) + return &w; + + return NULL; +} +#else + +#warning GCC < 11 can not build work_AVX512.c + +#endif /* GCC < 11 */ diff --git a/workload-xsave/work_DOTPROD.c b/workload-xsave/work_DOTPROD.c new file mode 100644 index 0000000..15a186b --- /dev/null +++ b/workload-xsave/work_DOTPROD.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include +#include + +#pragma GCC target("avx2,fma") + +#define BITS_PER_VECTOR 256 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) + +#pragma GCC optimize("unroll-loops") + +struct thread_data { + u_int8_t *input_x; + int8_t *input_y; + int32_t *input_z; + int16_t *input_ones; + int32_t *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries; + + __m256i v_ones; + + v_ones = _mm256_loadu_si256((void *)dp->input_ones); + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_z + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + + __m256i vx, vy, vz, voutput; + __m256i vtmp1, vtmp2; + + vx = _mm256_loadu_si256((void *)(dp->input_x + i * BYTES_PER_VECTOR)); + vy = _mm256_loadu_si256((void *)(dp->input_y + i * BYTES_PER_VECTOR)); + vz = _mm256_loadu_si256((void *)(dp->input_z + i * DWORD_PER_VECTOR)); + + vtmp1 = _mm256_maddubs_epi16(vx, vy); /* 8-bit mul, 16-bit add */ + vtmp2 = _mm256_madd_epi16(vtmp1, v_ones); /* 32-bit convert */ + voutput = _mm256_add_epi32(vtmp2, vz); /* 32-bit add */ + _mm256_storeu_si256((void *)(dp->output + i * DWORD_PER_VECTOR), voutput); + } +} + +#include "worker_init_dotprod.c" +#include "run_common.c" + +static struct workload w = { + "DOTPROD", + init, + cleanup, + run, +}; + +struct workload *register_DOTPROD(void) +{ + return &w; +} diff --git a/workload-xsave/work_GETCPU.c b/workload-xsave/work_GETCPU.c new file mode 100644 index 0000000..7a50243 --- /dev/null +++ b/workload-xsave/work_GETCPU.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * see yogini.8 + * + * Initial implementation is specific to Intel hardware. + * + * Copyright (c) 2022 Intel Corporation. 
+ * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" + +extern unsigned int tsc_to_msec_from_start(unsigned long long tsc); + +/* + * GETCPU_run() + * Run the GETCPU spin loop either "loops" times, or until tsc_end + * return loops completed + */ +static unsigned long long GETCPU_run(struct work_instance *wi, + unsigned long long loops, unsigned long long tsc_end) +{ + unsigned long long count; + + if (loops == 0) + loops = (unsigned int)-1; + + if (tsc_end == 0) + tsc_end = (unsigned long long)-1; + + for (count = 0; count < loops; count++) { + unsigned long long tsc_now = rdtsc(); + int cpu; + + if (tsc_now >= tsc_end) + break; + + cpu = record_cpu_residency(wi, tsc_to_msec_from_start(tsc_now)); + record_wi_duration(wi, tsc_now); + record_cpu_work(wi, cpu, 1); + } + + return (count); +} + +static struct workload w = { + "GETCPU", + NULL, + NULL, + GETCPU_run, +}; + +struct workload *register_GETCPU(void) +{ + return &w; +} diff --git a/workload-xsave/work_MEM.c b/workload-xsave/work_MEM.c new file mode 100644 index 0000000..12a4119 --- /dev/null +++ b/workload-xsave/work_MEM.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * see yogini.8 + * + * Initial implementation is specific to Intel hardware. + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include "string.h" +#include +#include +void thread_break(int32_t reason, uint32_t thread_idx); +#define MEM_BYTES_PER_ITERATION (4 * 1024) + +struct thread_data { + char *buf1; + char *buf2; +}; + +static int init(struct work_instance *wi) +{ + struct thread_data *dp; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + /* + * set default working set to equal l3 cache + */ + if (wi->wi_bytes == 0) + wi->wi_bytes = SIZE_1GB * 1024; /* small calibration buffer for high score */ + + if (wi->wi_bytes % (2 * MEM_BYTES_PER_ITERATION)) { + warnx("MEM: %d bytes is invalid working set size.\n", wi->wi_bytes); + errx(-1, "MEM: working-set size minimum of %dKB.\n", + (2 * MEM_BYTES_PER_ITERATION) / 1024); + } + dp->buf1 = malloc(wi->wi_bytes / 2); + dp->buf2 = malloc(wi->wi_bytes / 2); + + if (!dp->buf1 || !dp->buf2) { + perror("malloc"); + exit(-1); + } + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->buf1); + free(dp->buf2); + free(dp); + + wi->worker_data = NULL; + + return 0; +} + +static void *linux_memcpy(void *dest, const void *src, size_t n) +{ + long d0, d1, d2; + + asm volatile ("rep ; movsq;\n\t movq %4,%%rcx;\n\t rep ; movsb\n\t" + : "=&c" (d0), "=&D"(d1), "=&S"(d2) + : "0"(n >> 3), "g"(n & 7), "1"(dest), "2"(src) + : "memory"); + + return dest; +} + +/* + * run() + * MEM bytes_to_copy, or until tsc_end + * return bytes copied + * use buf1 and buf2, in alternate directions + */ +static unsigned long long run(struct work_instance *wi) +{ + char *src, *dst; + unsigned long long bytes_done; + unsigned long long bytes_to_copy = wi->repeat * MEM_BYTES_PER_ITERATION; + struct thread_data *dp = wi->worker_data; + + src = dp->buf1; + dst = dp->buf2; + + for (bytes_done = 0;;) { + int kb; + + for (kb = 0; kb < wi->wi_bytes / 1024 / 2; kb += 4) { + linux_memcpy(dst + kb * 1024, src + kb * 1024, 
MEM_BYTES_PER_ITERATION); + + bytes_done += MEM_BYTES_PER_ITERATION; + + thread_break(wi->break_reason, wi->thread_number); + if (bytes_to_copy && bytes_done >= bytes_to_copy) + goto done; + } + } +done: + return rdtsc(); +} + +static struct workload MEM_workload = { + "MEM", + init, + cleanup, + run, +}; + +struct workload *register_MEM(void) +{ + return &MEM_workload; +} diff --git a/workload-xsave/work_PAUSE.c b/workload-xsave/work_PAUSE.c new file mode 100644 index 0000000..d69fa5a --- /dev/null +++ b/workload-xsave/work_PAUSE.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "PAUSE" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include +#include "yogini.h" +#include + +#define DATA_ENTRIES 1024 + +static void work(void *arg) +{ + int i; + + for (i = 0; i < DATA_ENTRIES; ++i) + asm volatile ("pause"); +} + +#include "run_common.c" + +static struct workload w = { + "PAUSE", + NULL, + NULL, + run, +}; + +struct workload *register_PAUSE(void) +{ + return &w; +} diff --git a/workload-xsave/work_RDTSC.c b/workload-xsave/work_RDTSC.c new file mode 100644 index 0000000..e32d27f --- /dev/null +++ b/workload-xsave/work_RDTSC.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "RDTSC" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" + +/* + * This "workload" runs the worker infrastructure without any inner work. + * As a result, it spends all of its time using RDTSC to end promptly. + */ +static inline void work(void *arg) +{ +} + +#define DATA_ENTRIES 1 + +#include "run_common.c" + +static struct workload w = { + "RDTSC", + NULL, + NULL, + run, +}; + +struct workload *register_RDTSC(void) +{ + return &w; +} diff --git a/workload-xsave/work_SSE.c b/workload-xsave/work_SSE.c new file mode 100644 index 0000000..b5b0157 --- /dev/null +++ b/workload-xsave/work_SSE.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "SSE" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. 
+ * Len Brown + */ +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include +#include "yogini.h" +#include +#include + +#pragma GCC target("sse4.2") +#pragma GCC optimize("unroll-loops") +#define WORKLOAD_NAME "SSE" +#define BITS_PER_VECTOR 128 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) + +struct thread_data { + int32_t *input_x; + int32_t *input_y; + int32_t *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries / sizeof(double); + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + __m128i vx, vy, voutput; + + vx = _mm_loadu_si128((__m128i *)dp->input_x); + vy = _mm_loadu_si128((__m128i *)dp->input_y); + + voutput = _mm_add_epi32(vx, vy); + _mm_storeu_si128((__m128i *)dp->output, voutput); + } +} + +#include "run_common.c" +#include "worker_init_sse.c" + +static struct workload w = { + "SSE", + init, + cleanup, + run, +}; + +struct workload *register_SSE(void) +{ + return &w; +} diff --git a/workload-xsave/work_TPAUSE.c b/workload-xsave/work_TPAUSE.c new file mode 100644 index 0000000..3c922cc --- /dev/null +++ b/workload-xsave/work_TPAUSE.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "TPAUSE" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include +#include "yogini.h" +#include + +#if __GNUC__ >= 9 + +#pragma GCC target("waitpkg") + +#define WORKLOAD_NAME "TPAUSE" +#define TPAUSE_TSC_CYCLES ((unsigned long long)(1000 * 1000)) + +#define DATA_ENTRIES 1 + +static void work(void *arg) +{ + unsigned int ctrl; + unsigned long long tsc; + + ctrl = 0; + tsc = _rdtsc(); + tsc += TPAUSE_TSC_CYCLES; + + _tpause(ctrl, tsc); +} + +#include "run_common.c" + +static struct workload w = { + "TPAUSE", + NULL, + NULL, + run, +}; + +struct workload *register_TPAUSE(void) +{ + if (cpuid.tpause) + return &w; + + return NULL; +} +#else + +#warning GCC < 9 can not build work_TPAUSE.c + +#endif /* GCC < 9 */ diff --git a/workload-xsave/work_UMWAIT.c b/workload-xsave/work_UMWAIT.c new file mode 100644 index 0000000..b726b54 --- /dev/null +++ b/workload-xsave/work_UMWAIT.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "UMWAIT" workload to yogini + * + * Copyright (c) 2023 Intel Corporation. 
+ * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include +#include "yogini.h" +#include + +#if __GNUC__ >= 9 + +#pragma GCC target("waitpkg") + +#define WORKLOAD_NAME "UMWAIT" + +#define DATA_ENTRIES 1 + +static void work(void *arg) +{ + char dummy; + + _umonitor(&dummy); + _umwait(0, (unsigned long long)-1); +} + +#include "run_common.c" + +static struct workload w = { + "UMWAIT", + NULL, + NULL, + run, +}; + +struct workload *register_UMWAIT(void) +{ + if (cpuid.tpause) + return &w; + + return NULL; +} +#else + +#warning GCC < 9 can not build work_UMWAIT.c + +#endif /* GCC < 9 */ diff --git a/workload-xsave/work_VNNI.c b/workload-xsave/work_VNNI.c new file mode 100644 index 0000000..4ccf0ec --- /dev/null +++ b/workload-xsave/work_VNNI.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include +#include + +#if __GNUC__ >= 11 + +#pragma GCC target("avxvnni") +#define WORKLOAD_NAME "VNNI" +#define BITS_PER_VECTOR 256 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) + +#pragma GCC optimize("unroll-loops") + +struct thread_data { + u_int8_t *input_x; + int8_t *input_y; + int32_t *input_z; + int16_t *input_ones; + int32_t *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries / sizeof(double); + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_z + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + __m256i vx, vy, vz, voutput; + + vx = _mm256_loadu_si256((void *)(dp->input_x + i * BYTES_PER_VECTOR)); + vy = _mm256_loadu_si256((void *)(dp->input_y + i * BYTES_PER_VECTOR)); + vz = _mm256_loadu_si256((void *)(dp->input_z + i * DWORD_PER_VECTOR)); + + voutput = _mm256_dpbusds_epi32(vz, vx, vy); + + _mm256_storeu_si256((void *)(dp->output + i * DWORD_PER_VECTOR), voutput); + } +} + +#include "worker_init_dotprod.c" +#include "run_common.c" + +static struct workload w = { + "VNNI", + init, + cleanup, + run, +}; + +struct workload *register_VNNI(void) +{ + if (cpuid.avx2vnni) + return &w; + + return NULL; +} +#else + +#warning GCC < 11 can not build work_VNNI.c + +#endif /* GCC < 11 */ diff --git a/workload-xsave/work_VNNI512.c b/workload-xsave/work_VNNI512.c new file mode 100644 index 0000000..1849da8 --- /dev/null +++ b/workload-xsave/work_VNNI512.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. 
+ * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include +#include + +#if __GNUC__ >= 9 + +#pragma GCC target("avx512vnni") +#define WORKLOAD_NAME "VNNI512" +#define BITS_PER_VECTOR 512 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) + +#pragma GCC optimize("unroll-loops") + +struct thread_data { + u_int8_t *input_x; + int8_t *input_y; + int32_t *input_z; + int16_t *input_ones; + int32_t *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries; + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_z + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + __m512i vx, vy, vz, voutput; + + vx = _mm512_loadu_si512((void *)(dp->input_x + i * BYTES_PER_VECTOR)); + vy = _mm512_loadu_si512((void *)(dp->input_y + i * BYTES_PER_VECTOR)); + vz = _mm512_loadu_si512((void *)(dp->input_z + i * DWORD_PER_VECTOR)); + + voutput = _mm512_dpbusds_epi32(vz, vx, vy); + + _mm512_storeu_si512((void *)(dp->output + i * DWORD_PER_VECTOR), voutput); + } +} + +#include "worker_init_dotprod.c" +#include "run_common.c" + +static struct workload w = { + "VNNI512", + init, + cleanup, + run, +}; + +struct workload *register_VNNI512(void) +{ + if (cpuid.vnni512) + return &w; + + return NULL; +} +#else + +#warning GCC < 9 can not build work_VNNI512.c + +#endif /* GCC < 9 */ diff --git a/workload-xsave/work_memcpy.c b/workload-xsave/work_memcpy.c new file mode 100644 index 0000000..8dd91f5 --- /dev/null +++ b/workload-xsave/work_memcpy.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "memcpy" workload to yogini + * + * see yogini.8 + * + * Initial implementation is specific to Intel hardware. + * + * Copyright (c) 2022 Intel Corporation. 
+ * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include "string.h" +#include +#include +void thread_break(int32_t reason, uint32_t thread_idx); +#define MEM_BYTES_PER_ITERATION (4 * 1024) + +struct thread_data { + char *buf1; + char *buf2; +}; + +static int init(struct work_instance *wi) +{ + struct thread_data *dp; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + /* + * set default working set to equal l3 cache + */ + + if (wi->wi_bytes == 0) + wi->wi_bytes = SIZE_1GB * 1024; /* small calibration buffer for high score */ + + if (wi->wi_bytes % (2 * MEM_BYTES_PER_ITERATION)) { + warnx("memcpy: %d bytes is invalid working set size.\n", wi->wi_bytes); + errx(-1, "memcpy: requires multiple of %d KB.\n", + (2 * MEM_BYTES_PER_ITERATION) / 1024); + } + + dp->buf1 = malloc(wi->wi_bytes / 2); + dp->buf2 = malloc(wi->wi_bytes / 2); + + if (!dp->buf1 || !dp->buf2) { + perror("malloc"); + exit(-1); + } + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->buf1); + free(dp->buf2); + free(dp); + + wi->worker_data = NULL; + + return 0; +} + +/* + * run() + * MEM bytes_to_copy, or until tsc_end + * return bytes copied + * use buf1 and buf2, in alternate directions + */ +static unsigned long long run(struct work_instance *wi) +{ + char *src, *dst; + unsigned long long bytes_done; + unsigned long long bytes_to_copy = wi->repeat * MEM_BYTES_PER_ITERATION; + struct thread_data *dp = wi->worker_data; + + src = dp->buf1; + dst = dp->buf2; + + for (bytes_done = 0;;) { + int kb; + + for (kb = 0; kb < wi->wi_bytes / 1024 / 2; kb += 4) { + /* MEM 4KB */ + memcpy(dst + kb * 1024, src + kb * 1024, MEM_BYTES_PER_ITERATION); + + bytes_done += MEM_BYTES_PER_ITERATION; + + thread_break(wi->break_reason, wi->thread_number); + if (bytes_to_copy && bytes_done >= bytes_to_copy) + goto done; + } + } +done: + return rdtsc(); +} + +static struct workload memcpy_workload = { + "memcpy", + init, + cleanup, + run, +}; + +struct workload *register_memcpy(void) +{ + return &memcpy_workload; +} diff --git a/workload-xsave/worker_init4.c b/workload-xsave/worker_init4.c new file mode 100644 index 0000000..1c0d53b --- /dev/null +++ b/workload-xsave/worker_init4.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * generic worker code for re-use via inclusion + * + * Copyright (c) 2022 Intel Corporation. 
+ * Len Brown + */ + +#include + +#include +#include "yogini.h" + +static double get_random_double(void) +{ + unsigned long long random_int64; + + random_int64 = (long long)random() | (((long long)random()) << 32); + + return (double)random_int64; +} + +static int init(struct work_instance *wi) +{ + int i; + struct thread_data *dp; + int bytes_per_entry = sizeof(double) * 4; /* a[], x[], y[], z[] */ + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + srand((int)time(0)); + + dp->a = malloc(entries * sizeof(double)); + dp->x = malloc(entries * sizeof(double)); + dp->y = malloc(entries * sizeof(double)); + dp->z = malloc(entries * sizeof(double)); + dp->data_entries = entries; + + if (!dp->a || !dp->x || !dp->y || !dp->z) + errx(-1, "malloc failed"); + + for (i = 0; i < entries; ++i) { + dp->a[i] = get_random_double(); + dp->x[i] = get_random_double(); + dp->y[i] = get_random_double(); + dp->z[i] = get_random_double(); + } + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->a); + free(dp->x); + free(dp->y); + free(dp->z); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/worker_init_amx.c b/workload-xsave/worker_init_amx.c new file mode 100644 index 0000000..08c9d68 --- /dev/null +++ b/workload-xsave/worker_init_amx.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Yi Sun + * Dongcheng Yan + * + */ +#include +#include +#include +#define YOGINI_MAIN +#include "yogini.h" + +static void init_dword_tile(int8_t *ptr, uint8_t rows, uint8_t colsb, int entries) +{ + int32_t i, j, k; + int32_t cols = colsb / 4; + + for (k = 0; k < entries; ++k) { + for (i = 0; i < rows; i++) + for (j = 0; j < cols; j++) + ptr[k * rows * cols + i * cols + j] = random(); + } +} + +static int init(struct work_instance *wi) +{ + struct thread_data *dp; + /* int8_t x[], y[], int32_t output[] */ + int bytes_per_entry = BYTES_PER_VECTOR * 3; + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + union __union_tile_config cfg; + + init_tile_config(&cfg, ROW_NUM, COL_NUM); + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + dp->input_x = (int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_x) + err(1, "calloc input_x"); + + dp->input_y = (int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_y) + err(1, "calloc input_y"); + + /* initialize input -- make every iteration the same for now */ + init_dword_tile(dp->input_x, ROW_NUM, COL_NUM, entries); + init_dword_tile(dp->input_y, ROW_NUM, COL_NUM, entries); + + dp->output = (int32_t *) calloc(entries, BYTES_PER_VECTOR); + if (dp->output == NULL) + err(1, "calloc output"); + + dp->data_entries = entries; + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->input_x); + free(dp->input_y); + free(dp->output); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/worker_init_avx.c b/workload-xsave/worker_init_avx.c new file mode 100644 index 0000000..d2b4299 --- /dev/null +++ b/workload-xsave/worker_init_avx.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. 
+ * Len Brown + */ + +static int init(struct work_instance *wi) +{ + int i; + struct thread_data *dp; + int bytes_per_entry = BYTES_PER_VECTOR * 3; /* x[], y[], output[] */ + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + dp->input_x = (float *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_x) + err(1, "calloc input_x"); + + dp->input_y = (float *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_y) + err(1, "calloc input_y"); + + /* initialize input -- make every iteration the same for now */ + for (i = 0; i < entries; ++i) { + int j; + + for (j = 0; j < DWORD_PER_VECTOR; j++) { + int index = i * DWORD_PER_VECTOR + j; + + dp->input_x[index] = j; + dp->input_y[index] = j; + } + } + + dp->output = (float *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->output) + err(1, "calloc output"); + dp->data_entries = entries; + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->input_x); + free(dp->input_y); + free(dp->output); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/worker_init_avx2.c b/workload-xsave/worker_init_avx2.c new file mode 100644 index 0000000..03c0217 --- /dev/null +++ b/workload-xsave/worker_init_avx2.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +static int init(struct work_instance *wi) +{ + int i; + struct thread_data *dp; + int bytes_per_entry = BYTES_PER_VECTOR * 3; /* x[], y[], output[] */ + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + dp->input_x = (u_int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_x) + err(1, "calloc input_x"); + + dp->input_y = (int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_y) + err(1, "calloc input_y"); + + /* initialize input -- make every iteration the same for now */ + for (i = 0; i < entries; ++i) { + int j; + + for (j = 0; j < BYTES_PER_VECTOR; j++) { + int index = i * BYTES_PER_VECTOR + j; + + dp->input_x[index] = j; + dp->input_y[index] = BYTES_PER_VECTOR + j; + } + } + + dp->output = (int16_t *) calloc(entries, BYTES_PER_VECTOR); + if (dp->output == NULL) + err(1, "calloc output"); + dp->data_entries = entries; + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->input_x); + free(dp->input_y); + free(dp->output); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/worker_init_dotprod.c b/workload-xsave/worker_init_dotprod.c new file mode 100644 index 0000000..1810870 --- /dev/null +++ b/workload-xsave/worker_init_dotprod.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. 
+ * Len Brown + */ + +static int init(struct work_instance *wi) +{ + int i; + struct thread_data *dp; + int bytes_per_entry = BYTES_PER_VECTOR * 4;/* x[], y[], z[] (ignores ones[]), output[] */ + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + dp->input_x = (u_int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_x) + err(1, "calloc input_x"); + + dp->input_y = (int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_y) + err(1, "calloc input_y"); + + dp->input_z = (int32_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_z) + err(1, "calloc input_z"); + + dp->input_ones = (int16_t *)calloc(1, BYTES_PER_VECTOR); + if (!dp->input_ones) + err(1, "calloc input_ones"); + + /* initialize input -- make every iteration the same for now */ + for (i = 0; i < entries; ++i) { + int j; + + for (j = 0; j < BYTES_PER_VECTOR; j++) { + int index = i * BYTES_PER_VECTOR + j; + + dp->input_x[index] = j; + dp->input_y[index] = BYTES_PER_VECTOR + j; + } + for (j = 0; j < DWORD_PER_VECTOR; j++) { + int index = i * DWORD_PER_VECTOR + j; + + dp->input_z[index] = j; + } + } + for (i = 0; i < WORDS_PER_VECTOR; i++) + dp->input_ones[i] = 1; + + dp->output = (int32_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->output) + err(1, "calloc output"); + dp->data_entries = entries; + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->input_x); + free(dp->input_y); + free(dp->input_z); + free(dp->input_ones); + free(dp->output); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/worker_init_sse.c b/workload-xsave/worker_init_sse.c new file mode 100644 index 0000000..2e06de6 --- /dev/null +++ b/workload-xsave/worker_init_sse.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +static int init(struct work_instance *wi) +{ + int i; + struct thread_data *dp; + int bytes_per_entry = BYTES_PER_VECTOR * 3; /* x[], y[], output[] */ + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + dp->input_x = (int32_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_x) + err(1, "calloc input_x"); + + dp->input_y = (int32_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_y) + err(1, "calloc input_y"); + + /* initialize input -- make every iteration the same for now */ + for (i = 0; i < entries; ++i) { + int j; + + for (j = 0; j < DWORD_PER_VECTOR; j++) { + int index = i * DWORD_PER_VECTOR + j; + + dp->input_x[index] = j; + dp->input_y[index] = j; + } + } + + dp->output = (int32_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->output) + err(1, "calloc output"); + dp->data_entries = entries; + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->input_x); + free(dp->input_y); + free(dp->output); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/yogini.c b/workload-xsave/yogini.c new file mode 100644 index 0000000..8d49e68 --- /dev/null +++ b/workload-xsave/yogini.c @@ -0,0 +1,537 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. 
+ * Len Brown + * Yi Sun + * + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define YOGINI_MAIN +#include "yogini.h" + +enum { + BREAK_BY_NOTHING = 0, + BREAK_BY_YIELD = 1, + BREAK_BY_SLEEP, + BREAK_BY_TRAP, + BREAK_BY_SIGNAL, + BREAK_BY_FUTEX, + BREAK_REASON_MAX = BREAK_BY_FUTEX +} BREAK_REASON; +#define FUTEX_VAL 0x5E5E5E5E + +int repeat_cnt; +int clfulsh; +char *progname; +struct workload *all_workloads; +struct work_instance *first_worker; +struct work_instance *last_worker; + +static int num_worker_threads; +static int num_checked_in_threads; +static pthread_mutex_t checkin_mutex; +static pthread_cond_t checkin_cv = PTHREAD_COND_INITIALIZER; +int32_t break_reason = BREAK_BY_NOTHING; +static int32_t *futex_ptr; +static bool *thread_done; +pthread_t *tid_ptr; + +unsigned int SIZE_1GB = 1024 * 1024 * 1024; + +struct cpuid cpuid; + +static void dump_command(int argc, char **argv) +{ + int i; + + for (i = 0; i < argc; i++) + printf("%s ", argv[i]); + + putchar('\n'); +} + +void dump_workloads(void) +{ + struct workload *wp; + + for (wp = all_workloads; wp; wp = wp->next) + fprintf(stderr, " %s", wp->name); + + fprintf(stderr, "\n"); +} + +static void help(void) +{ + fprintf(stderr, + "usage: %s [OPTIONS]\n" + "\n" + "%s runs some simple micro workloads\n" + " -w, --workload [workload_name,threads#,break#, ...]\n", progname, progname); + fprintf(stderr, "Available workloads: "); + dump_workloads(); + fprintf(stderr, + " -r, --repeat, each instance needs to be run\n" + " -b, --break_reason, [yield/sleep/trap/signal/futex]\n" + "For more help, see README\n"); + exit(0); +} + +int parse_break_cmd(char *input_string) +{ + if (strcmp(input_string, "sleep") == 0) + break_reason = BREAK_BY_SLEEP; + else if (strcmp(input_string, "yield") == 0) + break_reason = BREAK_BY_YIELD; + else if (strcmp(input_string, "trap") == 0) + break_reason = BREAK_BY_TRAP; + else if (strcmp(input_string, "signal") == 0) + break_reason = BREAK_BY_SIGNAL; + else if (strcmp(input_string, "futex") == 0) + break_reason = BREAK_BY_FUTEX; + else + return -1; + return 0; +} + +static void set_tsc_per_sec(void) +{ + unsigned int ebx = 0, ecx = 0, edx = 0; + unsigned int max_level; + + __cpuid(0, max_level, ebx, ecx, edx); + + /* Structured Extended Feature Flags Enumeration Leaf */ + if (max_level >= 0x7) { + unsigned int eax_subleaves; + + eax_subleaves = 0; + ecx = 0; + edx = 0; + + __cpuid_count(0x7, 0, eax_subleaves, ebx, ecx, edx); + + if (ebx & (1 << 16)) + cpuid.avx512f = 1; + if (ecx & (1 << 5)) + cpuid.tpause = 1; + if (ecx & (1 << 11)) + cpuid.vnni512 = 1; + + if (eax_subleaves > 0) { + unsigned int eax = 0; + + eax = ebx = ecx = edx = 0; + __cpuid_count(0x7, 1, eax, ebx, ecx, edx); + if (eax & (1 << 4)) + cpuid.avx2vnni = 1; + } + } + + if (max_level < 0x15) + errx(1, "sorry CPU too old: cpuid level 0x%x < 0x15", max_level); +} + +void register_all_workloads(void) +{ + int i; + struct workload *wp; + + for (i = 0; all_register_routines[i]; ++i) { + wp = all_register_routines[i] (); + + if (!wp) + continue; + + wp->next = all_workloads; + all_workloads = wp; + } +} + +struct work_instance *alloc_new_work_instance(void) +{ + struct work_instance *wi; + + wi = calloc(1, sizeof(struct work_instance)); + if (!wi) + err(1, "work_instance"); + + wi->workload = all_workloads; /* default workload is last probed */ + return wi; +} + +void register_new_worker(struct 
work_instance *wi) +{ + if (!first_worker) + first_worker = wi; + else + last_worker->next = wi; + last_worker = wi; + + wi->next = NULL; +} + +int parse_work_cmd(char *work_cmd) +{ + struct work_instance *wi; + struct workload *wp; + + wp = find_workload(work_cmd); + if (wp) { + wi = alloc_new_work_instance(); + wi->workload = wp; + } else { + fprintf(stderr, "Unrecognized work parameter '%s' try -h for help\n", work_cmd); + exit(1); + } + + /* register this work_instance */ + register_new_worker(wi); + return 0; +} + +static void initial_ptr(void) +{ + futex_ptr = (int32_t *)malloc(sizeof(int32_t) * num_worker_threads); + thread_done = (bool *)malloc(sizeof(bool) * num_worker_threads); + tid_ptr = (pthread_t *)malloc(sizeof(pthread_t) * num_worker_threads); + if (!futex_ptr || !thread_done || !tid_ptr) { + printf("Fail to malloc memory for futex_ptr & tid_ptr\n"); + exit(1); + } +} + +static void initial_wi(void) +{ + struct work_instance *wi; + + wi = first_worker; + while (wi) { + wi->break_reason = break_reason; + wi->wi_bytes = SIZE_1GB; + wi->repeat = repeat_cnt; + wi = wi->next; + num_worker_threads++; + } +} + +static void deinitialize(void) +{ + struct work_instance *wi; + struct work_instance *cur; + + wi = first_worker; + while (wi) { + cur = wi->next; + free(wi); + wi = cur; + } + + free(futex_ptr); + free(thread_done); + free(tid_ptr); +} + +static void cmdline(int argc, char **argv) +{ + int opt; + int option_index = 0; + static struct option long_options[] = { + { "help", no_argument, 0, 'h' }, + { "work", required_argument, 0, 'w' }, + { "repeat", required_argument, 0, 'r' }, + { "break_reason", required_argument, 0, 'b' }, + {"clflush", no_argument, 0, 'f'}, + { 0, 0, 0, 0 } + }; + + progname = argv[0]; + + while ((opt = getopt_long_only(argc, argv, "h:w:r:b:f", + long_options, &option_index)) != -1) { + switch (opt) { + case 'h': + help(); + break; + case 'w': + if (parse_work_cmd(optarg)) + help(); + break; + case 'r': + repeat_cnt = atoi(optarg); + break; + case 'b': + if (parse_break_cmd(optarg)) + help(); + break; + case 'f': + clfulsh = 1; + break; + default: + help(); + } + } + + dump_command(argc, argv); +} + +static void initialize(int argc, char **argv) +{ + set_tsc_per_sec(); + register_all_workloads(); + cmdline(argc, argv); + initial_wi(); + initial_ptr(); +} + +void clflush_range(void *address, size_t size) +{ + uintptr_t start = (uintptr_t)address; + uintptr_t end = start + size; + + // Align according to the size of the cache line. + uintptr_t aligned_start = (start & ~(63UL)); + uintptr_t aligned_end = (end + 63UL) & ~(63UL); + + for (uintptr_t addr = aligned_start; addr < aligned_end; addr += 64) + _mm_clflush((void *)addr); + + // Ensure clearing of unaligned portions. 
+ for (uintptr_t addr = aligned_end; addr < end; addr++) + _mm_clflush((void *)addr); +} + +static uint64_t do_syscall(uint64_t nr, uint64_t rdi, uint64_t rsi, uint64_t rdx, + uint64_t r10, uint64_t r8, uint64_t r9) +{ + uint64_t rtn; + + asm volatile("movq %0, %%rdi" : : "r"(rdi) : "%rdi"); + asm volatile("movq %0, %%rsi" : : "r"(rsi) : "%rsi"); + asm volatile("movq %0, %%rdx" : : "r"(rdx) : "%rdx"); + asm volatile("movq %0, %%r10" : : "r"(r10) : "%r10"); + asm volatile("movq %0, %%r8" : : "r"(r8) : "%r8"); + asm volatile("movq %0, %%r9" : : "r"(r9) : "%r9"); + asm volatile("syscall" + : "=a" (rtn) + : "a" (nr) + : "rcx", "r11", "memory", "cc"); + + return rtn; +} + +static void signal_handler(int32_t signum) +{ + //int32_t current_cpu = sched_getcpu(); + + //if (signum == SIGTRAP) + //printf("Break by trap, current_cpu=%d\n", current_cpu); + + //if (signum == SIGUSR1) + //printf("Break by signal, current_cpu=%d\n", current_cpu); +} + +void thread_break(int32_t reason, uint32_t thread_idx) +{ + struct timespec req; + + switch (reason) { + case BREAK_BY_YIELD: + /* + * Schedule out current thread by executing syscall + * instruction with syscall number SYS_sched_yield + */ + do_syscall(SYS_sched_yield, 0, 0, 0, 0, 0, 0); + break; + case BREAK_BY_SLEEP: + /* + * Schedule out current thread by executing syscall + * instruction with syscall number SYS_nanosleep + */ + req.tv_sec = 1; + req.tv_nsec = 0; + do_syscall(SYS_nanosleep, (uint64_t)&req, 0, 0, 0, 0, 0); + break; + case BREAK_BY_TRAP: + /* + * Trap is handled by the thread generated the trap, + * Schedule out current thread by trap handling + */ + asm volatile("int3;"); + break; + case BREAK_BY_SIGNAL: + /* + * Do nothing, main thread send SIGUSR1 to sub thread periodically + * Schedule out current thread by signal handling + */ + break; + case BREAK_BY_FUTEX: + /* Schedule out current thread by waiting futex */ + do_syscall(SYS_futex, (uint64_t)&futex_ptr[thread_idx], + FUTEX_WAIT, FUTEX_VAL, 0, 0, 0); + break; + } +} + +static void worker_barrier(void) +{ + int i_am_last = 0; + + pthread_mutex_lock(&checkin_mutex); + + num_checked_in_threads += 1; + if (num_checked_in_threads == num_worker_threads) + i_am_last = 1; + + pthread_mutex_unlock(&checkin_mutex); + + if (i_am_last) { + pthread_cond_broadcast(&checkin_cv); + } else { + /* wait for all workers to checkin */ + pthread_mutex_lock(&checkin_mutex); + while (num_checked_in_threads < num_worker_threads) + if (pthread_cond_wait(&checkin_cv, &checkin_mutex)) + err(1, "cond_wait: checkin_cv"); + pthread_mutex_unlock(&checkin_mutex); + } +} + +static void *worker_main(void *arg) +{ + // cpu_set_t mask; + // CPU_ZERO(&mask); + // CPU_SET(1, &mask); + // pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); + struct work_instance *wi = (struct work_instance *)arg; + + /* initialize data for this worker */ + if (wi->workload->initialize) + wi->workload->initialize(wi); + + worker_barrier(); + + printf("%s will repeat %u in reason %d\n", + wi->workload->name, wi->repeat, wi->break_reason); + + unsigned long long bgntsc, endtsc; + + bgntsc = rdtsc(); + endtsc = wi->workload->run(wi); + printf("Thread %d:%s took %llu clock-cycles, end in %llu.\n", + wi->thread_number, wi->workload->name, endtsc - bgntsc, endtsc); + + /* cleanup data for this worker */ + if (wi->workload->cleanup) + wi->workload->cleanup(wi); + + thread_done[wi->thread_number] = true; + pthread_exit((void *)0); + /* thread exit */ +} + +static void start_and_wait_for_workers(void) +{ + int i; + cpu_set_t mask; + struct 
work_instance *wi;
+	struct sigaction sigact;
+	bool all_thread_done = false;
+
+	CPU_ZERO(&mask);
+	CPU_SET(0, &mask);
+	pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
+
+	if (break_reason == BREAK_BY_TRAP) {
+		sigact.sa_handler = signal_handler;
+		sigemptyset(&sigact.sa_mask);
+		sigact.sa_flags = 0;
+		sigaction(SIGTRAP, &sigact, NULL);
+	}
+
+	if (break_reason == BREAK_BY_SIGNAL) {
+		sigact.sa_handler = signal_handler;
+		sigemptyset(&sigact.sa_mask);
+		sigact.sa_flags = 0;
+		sigaction(SIGUSR1, &sigact, NULL);
+	}
+
+	/* create workers */
+	for (wi = first_worker, i = 0; wi; wi = wi->next, i++) {
+		futex_ptr[i] = FUTEX_VAL;
+		thread_done[i] = false;
+		wi->thread_number = i;
+		pthread_attr_t attr;
+
+		pthread_attr_init(&attr);
+		pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+		if (pthread_create(&tid_ptr[i], &attr, &worker_main, wi) != 0)
+			err(1, "pthread_create");
+
+		wi->thread_id = tid_ptr[i];
+	}
+
+	sleep(1);
+
+	if (break_reason == BREAK_BY_SIGNAL) {
+		while (!all_thread_done) {
+			all_thread_done = true;
+			for (i = 0; i < num_worker_threads; i++) {
+				if (!thread_done[i]) {
+					pthread_kill(tid_ptr[i], SIGUSR1);
+					all_thread_done = false;
+					/*
+					 * wait 0.5 second to prevent from
+					 * sending signal too frequently
+					 */
+					usleep(500000);
+				}
+			}
+		}
+	}
+
+	/* Wake up the sub-threads waiting on a futex */
+	if (break_reason == BREAK_BY_FUTEX) {
+		while (!all_thread_done) {
+			all_thread_done = true;
+			for (i = 0; i < num_worker_threads; i++) {
+				if (!thread_done[i]) {
+					syscall(SYS_futex, &futex_ptr[i], FUTEX_WAKE, 1, 0, 0, 0);
+					all_thread_done = false;
+					/* wait 0.5 second between wake-up attempts */
+					usleep(500000);
+				}
+			}
+		}
+	}
+
+	/* wait for all workers to join */
+	for (wi = first_worker, i = 0; wi; wi = wi->next, ++i)
+		if (pthread_join(tid_ptr[i], NULL) != 0)
+			errx(1, "thread %d failed to join\n", wi->thread_number);
+}
+
+int main(int argc, char **argv)
+{
+	initialize(argc, argv);
+	start_and_wait_for_workers();
+	deinitialize();
+}
diff --git a/workload-xsave/yogini.h b/workload-xsave/yogini.h
new file mode 100644
index 0000000..69a58ff
--- /dev/null
+++ b/workload-xsave/yogini.h
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022 Intel Corporation.
+ * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#ifndef YOGINI_H +#define YOGINI_H +#include +#include + +struct work_instance { + struct work_instance *next; + pthread_t thread_id; + int thread_number; + struct workload *workload; + void *worker_data; + unsigned int repeat; + unsigned int wi_bytes; + int break_reason; +}; + +struct workload { + char *name; + int (*initialize)(struct work_instance *wi); + int (*cleanup)(struct work_instance *wi); + unsigned long long (*run)(struct work_instance *wi); + + struct workload *next; +}; + +extern struct workload *all_workloads; + +extern struct workload *register_GETCPU(void); +extern struct workload *register_RDTSC(void); +extern struct workload *register_AVX(void); +extern struct workload *register_AVX2(void); +extern struct workload *register_AVX512(void); +extern struct workload *register_VNNI512(void); +extern struct workload *register_VNNI(void); +extern struct workload *register_DOTPROD(void); +extern struct workload *register_PAUSE(void); +extern struct workload *register_TPAUSE(void); +extern struct workload *register_UMWAIT(void); +extern struct workload *register_FP64(void); +extern struct workload *register_SSE(void); +extern struct workload *register_MEM(void); +extern struct workload *register_memcpy(void); +extern struct workload *register_AMX(void); + +extern unsigned int SIZE_1GB; + +#ifdef YOGINI_MAIN +struct workload *(*all_register_routines[]) () = { + register_AVX, + register_AVX2, + register_AVX512, +#if __GNUC__ >= 9 + register_VNNI512, +#endif +#if __GNUC__ >= 11 + register_VNNI, +#endif + register_DOTPROD, + register_PAUSE, +#if __GNUC__ >= 9 + register_TPAUSE, + register_UMWAIT, +#endif + register_RDTSC, + register_SSE, + register_MEM, + register_memcpy, + register_AMX, + NULL +}; +#endif +static inline struct workload *find_workload(char *name) +{ + struct workload *wp; + + for (wp = all_workloads; wp; wp = wp->next) { + if (strcmp(name, wp->name) == 0) + return wp; + } + printf("Can't find this workload, please retry.\n"); + return NULL; +} + +static inline unsigned long long rdtsc(void) +{ + unsigned int low, high; + + asm volatile ("rdtsc" : "=a" (low), "=d"(high)); + + return low | ((unsigned long long)high) << 32; +} + +void clflush_range(void *address, size_t size); +extern int clfulsh; +struct cpuid { + unsigned int avx512f; + unsigned int vnni512; + unsigned int avx2vnni; + unsigned int tpause; +}; + +extern struct cpuid cpuid; +#endif
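+
+/*
+ * Illustrative sketch (not part of the build): a new workload needs little
+ * more than a run() routine and a register function, plus an entry in
+ * all_register_routines[] above.  Assuming the helpers declared in this
+ * header, a minimal "NOP" workload could look like:
+ *
+ *	static unsigned long long NOP_run(struct work_instance *wi)
+ *	{
+ *		unsigned int count;
+ *
+ *		for (count = 0; count < wi->repeat; count++)
+ *			;
+ *		return rdtsc();
+ *	}
+ *
+ *	static struct workload nop_workload = { "NOP", NULL, NULL, NOP_run };
+ *
+ *	struct workload *register_NOP(void) { return &nop_workload; }
+ */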