Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

QB4W Fully Connected Subgraph #6674

Draft
wants to merge 9 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 53 additions & 4 deletions bench/qd8-f16-qb4w-gemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,63 @@
// Specification: test/qd8-f16-qb4w-gemm-minmax.yaml
// Generator: tools/generate-gemm-test.py

#include <benchmark/benchmark.h>
#include "bench/gemm-benchmark.h"
#include "bench/utils.h"

#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
#include <xnnpack/gemm.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/pack.h>

#include <benchmark/benchmark.h>
#include "bench/gemm-benchmark.h"
#include "bench/utils.h"


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Benchmarks for the AVX2 QD8 (dynamically quantized int8) x QB4W
// (blockwise-quantized 4-bit weights) -> F16 GEMM microkernels.
// Each wrapper binds one microkernel, its params-init function, and the
// QS8/QB4W GOI weight-packing function to the shared GEMMBenchmark harness,
// then registers it via BENCHMARK_GEMM_BL (the blockwise GEMM registration
// macro). benchmark::utils::CheckAVX2 skips the benchmark on CPUs without
// AVX2 support.
// NOTE(review): per the header above, this file is produced by
// tools/generate-gemm-test.py — prefer regenerating over hand-editing.

// Tile: mr=1, nr=8, kr=8, sr=1.
static void qd8_f16_qb4w_gemm_minmax_ukernel_1x8c8__avx2(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x8c8__avx2,
xnn_init_f16_qb4w_minmax_avx_params,
xnn_pack_qs8_qb4w_gemm_goi_w,
/*mr=*/1, /*nr=*/8, /*kr=*/8, /*sr=*/1,
benchmark::utils::CheckAVX2);
}

BENCHMARK_GEMM_BL(qd8_f16_qb4w_gemm_minmax_ukernel_1x8c8__avx2)

// Tile: mr=2, nr=8, kr=8, sr=1.
static void qd8_f16_qb4w_gemm_minmax_ukernel_2x8c8__avx2(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x8c8__avx2,
xnn_init_f16_qb4w_minmax_avx_params,
xnn_pack_qs8_qb4w_gemm_goi_w,
/*mr=*/2, /*nr=*/8, /*kr=*/8, /*sr=*/1,
benchmark::utils::CheckAVX2);
}

BENCHMARK_GEMM_BL(qd8_f16_qb4w_gemm_minmax_ukernel_2x8c8__avx2)

// Tile: mr=3, nr=8, kr=8, sr=1.
static void qd8_f16_qb4w_gemm_minmax_ukernel_3x8c8__avx2(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f16_qb4w_gemm_minmax_ukernel_3x8c8__avx2,
xnn_init_f16_qb4w_minmax_avx_params,
xnn_pack_qs8_qb4w_gemm_goi_w,
/*mr=*/3, /*nr=*/8, /*kr=*/8, /*sr=*/1,
benchmark::utils::CheckAVX2);
}

BENCHMARK_GEMM_BL(qd8_f16_qb4w_gemm_minmax_ukernel_3x8c8__avx2)

// Tile: mr=4, nr=8, kr=8, sr=1.
static void qd8_f16_qb4w_gemm_minmax_ukernel_4x8c8__avx2(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x8c8__avx2,
xnn_init_f16_qb4w_minmax_avx_params,
xnn_pack_qs8_qb4w_gemm_goi_w,
/*mr=*/4, /*nr=*/8, /*kr=*/8, /*sr=*/1,
benchmark::utils::CheckAVX2);
}

BENCHMARK_GEMM_BL(qd8_f16_qb4w_gemm_minmax_ukernel_4x8c8__avx2)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


static void qd8_f16_qb4w_gemm_minmax_ukernel_1x2__scalar(benchmark::State& state, const char* net) {
Expand Down
57 changes: 53 additions & 4 deletions bench/qd8-f32-qb4w-gemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,63 @@
// Specification: test/qd8-f32-qb4w-gemm-minmax.yaml
// Generator: tools/generate-gemm-test.py

#include <benchmark/benchmark.h>
#include "bench/gemm-benchmark.h"
#include "bench/utils.h"

#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
#include <xnnpack/gemm.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/pack.h>

#include <benchmark/benchmark.h>
#include "bench/gemm-benchmark.h"
#include "bench/utils.h"


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Benchmarks for the AVX2 QD8 (dynamically quantized int8) x QB4W
// (blockwise-quantized 4-bit weights) -> F32 GEMM microkernels.
// Each wrapper binds one microkernel, its params-init function, and the
// QS8/QB4W GOI weight-packing function to the shared GEMMBenchmark harness,
// then registers it via BENCHMARK_GEMM_BL (the blockwise GEMM registration
// macro). benchmark::utils::CheckAVX2 skips the benchmark on CPUs without
// AVX2 support.
// NOTE(review): per the header above, this file is produced by
// tools/generate-gemm-test.py — prefer regenerating over hand-editing.

// Tile: mr=1, nr=8, kr=8, sr=1.
static void qd8_f32_qb4w_gemm_minmax_ukernel_1x8c8__avx2(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x8c8__avx2,
xnn_init_f32_qb4w_minmax_avx_params,
xnn_pack_qs8_qb4w_gemm_goi_w,
/*mr=*/1, /*nr=*/8, /*kr=*/8, /*sr=*/1,
benchmark::utils::CheckAVX2);
}

BENCHMARK_GEMM_BL(qd8_f32_qb4w_gemm_minmax_ukernel_1x8c8__avx2)

// Tile: mr=2, nr=8, kr=8, sr=1.
static void qd8_f32_qb4w_gemm_minmax_ukernel_2x8c8__avx2(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x8c8__avx2,
xnn_init_f32_qb4w_minmax_avx_params,
xnn_pack_qs8_qb4w_gemm_goi_w,
/*mr=*/2, /*nr=*/8, /*kr=*/8, /*sr=*/1,
benchmark::utils::CheckAVX2);
}

BENCHMARK_GEMM_BL(qd8_f32_qb4w_gemm_minmax_ukernel_2x8c8__avx2)

// Tile: mr=3, nr=8, kr=8, sr=1.
static void qd8_f32_qb4w_gemm_minmax_ukernel_3x8c8__avx2(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x8c8__avx2,
xnn_init_f32_qb4w_minmax_avx_params,
xnn_pack_qs8_qb4w_gemm_goi_w,
/*mr=*/3, /*nr=*/8, /*kr=*/8, /*sr=*/1,
benchmark::utils::CheckAVX2);
}

BENCHMARK_GEMM_BL(qd8_f32_qb4w_gemm_minmax_ukernel_3x8c8__avx2)

// Tile: mr=4, nr=8, kr=8, sr=1.
static void qd8_f32_qb4w_gemm_minmax_ukernel_4x8c8__avx2(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x8c8__avx2,
xnn_init_f32_qb4w_minmax_avx_params,
xnn_pack_qs8_qb4w_gemm_goi_w,
/*mr=*/4, /*nr=*/8, /*kr=*/8, /*sr=*/1,
benchmark::utils::CheckAVX2);
}

BENCHMARK_GEMM_BL(qd8_f32_qb4w_gemm_minmax_ukernel_4x8c8__avx2)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


static void qd8_f32_qb4w_gemm_minmax_ukernel_1x2__scalar(benchmark::State& state, const char* net) {
Expand Down
8 changes: 8 additions & 0 deletions cmake/gen/avx2_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,10 @@ SET(ALL_AVX2_MICROKERNEL_SRCS
src/math/gen/f32-tanh-avx2-expm1minus-rr1-p6h5ts-div.c
src/math/gen/f32-tanh-avx2-expm1minus-rr1-p6h5ts-nr1.c
src/math/gen/f32-tanh-avx2-expm1minus-rr1-p6h5ts-nr1adj.c
src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x8c8-minmax-avx2.c
src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-2x8c8-minmax-avx2.c
src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-3x8c8-minmax-avx2.c
src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-4x8c8-minmax-avx2.c
src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-1x8c8-minmax-avx2.c
src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-2x8c8-minmax-avx2.c
src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-3x8c8-minmax-avx2.c
Expand All @@ -341,6 +345,10 @@ SET(ALL_AVX2_MICROKERNEL_SRCS
src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-2x8c8-minmax-avx2.c
src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-3x8c8-minmax-avx2.c
src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c8-minmax-avx2.c
src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x8c8-minmax-avx2.c
src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-2x8c8-minmax-avx2.c
src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-3x8c8-minmax-avx2.c
src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-4x8c8-minmax-avx2.c
src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x8c8-minmax-avx2.c
src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x8c8-minmax-avx2.c
src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x8c8-minmax-avx2.c
Expand Down
8 changes: 8 additions & 0 deletions gen/avx2_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,10 @@ ALL_AVX2_MICROKERNEL_SRCS = [
"src/math/gen/f32-tanh-avx2-expm1minus-rr1-p6h5ts-div.c",
"src/math/gen/f32-tanh-avx2-expm1minus-rr1-p6h5ts-nr1.c",
"src/math/gen/f32-tanh-avx2-expm1minus-rr1-p6h5ts-nr1adj.c",
"src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x8c8-minmax-avx2.c",
"src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-2x8c8-minmax-avx2.c",
"src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-3x8c8-minmax-avx2.c",
"src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-4x8c8-minmax-avx2.c",
"src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-1x8c8-minmax-avx2.c",
"src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-2x8c8-minmax-avx2.c",
"src/qd8-f16-qc4w-gemm/gen/qd8-f16-qc4w-gemm-3x8c8-minmax-avx2.c",
Expand All @@ -337,6 +341,10 @@ ALL_AVX2_MICROKERNEL_SRCS = [
"src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-2x8c8-minmax-avx2.c",
"src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-3x8c8-minmax-avx2.c",
"src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c8-minmax-avx2.c",
"src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x8c8-minmax-avx2.c",
"src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-2x8c8-minmax-avx2.c",
"src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-3x8c8-minmax-avx2.c",
"src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-4x8c8-minmax-avx2.c",
"src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-1x8c8-minmax-avx2.c",
"src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-2x8c8-minmax-avx2.c",
"src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x8c8-minmax-avx2.c",
Expand Down
105 changes: 105 additions & 0 deletions include/xnnpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,9 @@ enum xnn_datatype {
/// Dynamically quantized 8-bit signed integers packed with their per-row
/// quantization parameters.
xnn_datatype_qpint8 = 10,
/// Quantized 4-bit signed integer with shared per-channel-block quantization
/// parameters.
xnn_datatype_qbint4 = 11,
};

/// Define a tensor-type Value and add it to a Subgraph.
Expand Down Expand Up @@ -407,6 +410,24 @@ enum xnn_status xnn_define_channelwise_quantized_tensor_value_v2(
uint32_t flags,
uint32_t* id_out);

/// Define a blockwise quantized tensor-type Value and add it to a Subgraph.
/// @param block_size - size of a block in the tensor that shares one set of quantization parameters. A block is defined as the number of input-channel elements per output channel.
/// For fully connected operators with 2D filters of shape [output_channels, input_channels], the expected number of scale values is
/// output_channels * (input_channels / block_size).
enum xnn_status xnn_define_blockwise_quantized_tensor_value(
xnn_subgraph_t subgraph,
enum xnn_datatype datatype,
int32_t zero_point,
const float* scale,
size_t num_dims,
size_t channel_dim,
size_t block_size,
const size_t* dims,
const void* data,
uint32_t external_id,
uint32_t flags,
uint32_t* id_out);

/// Define a dynamically quantized tensor-type Value and add it to a Subgraph.
///
/// @param subgraph - a Subgraph object that will own the created Value.
Expand Down Expand Up @@ -4317,6 +4338,90 @@ enum xnn_status xnn_reshape_fully_connected_nc_qd8_f32_qc4w(
size_t batch_size,
pthreadpool_t threadpool);

enum xnn_status xnn_create_fully_connected_nc_qd8_f32_qb4w(
size_t input_channels,
size_t output_channels,
size_t input_stride,
size_t output_stride,
size_t block_size,
uint8_t kernel_zero_point,
const float* kernel_scale,
const void* kernel,
const float* bias,
float output_min,
float output_max,
uint32_t flags,
xnn_code_cache_t code_cache,
xnn_weights_cache_t weights_cache,
xnn_operator_t* fully_connected_op_out);

enum xnn_status xnn_setup_fully_connected_nc_qd8_f32_qb4w(
xnn_operator_t fully_connected_op,
const int8_t* input,
float* output,
const struct xnn_dynamic_quantization_params* quantization_params);

enum xnn_status xnn_reshape_fully_connected_nc_qd8_f32_qb4w(
xnn_operator_t fully_connected_op,
size_t batch_size,
pthreadpool_t threadpool);

/// Create a fully connected operator with dynamically quantized int8 (QD8)
/// input, F16 output, and blockwise-quantized 4-bit (QB4W) weights.
///
/// @param block_size - number of consecutive input channels that share one
///                     quantization scale. kernel_scale is expected to hold
///                     output_channels * (input_channels / block_size) values
///                     (see xnn_define_blockwise_quantized_tensor_value).
/// @param kernel_zero_point - zero point shared by all 4-bit kernel values.
enum xnn_status xnn_create_fully_connected_nc_qd8_f16_qb4w(
  size_t input_channels,
  size_t output_channels,
  size_t input_stride,
  size_t output_stride,
  size_t block_size,
  uint8_t kernel_zero_point,
  const float* kernel_scale,
  const void* kernel,
  const float* bias,
  float output_min,
  float output_max,
  uint32_t flags,
  xnn_code_cache_t code_cache,
  xnn_weights_cache_t weights_cache,
  xnn_operator_t* fully_connected_op_out);

/// Set up the QD8/F16/QB4W fully connected operator with input/output
/// pointers and the per-row dynamic-quantization parameters of the input.
enum xnn_status xnn_setup_fully_connected_nc_qd8_f16_qb4w(
  xnn_operator_t fully_connected_op,
  const int8_t* input,
  void* output,
  const struct xnn_dynamic_quantization_params* quantization_params);

/// Reshape the QD8/F16/QB4W fully connected operator for a new batch size.
enum xnn_status xnn_reshape_fully_connected_nc_qd8_f16_qb4w(
  xnn_operator_t fully_connected_op,
  size_t batch_size,
  pthreadpool_t threadpool);

// NOTE(review): a second, token-identical set of
// xnn_create/setup/reshape_fully_connected_nc_qd8_f32_qb4w declarations was
// removed here — they are already declared immediately above the f16 variants.

enum xnn_status xnn_create_fully_connected_nc_qd8_f16_qc8w(
size_t input_channels,
size_t output_channels,
Expand Down
10 changes: 10 additions & 0 deletions scripts/generate-qs8-gemm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -983,6 +983,16 @@ tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D DATATYPE=QC4_F32 -D AVX=2 -
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D DATATYPE=QC4_F32 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION= -o src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-3x8c8-minmax-avx2.c &
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=4 -D DATATYPE=QC4_F32 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION= -o src/qd8-f32-qc4w-gemm/gen/qd8-f32-qc4w-gemm-4x8c8-minmax-avx2.c &

tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=1 -D DATATYPE=QB4_F16 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION= -o src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-1x8c8-minmax-avx2.c &
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D DATATYPE=QB4_F16 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION= -o src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-2x8c8-minmax-avx2.c &
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D DATATYPE=QB4_F16 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION= -o src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-3x8c8-minmax-avx2.c &
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=4 -D DATATYPE=QB4_F16 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION= -o src/qd8-f16-qb4w-gemm/gen/qd8-f16-qb4w-gemm-4x8c8-minmax-avx2.c &

tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=1 -D DATATYPE=QB4_F32 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION= -o src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-1x8c8-minmax-avx2.c &
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D DATATYPE=QB4_F32 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION= -o src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-2x8c8-minmax-avx2.c &
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D DATATYPE=QB4_F32 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION= -o src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-3x8c8-minmax-avx2.c &
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=4 -D DATATYPE=QB4_F32 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION= -o src/qd8-f32-qb4w-gemm/gen/qd8-f32-qb4w-gemm-4x8c8-minmax-avx2.c &

tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=1 -D DATATYPE=QC8 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION=FP32 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-avx2.c &
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D DATATYPE=QC8 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION=FP32 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-avx2.c &
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D DATATYPE=QC8 -D AVX=2 -D PREFETCH=0 -D REQUANTIZATION=FP32 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-avx2.c &
Expand Down
2 changes: 2 additions & 0 deletions scripts/generate-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,11 @@ tools/generate-gemm-test.py --spec test/qu8-gemm-minmax-fp32.yaml --output-test
tools/generate-gemm-test.py --spec test/qu8-gemm-minmax-rndnu.yaml --output-test test/qu8-gemm-minmax-rndnu.cc --output-test test/qu8-gemm-minmax-rndnu-2.cc --output-bench bench/qu8-gemm-rndnu.cc &

tools/generate-gemm-test.py --spec test/qd8-f16-qc4w-gemm-minmax.yaml --output-test test/qd8-f16-qc4w-gemm-minmax.cc --output-test test/qd8-f16-qc4w-gemm-minmax-2.cc --output-test test/qd8-f16-qc4w-gemm-minmax-3.cc --output-test test/qd8-f16-qc4w-gemm-minmax-4.cc --output-bench bench/qd8-f16-qc4w-gemm.cc &
tools/generate-gemm-test.py --spec test/qd8-f16-qb4w-gemm-minmax.yaml --output-test test/qd8-f16-qb4w-gemm-minmax.cc --output-bench bench/qd8-f16-qb4w-gemm.cc &
tools/generate-gemm-test.py --spec test/qd8-f16-qc8w-gemm-minmax.yaml --output-test test/qd8-f16-qc8w-gemm-minmax.cc --output-test test/qd8-f16-qc8w-gemm-minmax-2.cc --output-test test/qd8-f16-qc8w-gemm-minmax-3.cc --output-test test/qd8-f16-qc8w-gemm-minmax-4.cc --output-bench bench/qd8-f16-qc8w-gemm.cc &
tools/generate-gemm-test.py --spec test/qd8-f32-qc8w-gemm-minmax.yaml --output-test test/qd8-f32-qc8w-gemm-minmax.cc --output-test test/qd8-f32-qc8w-gemm-minmax-2.cc --output-test test/qd8-f32-qc8w-gemm-minmax-3.cc --output-test test/qd8-f32-qc8w-gemm-minmax-4.cc --output-bench bench/qd8-f32-qc8w-gemm.cc &
tools/generate-gemm-test.py --spec test/qd8-f32-qc4w-gemm-minmax.yaml --output-test test/qd8-f32-qc4w-gemm-minmax.cc --output-test test/qd8-f32-qc4w-gemm-minmax-2.cc --output-test test/qd8-f32-qc4w-gemm-minmax-3.cc --output-test test/qd8-f32-qc4w-gemm-minmax-4.cc --output-bench bench/qd8-f32-qc4w-gemm.cc &
tools/generate-gemm-test.py --spec test/qd8-f32-qb4w-gemm-minmax.yaml --output-test test/qd8-f32-qb4w-gemm-minmax.cc --output-bench bench/qd8-f32-qb4w-gemm.cc &

tools/generate-gemm-test.py --spec test/qp8-f32-qc4w-gemm-minmax.yaml --output-test test/qp8-f32-qc4w-gemm-minmax.cc --output-bench bench/qp8-f32-qc4w-gemm.cc &

Expand Down
Loading