From b1c14216add99261042ea8c1bff178a5129f3c65 Mon Sep 17 00:00:00 2001
From: Gregory Comer
Date: Sat, 29 Jun 2024 04:27:10 -0700
Subject: [PATCH 1/2] Add subgraph support for qb4w

---
 include/xnnpack.h              |  6 ++++++
 src/subgraph/fully-connected.c | 11 +++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/xnnpack.h b/include/xnnpack.h
index c3b81bf619d..6f8353a99dd 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -418,10 +418,16 @@ enum xnn_status xnn_define_channelwise_quantized_tensor_value_v2(
   uint32_t* id_out);
 
 /// Define a blockwise quantized tensor-type Value and add it to a Subgraph.
+<<<<<<< HEAD
 /// @param block_size - size of a block in the tensor with blockwise quantization parameters. Block is defined as
 /// number of input channel element per output channel.
 /// For Fully connected operators with 2d filters of size [output_channels, input_channels],
 /// expecting number of scale values to be = output_channels * (input_channels / block_size).
+=======
+/// @param block_size - size of a block in the tensor with blockwise quantization parameters. Block is defined as number of input channel element per output channel.
+/// For Fully connected operators with 2d filters of size [output_channels, input_channels], expecting number of scale values to be,
+/// = output_channels * (input_channels / block_size).
+>>>>>>> b976ff159 (Add subgraph support for qb4w)
 enum xnn_status xnn_define_blockwise_quantized_tensor_value(
   xnn_subgraph_t subgraph,
   enum xnn_datatype datatype,
diff --git a/src/subgraph/fully-connected.c b/src/subgraph/fully-connected.c
index 04c4c74609f..f43a7a9b3e0 100644
--- a/src/subgraph/fully-connected.c
+++ b/src/subgraph/fully-connected.c
@@ -796,7 +796,12 @@ static inline enum xnn_compute_type validate_datatypes_with_bias(
       }
       break;
     case xnn_datatype_qbint4:
-      if (input_datatype == xnn_datatype_qdint8 &&
+      if (input_datatype == xnn_datatype_fp32 &&
+          bias_datatype == xnn_datatype_fp32 &&
+          output_datatype == xnn_datatype_fp32)
+      {
+        return xnn_compute_type_fp32;
+      } else if (input_datatype == xnn_datatype_qdint8 &&
           bias_datatype == xnn_datatype_fp32 &&
           output_datatype == xnn_datatype_fp32) {
@@ -883,7 +888,9 @@ static inline enum xnn_compute_type validate_datatypes_without_bias(
       }
       break;
     case xnn_datatype_qbint4:
-      if (input_datatype == xnn_datatype_qdint8 && output_datatype == xnn_datatype_fp32) {
+      if (input_datatype == xnn_datatype_fp32 && output_datatype == xnn_datatype_fp32) {
+        return xnn_compute_type_fp32;
+      } else if (input_datatype == xnn_datatype_qdint8 && output_datatype == xnn_datatype_fp32) {
         return xnn_compute_type_qd8_to_fp32;
       } else if (input_datatype == xnn_datatype_qdint8 && output_datatype == xnn_datatype_fp16) {
         return xnn_compute_type_qd8_to_fp16;

From fbe3d5b52b1031f84b984c93590f96f9d0c2475d Mon Sep 17 00:00:00 2001
From: Gregory Comer
Date: Sun, 28 Jul 2024 03:50:44 -0700
Subject: [PATCH 2/2] Guard AVX512-VNNIGFNI kernel references in gemm-config behind XNNPACK_ENABLE_AVX512VNNIGFNI

---
 include/xnnpack.h         | 6 ------
 src/configs/gemm-config.c | 5 ++++-
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/include/xnnpack.h b/include/xnnpack.h
index 6f8353a99dd..c3b81bf619d 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -418,16 +418,10 @@ enum xnn_status xnn_define_channelwise_quantized_tensor_value_v2(
   uint32_t* id_out);
 
 /// Define a blockwise quantized tensor-type Value and add it to a Subgraph.
-<<<<<<< HEAD
 /// @param block_size - size of a block in the tensor with blockwise quantization parameters. Block is defined as
 /// number of input channel element per output channel.
 /// For Fully connected operators with 2d filters of size [output_channels, input_channels],
 /// expecting number of scale values to be = output_channels * (input_channels / block_size).
-=======
-/// @param block_size - size of a block in the tensor with blockwise quantization parameters. Block is defined as number of input channel element per output channel.
-/// For Fully connected operators with 2d filters of size [output_channels, input_channels], expecting number of scale values to be,
-/// = output_channels * (input_channels / block_size).
->>>>>>> b976ff159 (Add subgraph support for qb4w)
 enum xnn_status xnn_define_blockwise_quantized_tensor_value(
   xnn_subgraph_t subgraph,
   enum xnn_datatype datatype,
diff --git a/src/configs/gemm-config.c b/src/configs/gemm-config.c
index 59626ef93c2..28306797c01 100644
--- a/src/configs/gemm-config.c
+++ b/src/configs/gemm-config.c
@@ -1954,6 +1954,7 @@ static void init_qd8_f32_qb4w_gemm_config(void) {
   const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
   assert(hardware_config != NULL);
   // Zen4 has gfni but is slower and 8x16 works better on zen4. 14x16 is faster on Sapphire Rapids
+  #if XNN_ENABLE_AVX512VNNIGFNI
   if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnnigfni && cpuinfo_get_core(0)->uarch != cpuinfo_uarch_zen4) {
     qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni_prfm);
     qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(14)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni_prfm);
@@ -1962,7 +1963,9 @@ static void init_qd8_f32_qb4w_gemm_config(void) {
     qd8_f32_qb4w_gemm_config.nr = 16;
     qd8_f32_qb4w_gemm_config.log2_kr = 3;
     qd8_f32_qb4w_gemm_config.planes = 2;
-  } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
+  } else
+  #endif  // XNN_ENABLE_AVX512VNNIGFNI
+  if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
     qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm);
     qd8_f32_qb4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(8)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x16c8__avx512vnni_prfm);
     qd8_f32_qb4w_gemm_config.init.f32_qb4w = xnn_init_f32_qb4w_minmax_avx512vnni_params;
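
Note on the block_size documentation touched in patch 1: the expected number of scale values follows directly from the formula output_channels * (input_channels / block_size). The standalone C sketch below only illustrates that arithmetic; the filter shape and block size are hypothetical example values, not taken from the patch.

#include <stddef.h>
#include <stdio.h>

int main(void) {
  const size_t output_channels = 64;   // hypothetical example
  const size_t input_channels  = 256;  // hypothetical example
  const size_t block_size      = 32;   // must evenly divide input_channels

  // One scale per block of input channels, for every output channel:
  //   output_channels * (input_channels / block_size)
  const size_t num_scales = output_channels * (input_channels / block_size);
  printf("expected scale values: %zu\n", num_scales);  // 64 * (256 / 32) = 512
  return 0;
}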
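
Note on the gemm-config change in patch 2: the #if XNN_ENABLE_AVX512VNNIGFNI guard has to wrap both the VNNIGFNI branch and its trailing "} else" so that, when the feature is compiled out, the following if on use_x86_avx512vnni still parses as a standalone statement. The sketch below shows the same guard pattern in isolation; ENABLE_FAST_PATH, use_fast_path, and the init_* helpers are hypothetical stand-ins, not XNNPACK symbols.

#include <stdbool.h>
#include <stdio.h>

#ifndef ENABLE_FAST_PATH
#define ENABLE_FAST_PATH 1
#endif

static void init_fast_path(void)     { puts("fast path"); }
static void init_fallback_path(void) { puts("fallback path"); }

static void init_config(bool use_fast_path, bool use_fallback_path) {
  // When ENABLE_FAST_PATH is 0, the first branch and its trailing
  // "} else" compile away, and the fallback "if" stands on its own.
  #if ENABLE_FAST_PATH
  if (use_fast_path) {
    init_fast_path();
  } else
  #endif  // ENABLE_FAST_PATH
  if (use_fallback_path) {
    init_fallback_path();
  }
}

int main(void) {
  init_config(/*use_fast_path=*/true, /*use_fallback_path=*/true);
  return 0;
}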