
Commit 552458d

[GPU] Support FP32 output for FC gemv for performance issue (#32710)
Modify code to allow FP32 output type for FC gemv.

### Description of the issue
- In the Gemma3-1b model, the Fully Connected (FC) layer was originally expected to use the GEMV kernel. However, due to [PR: Disable FP16 Compression for specific RMS patterns](#32414), some FC outputs were changed to FP32, which caused the kernel selector to choose the fc_bf_tiled kernel instead of fc_gemv, resulting in performance degradation.
- The FC kernel selector was configured to allow only FP16 output for gemv, which forced the fallback to the less efficient fc_bf_tiled kernel.
- After modifying the kernel selector to allow GEMV to be selected even when the output data type is FP32, the performance degradation was resolved.

#### Problematic graphs
<img width="651" height="300" alt="image" src="https://github.com/user-attachments/assets/f6d3d571-db8a-4f94-ade5-a8c028724920" />

#### Reproduction steps
- Reproduced by benchmark:
  `python benchmark.py -d GPU -m models/WW43_llm-optimum_2025.4.0-20264/gemma-3-1b-it/pytorch/ov/OV_FP16-4BIT_DEFAULT/ -pf repo-prompts/32_1024/gemma-3-1b-it.jsonl -n 1 --genai -mc 1 -ic 128 --apply_chat_template`

#### Checklist
- [x] Is it a proper fix?
- [x] Did you include a test case for this fix, if necessary?
- [x] Did you review existing tests that can be extended to cover this scenario? Passed llm_bench.

### Tickets
- CVS-175846
1 parent 9b61c6d commit 552458d

2 files changed: +107 −37 lines changed

### src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_gemv.cpp

2 additions & 1 deletion

```diff
@@ -17,6 +17,7 @@ ParamsKey FullyConnected_GEMV::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
     k.EnableInputWeightsType(WeightsType::INT4);
     k.EnableInputWeightsType(WeightsType::UINT4);
     k.EnableInputLayout(DataLayout::bf);
@@ -64,7 +65,7 @@ bool FullyConnected_GEMV::Validate(const Params& params) const {
     }

     // Data type re-check: only support f16:int4:f16
-    if (input.GetDType() != Datatype::F16 || output.GetDType() != Datatype::F16 ||
+    if (input.GetDType() != Datatype::F16 || (output.GetDType() != Datatype::F16 && output.GetDType() != Datatype::F32) ||
         (weights.GetDType() != WeightsType::INT4 && weights.GetDType() != WeightsType::UINT4)) {
         DO_NOT_USE_THIS_KERNEL(params.layerID);
     }
```
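The gate change above is small but is what unblocks fc_gemv for f32-output FCs. As a minimal standalone sketch of the relaxed check (simplified stand-in enums, not the plugin's actual headers):

```cpp
#include <cstdio>

// Hypothetical, simplified stand-ins for the kernel selector's enums;
// names mirror the real code, but this is an illustration, not the plugin source.
enum class Datatype { F16, F32, I8 };
enum class WeightsType { INT4, UINT4, INT8 };

// Sketch of the relaxed gate: GEMV is now usable when the activation is f16,
// the weights are (u)int4, and the output is either f16 or f32.
bool gemv_supported(Datatype input, Datatype output, WeightsType weights) {
    const bool input_ok   = input == Datatype::F16;
    const bool output_ok  = output == Datatype::F16 || output == Datatype::F32;
    const bool weights_ok = weights == WeightsType::INT4 || weights == WeightsType::UINT4;
    return input_ok && output_ok && weights_ok;
}

int main() {
    // Before the fix, the f32-output case would have been rejected, forcing the
    // selector to fall back to the slower fc_bf_tiled kernel.
    std::printf("f16 out: %d\n", gemv_supported(Datatype::F16, Datatype::F16, WeightsType::INT4));  // 1
    std::printf("f32 out: %d\n", gemv_supported(Datatype::F16, Datatype::F32, WeightsType::UINT4)); // 1
    std::printf("i8  out: %d\n", gemv_supported(Datatype::F16, Datatype::I8,  WeightsType::INT4));  // 0
}
```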

### src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp

105 additions & 36 deletions
```diff
@@ -1996,7 +1996,8 @@ class fully_connected_gpu_tests: public ::testing::Test {
                                          long int batch_num,
                                          long int scales_group_size = 128,
                                          bool is_uint4 = false,
-                                         bool is_wei_dyn = false) {
+                                         bool is_wei_dyn = false,
+                                         bool is_output_fp16 = true) {
         tests::random_generator rg(GET_SUITE_NAME);
         auto& engine = get_test_engine();
         auto supports_immad = engine.get_device_info().supports_immad;
@@ -2045,7 +2046,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
             "bias",
             "scale",
             dcomp_zp_name,
-            data_types::f16,
+            is_output_fp16 ? data_types::f16 : data_types::f32,
             2,
             2);
@@ -2127,14 +2128,26 @@ class fully_connected_gpu_tests: public ::testing::Test {
             ASSERT_TRUE(false);
         }

-        auto output_mem = outputs.begin()->second.get_memory();
-        cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());
+        if (is_output_fp16) {
+            auto output_mem = outputs.begin()->second.get_memory();
+            cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());

-        auto ref_output_mem = get_ref_results();
-        cldnn::mem_lock<ov::float16> output_ptr_ref(ref_output_mem, get_test_stream());
+            auto ref_output_mem = get_ref_results();
+            cldnn::mem_lock<ov::float16> output_ptr_ref(ref_output_mem, get_test_stream());
+
+            for (size_t i = 0; i < output_ptr_ref.size() / batch_num; i++) {
+                EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 30.0) << "i = " << i;
+            }
+        } else {
+            auto output_mem = outputs.begin()->second.get_memory();
+            cldnn::mem_lock<float> output_ptr(output_mem, get_test_stream());
+
+            auto ref_output_mem = get_ref_results();
+            cldnn::mem_lock<float> output_ptr_ref(ref_output_mem, get_test_stream());

-        for (size_t i = 0; i < output_ptr_ref.size() / batch_num; i++) {
-            EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 30.0) << "i = " << i;
+            for (size_t i = 0; i < output_ptr_ref.size() / batch_num; i++) {
+                EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 30.0) << "i = " << i;
+            }
         }
     }
```
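The fp16 and fp32 branches above differ solely in the element type locked from the output buffer (`ov::float16` vs `float`). A possible refactor, sketched here with `std::vector` standing in for `cldnn::mem_lock` so it stays self-contained (not the PR's code), would express the shared per-row comparison once as a template:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Sketch only: one templated helper instead of duplicated if/else bodies.
// T would be ov::float16 on the fp16 path and float on the fp32 path.
template <typename T>
bool near_per_row(const std::vector<T>& ref, const std::vector<T>& out,
                  size_t batch_num, double tol) {
    bool ok = true;
    for (size_t i = 0; i < ref.size() / batch_num; i++) {
        if (std::fabs(static_cast<double>(ref[i]) - static_cast<double>(out[i])) > tol) {
            std::printf("mismatch at i = %zu\n", i);
            ok = false;
        }
    }
    return ok;
}

int main() {
    // The f32 path of the test would instantiate near_per_row<float> with the
    // same tolerance (30.0 here) as the fp16 path.
    std::vector<float> ref{1.0f, 2.0f}, out{1.5f, 2.5f};
    std::printf("%s\n", near_per_row(ref, out, /*batch_num=*/1, /*tol=*/30.0) ? "ok" : "fail");
    return 0;
}
```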

```diff
@@ -2143,7 +2156,8 @@ class fully_connected_gpu_tests: public ::testing::Test {
                                          long int batch_num,
                                          long int scales_group_size = 128,
                                          bool is_uint4 = false,
-                                         bool is_wei_dyn = false) {
+                                         bool is_wei_dyn = false,
+                                         bool is_output_fp16 = true) {
         tests::random_generator rg(GET_SUITE_NAME);
         auto& engine = get_test_engine();
         auto supports_immad = engine.get_device_info().supports_immad;
@@ -2192,7 +2206,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
             "",
             "scale",
             "dcomp_zp",
-            data_types::f16,
+            is_output_fp16 ? data_types::f16 : data_types::f32,
             2,
             2);
@@ -2272,22 +2286,35 @@ class fully_connected_gpu_tests: public ::testing::Test {
             ASSERT_TRUE(false);
         }

-        auto output_mem = outputs.begin()->second.get_memory();
-        cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());
+        if (is_output_fp16) {
+            auto output_mem = outputs.begin()->second.get_memory();
+            cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());

-        auto ref_output_mem = get_ref_results();
-        cldnn::mem_lock<ov::float16> output_ptr_ref(ref_output_mem, get_test_stream());
+            auto ref_output_mem = get_ref_results();
+            cldnn::mem_lock<ov::float16> output_ptr_ref(ref_output_mem, get_test_stream());
+
+            for (size_t i = 0; i < output_ptr_ref.size() / batch_num; i++) {
+                EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 10.0) << "i = " << i;
+            }
+        } else {
+            auto output_mem = outputs.begin()->second.get_memory();
+            cldnn::mem_lock<float> output_ptr(output_mem, get_test_stream());
+
+            auto ref_output_mem = get_ref_results();
+            cldnn::mem_lock<float> output_ptr_ref(ref_output_mem, get_test_stream());

-        for (size_t i = 0; i < output_ptr_ref.size() / batch_num; i++) {
-            EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 10.0) << "i = " << i;
+            for (size_t i = 0; i < output_ptr_ref.size() / batch_num; i++) {
+                EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 10.0) << "i = " << i;
+            }
         }
     }

     void test_compressed_int4_scale_activation_gemv(bool is_caching_test,
                                                     bool is_dynamic,
                                                     long int batch_num,
                                                     long int scales_group_size = 128,
-                                                    bool is_wei_dyn = false) {
+                                                    bool is_wei_dyn = false,
+                                                    bool is_output_fp16 = true) {
         tests::random_generator rg(GET_SUITE_NAME);
         auto& engine = get_test_engine();
         auto supports_immad = engine.get_device_info().supports_immad;
@@ -2335,7 +2362,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
             "bias",
             "scale",
             dcomp_zp_name,
-            data_types::f16,
+            is_output_fp16? data_types::f16 : data_types::f32,
             2,
             2);
@@ -2411,20 +2438,32 @@ class fully_connected_gpu_tests: public ::testing::Test {
             }
         }

-        auto output_mem = outputs.begin()->second.get_memory();
-        cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());
+        if (is_output_fp16) {
+            auto output_mem = outputs.begin()->second.get_memory();
+            cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());

-        auto ref_output_mem = get_ref_results();
-        cldnn::mem_lock<ov::float16> output_ptr_ref(ref_output_mem, get_test_stream());
+            auto ref_output_mem = get_ref_results();
+            cldnn::mem_lock<ov::float16> output_ptr_ref(ref_output_mem, get_test_stream());

-        for (size_t i = 0; i < output_ptr_ref.size(); i++)
-            ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], 9.0) << "i = " << i;
+            for (size_t i = 0; i < output_ptr_ref.size(); i++)
+                ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], 9.0) << "i = " << i;
+        } else {
+            auto output_mem = outputs.begin()->second.get_memory();
+            cldnn::mem_lock<float> output_ptr(output_mem, get_test_stream());
+
+            auto ref_output_mem = get_ref_results();
+            cldnn::mem_lock<float> output_ptr_ref(ref_output_mem, get_test_stream());
+
+            for (size_t i = 0; i < output_ptr_ref.size(); i++)
+                ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], 9.0) << "i = " << i;
+        }
     }

     void test_compressed_int4_scale_large_n_gemv(bool is_caching_test,
                                                  bool is_dynamic,
                                                  long int batch_num,
-                                                 bool is_dyn_quan = false) {
+                                                 bool is_dyn_quan = false,
+                                                 bool is_output_fp16 = true) {
         tests::random_generator rg(GET_SUITE_NAME);
         auto& engine = get_test_engine();

@@ -2447,7 +2486,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
         auto input_data = rg.generate_random_1d<ov::float16>(batch_num * ifm_num, -1.0f, 1.0f);
         set_values(input_mem, input_data);

-        auto weigths_data = rg.generate_random_1d<uint8_t>(ofm_num * ifm_num / 2, 0, 10);
+        auto weigths_data = rg.generate_random_1d<uint8_t>(ofm_num * ifm_num / 2, 0, 5);
         set_values(weights_mem, weigths_data);

         auto scale_data = rg.generate_random_1d<ov::float16>(ofm_num * ifm_num / scales_group_size, -1.0f, 1.0f);
@@ -2470,7 +2509,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
             "",
             "scale",
             dcomp_zp_name,
-            data_types::f16,
+            is_output_fp16? data_types::f16 : data_types::f32,
             3,
             2);
@@ -2496,9 +2535,6 @@ class fully_connected_gpu_tests: public ::testing::Test {
         network.set_input_data("input", input_mem);

         auto outputs = network.execute();
-        // for (size_t i = 0; i < 100; i++) {
-        //     outputs = network.execute();
-        // }
         OPENVINO_ASSERT(outputs.size() == 1);
         OPENVINO_ASSERT(outputs.begin()->first == "fc_prim");
@@ -2559,14 +2595,26 @@ class fully_connected_gpu_tests: public ::testing::Test {
         ASSERT_EQ(outputs.size(), size_t(1));
         ASSERT_EQ(outputs.begin()->first, "fc_prim");

-        auto output_mem = outputs.begin()->second.get_memory();
-        cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());
+        if (is_output_fp16) {
+            auto output_mem = outputs.begin()->second.get_memory();
+            cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());

-        auto ref_output_mem = get_ref_results();
-        cldnn::mem_lock<ov::float16> output_ptr_ref(ref_output_mem, get_test_stream());
+            auto ref_output_mem = get_ref_results();
+            cldnn::mem_lock<ov::float16> output_ptr_ref(ref_output_mem, get_test_stream());

-        for (size_t i = 0; i < output_ptr_ref.size(); i++) {
-            EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 9.0) << "i = " << i;
+            for (size_t i = 0; i < output_ptr_ref.size(); i++) {
+                EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 9.0) << "i = " << i;
+            }
+        } else {
+            auto output_mem = outputs.begin()->second.get_memory();
+            cldnn::mem_lock<float> output_ptr(output_mem, get_test_stream());
+
+            auto ref_output_mem = get_ref_results();
+            cldnn::mem_lock<float> output_ptr_ref(ref_output_mem, get_test_stream());
+
+            for (size_t i = 0; i < output_ptr_ref.size(); i++) {
+                EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 9.0) << "i = " << i;
+            }
         }
     }
@@ -5053,6 +5101,27 @@ TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_dynamic_batch) {
     this->test_compressed_int4_scale_dynamic_batch_gemv(false, 128, false);
 }

+// Test for fp32 output
+TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_dynamic_b1g32_f32) {
+    this->test_compressed_int4_scale_gemv(false, true, 1, 32, false, false, false);
+}
+
+TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_b1g32_f32) {
+    this->test_compressed_int4_scale_gemv(false, false, 1, 32, false, false, false);
+}
+
+TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_relu_b1g128_f32) {
+    this->test_compressed_int4_scale_activation_gemv(false, false, 1, 128, false, false);
+}
+
+TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_large_n_b1_f32) {
+    this->test_compressed_int4_scale_large_n_gemv(false, false, 1, false, false);
+}
+
+TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_large_n_dynamic_b1_f32) {
+    this->test_compressed_int4_scale_large_n_gemv(false, true, 1, false, false);
+}
+
 // Test weight zp for INT8 ASYM
 TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_large_input_1025) {
     this->test_comp_weight_scale_zp(true, 1025, 1792, 4608, 128, 128, 1, WzpMode::AsymmetricScalar, WeightMode::Bit8, TargetDevice::SkipDgpu);
```
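The long positional argument lists in the new `*_f32` tests can be hard to scan; the sketch below is an illustrative annotation only (field names taken from the helper signatures above, not code from the PR), showing which boolean selects the fp32 output path:

```cpp
#include <cstdio>

// Illustrative only (not in the PR): naming the positional booleans that the
// new *_f32 tests pass to test_compressed_int4_scale_gemv.
struct GemvTestArgs {
    bool is_caching_test;
    bool is_dynamic;
    long batch_num;
    long scales_group_size;
    bool is_uint4;
    bool is_wei_dyn;
    bool is_output_fp16;  // false selects the fp32 output path exercised here
};

int main() {
    // gemv_compressed_int4_scale_dynamic_b1g32_f32 corresponds to:
    constexpr GemvTestArgs args{false, true, 1, 32, false, false, false};
    std::printf("is_output_fp16 = %d (fp32 output requested)\n", args.is_output_fp16);
}
```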
