From 34f78311f41751efe1095a9f0c1034ed2f37cd5f Mon Sep 17 00:00:00 2001 From: "Durandin, Pavel" Date: Wed, 8 Jan 2025 21:17:14 +0400 Subject: [PATCH] [GPU] Fix oob in fc bf tiled for BMG --- .../cl_kernels/fully_connected_gpu_bf_tiled.cl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index 8b351e133d83ad..4d255ebeff6ebd 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -352,17 +352,18 @@ inline void FUNC(fc_bf_tiled_kernel_default)( // ===================================================================================================================================== // Main computation loop uint iterations = MAIN_LOOP_ELEMENTS_COUNT / (TILE_IFM * SIMD); + const uint B_PITCH_MAX = min((uint)((BATCH_SIZE-out_b) * (SIMD/INPUT0_TYPE_SIZE)), (uint)TILE_IN_B_PITCH); __attribute__((opencl_unroll_hint(1))) for (uint ni = 0; ni < iterations; ++ni) { // Load input. #define LOAD_IN_0(bi) do { \ in_0[bi] = INPUT_BLOCK_READ(input, input_offset); \ - input_offset += TILE_IN_B_PITCH; \ + input_offset += B_PITCH_MAX; \ } while (false) CONST_LOOP(TILE_B, LOAD_IN_0); #undef LOAD_IN_0 - input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * TILE_B; + input_offset += TILE_IFM * SIMD - B_PITCH_MAX * TILE_B; // NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes, // but significantly degrades readability and generality of code. // It doesn't also show noticable performance improvement on tested configurations.