Skip to content

Commit

Permalink
Merge branch 'master' into feature/cpu_random_uniform_alignment
Browse files Browse the repository at this point in the history
  • Loading branch information
mlukasze authored Oct 23, 2024
2 parents 34d086c + 2fc59f5 commit 77d5c60
Show file tree
Hide file tree
Showing 13 changed files with 112 additions and 62 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def process_coveo_meta(meta, url, link):
namespace_element = ET.SubElement(url, namespace)

for tag_name, tag_value in values.items():
if tag_name == 'ovcategory':
if tag_name == 'ovdoctype':
processed_link = process_link(link)
ET.SubElement(namespace_element, tag_name).text = processed_link
else:
Expand Down
2 changes: 2 additions & 0 deletions docs/sphinx_setup/_static/js/custom.js
Original file line number Diff line number Diff line change
Expand Up @@ -417,13 +417,15 @@ document.addEventListener('DOMContentLoaded', function () {
await searchInterfaceSa.initialize({
accessToken: "xx1f2aebd3-4307-4632-aeea-17c13378b237",
organizationId: "intelcorporationnonproduction2ybdyblf7",
organizationEndpoints: await searchInterface.getOrganizationEndpoints('intelcorporationnonproduction2ybdyblf7')
});
searchInterfaceSa.executeFirstSearch();
}
if (searchInterface) {
await searchInterface.initialize({
accessToken: "xx1f2aebd3-4307-4632-aeea-17c13378b237",
organizationId: "intelcorporationnonproduction2ybdyblf7",
organizationEndpoints: await searchInterface.getOrganizationEndpoints('intelcorporationnonproduction2ybdyblf7')
});
searchInterface.executeFirstSearch();
}
Expand Down
2 changes: 1 addition & 1 deletion docs/sphinx_setup/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
ov_sitemap_meta = [
('coveo:metadata', {
'ovversion': version_name,
'ovcategory': 'null'
'ovdoctype': 'null'
})
]

Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/thirdparty/ComputeLibrary
Submodule ComputeLibrary updated 143 files
Original file line number Diff line number Diff line change
Expand Up @@ -952,6 +952,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(

// Calculate zero-point and scale only for DECOMPRESSION_SCALE_POST_OP enabled
// Calculate weight : w = (w - dzp) * ds
// if DECOMPRESSION_ZP_TERM is not enabled, then dzp is ACCUMULATOR_VAL_ZERO.
#if DECOMPRESSION_ZP_TERM
#if DECOMPRESSION_ZP_SCALAR
DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(DECOMPRESSION_ZP_VALUE);
Expand All @@ -976,8 +977,6 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
}
}
#endif
#else
DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(ACCUMULATOR_VAL_ZERO);
#endif

#if FILTER_LOAD_BLOCK_SIZE == 2
Expand Down Expand Up @@ -1026,7 +1025,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(

weights_offset += TILE_K_OFM_PACKED * TILE_OFM_PER_OSV_SIZE * SIMD;

#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
#if DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
const uint offset_ofm = out_f + fi*SIMD + sglid;
Expand All @@ -1046,7 +1045,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
#endif
} // Whole tile_k elements of each iteration : ki

#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
#if DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
// Dynamic-quantizing group size set to same or smaller than scale group size
if ((ni % NUM_LOOP_IN_DYN_QUAN_GROUP) == (NUM_LOOP_IN_DYN_QUAN_GROUP - 1)) {
const uint ni_offset = ((ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
Expand Down Expand Up @@ -1175,7 +1174,7 @@ KERNEL(fc)(
#endif
) {
#if USE_SLM
#if DYNAMIC_QUANTIZE && (TILE_OFM == 2)
#if DYNAMIC_QUANTIZE
__local int dq_wei_local_mem[SIMD * TILE_OFM * SIMD];
#else
__local ACCUMULATOR_TYPE wei_local_mem[TILE_IFM * SIMD * TILE_OFM * SIMD];
Expand Down Expand Up @@ -1317,7 +1316,7 @@ KERNEL(fc)(
#endif
);
} else {
#if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
#if USE_SLM && DYNAMIC_QUANTIZE
FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
OPTIONAL_SHAPE_INFO_TENSOR
input,
Expand Down Expand Up @@ -1364,7 +1363,7 @@ KERNEL(fc)(
#endif
}
#else
#if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
#if USE_SLM && DYNAMIC_QUANTIZE
FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
OPTIONAL_SHAPE_INFO_TENSOR
input,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,9 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params,

if (params.weights.GetDType() == WeightsType::UINT4 || params.weights.GetDType() == WeightsType::INT4) {
if (!params.is_shape_agnostic && batch == 1) {
if (should_dynamic_quantize(params))
return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT));

// Tuning for Meteor Lake
if (is_weight_vertical(params, output_f)) {
if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) {
Expand Down Expand Up @@ -616,7 +619,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
// Validated perf gain: dynamic quantization force-enables SCALE_POST_OP for char-type multiplication
if (should_dynamic_quantize(params)) {
jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1));
jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1));
jit.AddConstant(MakeJitConstant("DQ_TYPE", "char"));
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
namespace ov {
namespace intel_gpu {

ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed(bool convert_u4zp_to_u8) {
ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed() {
using namespace ov::pass::pattern;

auto compressed_constant = [](const ov::Output<ov::Node>& output) {
Expand Down Expand Up @@ -81,6 +81,12 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
bool has_transpose = pattern_map.count(transpose_m);
auto scale_shape = pattern_map.at(mul_const_m).get_shape();
bool grouped = std::count_if(scale_shape.begin(), scale_shape.end(), [](size_t d) { return d > 1; }) > 1;
bool sub_with_convert = (pattern_map.count(sub_with_convert_m) > 0) ? true : false;

auto weight_ptr = std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(weights_m).get_node_shared_ptr());
bool weight_u8 = false;
if (weight_ptr->get_element_type() == ov::element::u8 || weight_ptr->get_element_type() == ov::element::i8)
weight_u8 = true;

auto reshape_const_to_2d = [has_transpose, grouped](std::shared_ptr<ov::Node> node) {
auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
Expand All @@ -97,11 +103,17 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
return std::make_shared<ov::op::v0::Constant>(*constant, new_shape);
};

auto convert_u4const_to_u8 = [convert_u4zp_to_u8](std::shared_ptr<ov::Node> node) {
auto convert_const_to_u8 = [&](std::shared_ptr<ov::Node> node) {
auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
if (constant->get_element_type() != ov::element::u4 || !convert_u4zp_to_u8)
// Convert ZP to u8
if (constant->get_element_type() == ov::element::u8)
return std::dynamic_pointer_cast<ov::Node>(constant);
return std::dynamic_pointer_cast<ov::Node>(std::make_shared<ov::op::v0::Convert>(node, ov::element::u8));
if (constant->get_element_type() == ov::element::u4)
return std::dynamic_pointer_cast<ov::Node>(std::make_shared<ov::op::v0::Convert>(node, ov::element::u8));
if (weight_u8 && sub_with_convert)
return std::dynamic_pointer_cast<ov::Node>(std::make_shared<ov::op::v0::Convert>(node, ov::element::u8));

return std::dynamic_pointer_cast<ov::Node>(constant);
};


Expand All @@ -111,8 +123,7 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon

const bool with_zero_point = pattern_map.count(sub_no_convert_m) > 0 || pattern_map.count(sub_with_convert_m) > 0;
if (with_zero_point) {
// WA: Convert ZP to u8 for OneDNN case to avoid u4 reorder
optional_zero_point = convert_u4const_to_u8(reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr()));
optional_zero_point = convert_const_to_u8(reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr()));
}

std::shared_ptr<ov::Node> fc_input_b = reshape_const_to_2d(pattern_map.at(weights_m).get_node_shared_ptr());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ namespace intel_gpu {
// Matcher pass derived from ov::pass::MatcherPass.
// Presumably rewrites FullyConnected nodes with compressed weights into a
// FullyConnectedCompressed form — inferred from the class name; confirm against the .cpp.
class ConvertFullyConnectedToFullyConnectedCompressed: public ov::pass::MatcherPass {
public:
// Registers the runtime type info for this pass under the given name and version string "0".
OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedCompressed", "0");
// NOTE(review): two constructor declarations are visible here. They look like the
// before/after lines of a diff rather than intentional overloads — confirm which one is current.
// If both were kept, a call with no arguments would be ambiguous between them.
// Variant taking a flag; convert_u4zp_to_u8 defaults to false.
ConvertFullyConnectedToFullyConnectedCompressed(bool convert_u4zp_to_u8 = false);
// Variant taking no arguments.
ConvertFullyConnectedToFullyConnectedCompressed();
};

} // namespace intel_gpu
Expand Down
9 changes: 5 additions & 4 deletions src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -810,7 +810,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::intel_gpu::ClampFP16Output>();
manager.register_pass<ov::intel_gpu::ConvertMatMulToFullyConnected>();
manager.register_pass<ov::intel_gpu::MoveFCReshapeToWeights>();
manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>(device_info.supports_immad);
manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>();

bool disable_horizontal_fc_fusion = false;
GPU_DEBUG_GET_INSTANCE(debug_config);
Expand All @@ -819,10 +819,11 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {

if (!disable_horizontal_fc_fusion)
manager.register_pass<ov::intel_gpu::FullyConnectedHorizontalFusion>();

// ZP should not be folded for FC. But still, ZP should be folded for Gather.
// Therefore, run MarkDequantizationSubgraph again to fold ZP constant.
manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, true);
if (device_info.supports_immad) {
// For OneDNN, ZP should not be folded for FC. But still, ZP should be folded for Gather.
// Therefore, run MarkDequantizationSubgraph again to fold ZP constant.
manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, true);
if (disable_horizontal_fc_fusion)
manager.register_pass<ov::pass::ConstantFolding>();
}
Expand Down
Loading

0 comments on commit 77d5c60

Please sign in to comment.