diff --git a/compiler/circle-quantizer/CMakeLists.txt b/compiler/circle-quantizer/CMakeLists.txt
index 16e41a32746..8903eac7b50 100644
--- a/compiler/circle-quantizer/CMakeLists.txt
+++ b/compiler/circle-quantizer/CMakeLists.txt
@@ -4,7 +4,7 @@ if(NOT Jsoncpp_FOUND)
   return()
 endif(NOT Jsoncpp_FOUND)
 
-set (SOURCES src/CircleQuantizer.cpp)
+set (SOURCES src/CircleQuantizer.cpp src/QuantizeWeightsLLM.cpp)
 
 add_executable(circle-quantizer "${SOURCES}")
 target_include_directories(circle-quantizer PRIVATE ${Jsoncpp_INCLUDE_DIRS})
diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp
index f18642f9000..64ee6b4969e 100644
--- a/compiler/circle-quantizer/src/CircleQuantizer.cpp
+++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp
@@ -14,12 +14,15 @@
  * limitations under the License.
  */
 
+#include "QuantizeWeightsLLM.h"
+
 #include
 #include
 #include
 #include
 #include
 #include
+#include /* NOTE(review): header name was lost in extraction -- TODO restore before applying */
 
 #include
 #include
@@ -151,6 +154,7 @@ void print_exclusive_options(void)
   std::cout << " --requantize" << std::endl;
   std::cout << " --force_quantparam" << std::endl;
   std::cout << " --quantize_weights" << std::endl;
+  std::cout << " --quantize_weights_chunk" << std::endl;
   std::cout << " --quantize_onnx_fq_model" << std::endl;
 }
 
@@ -176,6 +180,8 @@ int entry(int argc, char **argv)
   const std::string fake_quant = "--fake_quantize";
   const std::string qw = "--quantize_weights";
   const std::string cfg = "--config";
+  const std::string qllm = "--quantize_weights_chunk";
+  const std::string skip_qllm = "--skip_chunkquant_size";
 
   const std::string tf_maxpool = "--TF-style_maxpool";
 
@@ -221,6 +227,20 @@ int entry(int argc, char **argv)
     .help("Convert a quantized model to a fake-quantized model. NOTE: This feature will "
           "generate an fp32 model.");
 
+  arser.add_argument(qllm)
+    .nargs(1)
+    .type(arser::DataType::STR)
+    .help("FullyConnected weight and Gather param quantization with chunk granularity. "
+          "One argument requires: type(Q4_0, Q8_0)");
+
+  arser.add_argument(skip_qllm)
+    .nargs(1)
+    .type(arser::DataType::INT32)
+    .default_value(0)
+    .help("Skip weight quantization with chunk granularity when "
+          "weight is smaller than specified elementsize. "
+          "One argument requires: size (default: 0)");
+
   arser.add_argument(rq)
     .nargs(2)
     .type(arser::DataType::STR_VEC)
@@ -289,6 +309,7 @@ int entry(int argc, char **argv)
   opt_used += arser[cq] ? 1 : 0;
   opt_used += arser[fake_quant] ? 1 : 0;
   opt_used += arser[qw] ? 1 : 0;
+  opt_used += arser[qllm] ? 1 : 0;
   opt_used += arser.get(qofm) ? 1 : 0;
   if (opt_used != 1)
   {
@@ -465,6 +486,68 @@ int entry(int argc, char **argv)
   if (arser[fake_quant])
     options->enable(Algorithms::ConvertToFakeQuantizedModel);
 
+  if (arser[qllm])
+  {
+    std::string input_path = arser.get<std::string>("input");
+    std::string output_path = arser.get<std::string>("output");
+    std::string type_str = arser.get<std::string>(qllm);
+    auto skip_length = arser.get<int32_t>(skip_qllm);
+    quantizer::QuantizeWeightsLLM::Type qtype = quantizer::QuantizeWeightsLLM::Type::Q4_0;
+    if (type_str == "Q8_0")
+      qtype = quantizer::QuantizeWeightsLLM::Type::Q8_0;
+    else if (type_str != "Q4_0")
+    {
+      std::cerr << "ERROR: Unsupported chunk quantization type" << std::endl;
+      return 255;
+    }
+    // Zero is accepted: it is the default and skips nothing
+    if (skip_length < 0)
+    {
+      std::cerr << "ERROR: Skip weight elementsize should not be negative" << std::endl;
+      return 255;
+    }
+
+    // Load model from the file
+    luci::ImporterEx importerex;
+    auto module = importerex.importVerifyModule(input_path);
+    if (module.get() == nullptr)
+      return EXIT_FAILURE;
+
+    // One visitor serves all nodes; named so that it does not shadow the 'qw' option name
+    quantizer::QuantizeWeightsLLM weight_quantizer(qtype, skip_length);
+
+    for (size_t idx = 0; idx < module->size(); ++idx)
+    {
+      auto graph = module->graph(idx);
+
+      // Weight quantization for LLM
+      for (auto node : loco::active_nodes(loco::output_nodes(graph)))
+      {
+        auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+        circle_node->accept(&weight_quantizer);
+      }
+
+      if (!luci::validate(graph))
+      {
+        std::cerr << "ERROR: Quantized graph is invalid" << std::endl;
+        return 255;
+      }
+    }
+
+    // Export to output Circle file
+    luci::CircleExporter exporter;
+
+    luci::CircleFileExpContract contract(module.get(), output_path);
+
+    if (!exporter.invoke(&contract))
+    {
+      std::cerr << "ERROR: Failed to export '" << output_path << "'" << std::endl;
+      return 255;
+    }
+
+    return 0;
+  }
+
   if (arser[qw])
   {
     auto values = arser.get>(qw);
diff --git a/compiler/circle-quantizer/src/QuantizeUtil.h b/compiler/circle-quantizer/src/QuantizeUtil.h
new file mode 100644
index 00000000000..0a636b636cf
--- /dev/null
+++ b/compiler/circle-quantizer/src/QuantizeUtil.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_QUANTIZE_QUANTIZE_UTIL_H
+#define LUCI_QUANTIZE_QUANTIZE_UTIL_H
+
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+// Copy from llama.cpp
+
+typedef uint16_t ggml_fp16_t;
+
+#define QK4_0 32
+typedef struct
+{
+  ggml_fp16_t d;         // delta
+  uint8_t qs[QK4_0 / 2]; // nibbles / quants
+} block_q4_0;
+
+#define QK8_0 32
+typedef struct
+{
+  ggml_fp16_t d;    // delta
+  int8_t qs[QK8_0]; // quants
+} block_q8_0;
+
+union block_q4_0_u {
+  uint8_t u8[sizeof(block_q4_0)];
+  block_q4_0 b;
+};
+
+union block_q8_0_u {
+  uint8_t u8[sizeof(block_q8_0)];
+  block_q8_0 b;
+};
+
+// Return the bit pattern of a float as uint32_t
+static inline uint32_t fp32_to_bits(float f)
+{
+  union {
+    float as_value;
+    uint32_t as_bits;
+  } fp32;
+  fp32.as_value = f;
+  return fp32.as_bits;
+}
+
+// Return the float whose bit pattern is w
+static inline float fp32_from_bits(uint32_t w)
+{
+  union {
+    uint32_t as_bits;
+    float as_value;
+  } fp32;
+  fp32.as_bits = w;
+  return fp32.as_value;
+}
+
+// Convert a float to its 16-bit ggml_fp16_t encoding (sign ends up in bit 15).
+// NaN input (exponent all ones, non-zero mantissa) yields 0x7E00 plus the sign bit.
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f)
+{
+  const float scale_to_inf = 0x1.0p+112f;
+  const float scale_to_zero = 0x1.0p-110f;
+
+  float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+  const uint32_t w = fp32_to_bits(f);
+  const uint32_t shl1_w = w + w;
+  const uint32_t sign = w & UINT32_C(0x80000000);
+  uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+  if (bias < UINT32_C(0x71000000))
+  {
+    bias = UINT32_C(0x71000000);
+  }
+
+  base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+  const uint32_t bits = fp32_to_bits(base);
+  const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+  const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+  const uint32_t nonsign = exp_bits + mantissa_bits;
+  return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+}
+
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// NOTE The functions below are defined in this header, so they are 'inline'
+//      to keep a single definition when several sources include it.
+
+// Quantize k floats from x into k / QK4_0 blocks in y (k must be a multiple of QK4_0).
+// Per block: d = (value with the largest magnitude) / -8, stored as fp16;
+// each value is scaled by 1/d (0 when d is 0), offset by 8.5, truncated and capped at 15.
+// Element j goes to the low nibble of qs[j], element j + QK4_0 / 2 to the high nibble.
+inline void quantize_row_q4_0_reference(const float *x, block_q4_0 *y, int k)
+{
+  static const int qk = QK4_0;
+
+  assert(k % qk == 0);
+
+  const int nb = k / qk;
+
+  for (int i = 0; i < nb; i++)
+  {
+    float amax = 0.0f; // absolute max
+    float max = 0.0f;
+
+    for (int j = 0; j < qk; j++)
+    {
+      const float v = x[i * qk + j];
+      if (amax < fabsf(v))
+      {
+        amax = fabsf(v);
+        max = v;
+      }
+    }
+
+    const float d = max / -8;
+    const float id = d ? 1.0f / d : 0.0f;
+
+    y[i].d = GGML_FP32_TO_FP16(d);
+
+    for (int j = 0; j < qk / 2; ++j)
+    {
+      const float x0 = x[i * qk + 0 + j] * id;
+      const float x1 = x[i * qk + qk / 2 + j] * id;
+
+      const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
+      const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
+
+      y[i].qs[j] = xi0;
+      y[i].qs[j] |= xi1 << 4;
+    }
+  }
+}
+
+// Quantize n floats from src into Q4_0 blocks at dst, one row of k values at a time.
+// k must be a multiple of QK4_0; every row reads k values, so n should be a multiple of k.
+// Returns the size in bytes of n / QK4_0 blocks.
+inline size_t ggml_quantize_q4_0(const float *src, void *dst, int n, int k)
+{
+  assert(k % QK4_0 == 0);
+
+  for (int b = 0; b < n; b += k)
+  {
+    block_q4_0 *y = static_cast<block_q4_0 *>(dst) + b / QK4_0;
+
+    quantize_row_q4_0_reference(src + b, y, k);
+  }
+
+  return (n / QK4_0 * sizeof(block_q4_0));
+}
+
+// Quantize k floats from x into k / QK8_0 blocks in y (k must be a multiple of QK8_0).
+// Per block: d = (largest magnitude) / 127, stored as fp16;
+// each value is scaled by 1/d (0 when d is 0) and rounded with roundf.
+inline void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k)
+{
+  assert(k % QK8_0 == 0);
+  const int nb = k / QK8_0;
+
+  for (int i = 0; i < nb; i++)
+  {
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++)
+    {
+      const float v = x[i * QK8_0 + j];
+      amax = MAX(amax, fabsf(v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f / d : 0.0f;
+
+    y[i].d = GGML_FP32_TO_FP16(d);
+
+    for (int j = 0; j < QK8_0; ++j)
+    {
+      const float x0 = x[i * QK8_0 + j] * id;
+
+      y[i].qs[j] = roundf(x0);
+    }
+  }
+}
+
+// Q8_0 counterpart of ggml_quantize_q4_0: same row-by-row traversal of src.
+// Returns the size in bytes of n / QK8_0 blocks.
+inline size_t ggml_quantize_q8_0(const float *src, void *dst, int n, int k)
+{
+  assert(k % QK8_0 == 0);
+
+  for (int b = 0; b < n; b += k)
+  {
+    block_q8_0 *y = static_cast<block_q8_0 *>(dst) + b / QK8_0;
+
+    quantize_row_q8_0_reference(src + b, y, k);
+  }
+
+  return (n / QK8_0 * sizeof(block_q8_0));
+}
+
+#endif // LUCI_QUANTIZE_QUANTIZE_UTIL_H
diff --git a/compiler/circle-quantizer/src/QuantizeWeightsLLM.cpp b/compiler/circle-quantizer/src/QuantizeWeightsLLM.cpp
new file mode 100644
index 00000000000..2209dfb08ac
--- /dev/null
+++ b/compiler/circle-quantizer/src/QuantizeWeightsLLM.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "QuantizeWeightsLLM.h"
+
+#include "QuantizeUtil.h"
+
+#include <luci/Service/Nodes/CircleConst.h>
+
+namespace
+{
+
+using Type = quantizer::QuantizeWeightsLLM::Type;
+
+// True if 'node' carries quantization parameters or its dtype is not FLOAT32
+bool is_quantized(const luci::CircleConst *node)
+{
+  return node->quantparam() != nullptr || (node->dtype() != loco::DataType::FLOAT32);
+}
+
+// Number of elements of 'node': the product of all of its dimensions
+size_t elementsize(const luci::CircleConst *node)
+{
+  size_t elems = 1;
+  for (uint32_t i = 0; i < node->rank(); ++i)
+    elems *= node->dim(i).value();
+  return elems;
+}
+
+// True if the innermost dimension of 'node' splits into whole chunks of 'chunk' values.
+// Rank-0 constants are rejected so that 'rank() - 1' cannot wrap around.
+bool is_chunk_aligned(const luci::CircleConst *node, uint32_t chunk)
+{
+  return node->rank() > 0 && node->dim(node->rank() - 1).value() % chunk == 0;
+}
+
+/**
+ * @brief Clone 'node' and fill the clone with the chunk-quantized blocks of its data
+ *
+ * @tparam BlockU  union exposing one block as raw bytes (u8) and as a struct (b)
+ * @tparam QK      number of float values packed into one block
+ * @tparam DT      data type set on the returned constant
+ * @param node     FLOAT32 constant whose innermost dimension is a multiple of QK
+ * @param quantize routine that quantizes QK floats into one block
+ * @return the clone, holding sizeof(BlockU) bytes per block written through a U8 view
+ *
+ * @note No shape is changed here; only the buffer and the dtype of the clone are.
+ */
+template <typename BlockU, int QK, loco::DataType DT>
+luci::CircleConst *quantize_block(luci::CircleConst *node,
+                                  size_t (*quantize)(const float *, void *, int, int))
+{
+  assert(is_chunk_aligned(node, QK));
+
+  auto new_weights = luci::clone(node);
+
+  // Get num of block
+  const size_t blocks = elementsize(node) / QK;
+
+  // Set data for each block
+  BlockU block;
+  // Fake data type for resize and write
+  new_weights->dtype(loco::DataType::U8);
+  new_weights->size<loco::DataType::U8>(sizeof(block) * blocks);
+  for (size_t i = 0; i < blocks; ++i)
+  {
+    // Read float data
+    float data[QK];
+    for (int j = 0; j < QK; ++j)
+      data[j] = node->at<loco::DataType::FLOAT32>(i * QK + j);
+
+    quantize(data, &block.b, QK, QK);
+
+    for (size_t j = 0; j < sizeof(block.u8); ++j)
+      new_weights->at<loco::DataType::U8>(i * sizeof(block.u8) + j) = block.u8[j];
+  }
+
+  // Set real data type
+  new_weights->dtype(DT);
+
+  return new_weights;
+}
+
+luci::CircleConst *quantize_q8_block(luci::CircleConst *node)
+{
+  return quantize_block<block_q8_0_u, QK8_0, loco::DataType::S8>(node, ggml_quantize_q8_0);
+}
+
+luci::CircleConst *quantize_q4_block(luci::CircleConst *node)
+{
+  return quantize_block<block_q4_0_u, QK4_0, loco::DataType::U4>(node, ggml_quantize_q4_0);
+}
+
+// Return a chunk-quantized clone of 'node', or nullptr when 'node' has to stay as it is:
+// it is not a CircleConst, is already quantized, has fewer than 'skip_length' elements,
+// or its innermost dimension is not a multiple of the chunk size.
+luci::CircleConst *quantize_or_null(loco::Node *node, Type type, int32_t skip_length)
+{
+  auto weights = dynamic_cast<luci::CircleConst *>(node);
+  if (weights == nullptr || is_quantized(weights))
+    return nullptr;
+  // Compare as size_t only for a positive threshold so a negative one cannot wrap around
+  if (skip_length > 0 && elementsize(weights) < static_cast<size_t>(skip_length))
+    return nullptr;
+  const bool q4 = (type == Type::Q4_0);
+  if (!is_chunk_aligned(weights, q4 ? QK4_0 : QK8_0))
+    return nullptr;
+
+  return q4 ? quantize_q4_block(weights) : quantize_q8_block(weights);
+}
+
+} // namespace
+
+namespace quantizer
+{
+
+// Swap in chunk-quantized weights when the current ones are eligible
+void QuantizeWeightsLLM::visit(luci::CircleFullyConnected *node)
+{
+  auto new_weights = quantize_or_null(node->weights(), _quant_type, _skip_length);
+  if (new_weights != nullptr)
+    node->weights(new_weights);
+}
+
+// Swap in chunk-quantized params (input 0) when the current ones are eligible
+void QuantizeWeightsLLM::visit(luci::CircleGather *node)
+{
+  auto new_params = quantize_or_null(node->params(), _quant_type, _skip_length);
+  if (new_params != nullptr)
+    node->params(new_params);
+}
+
+// Every other operator is left untouched
+void QuantizeWeightsLLM::visit(luci::CircleNode *) {}
+
+} // namespace quantizer
diff --git a/compiler/circle-quantizer/src/QuantizeWeightsLLM.h b/compiler/circle-quantizer/src/QuantizeWeightsLLM.h
new file mode 100644
index 00000000000..2b908573875
--- /dev/null
+++ b/compiler/circle-quantizer/src/QuantizeWeightsLLM.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_QUANTIZE_WEIGHTS_LLM_H
+#define LUCI_QUANTIZE_WEIGHTS_LLM_H
+
+#include <luci/IR/CircleNodeVisitor.h>
+
+namespace quantizer
+{
+
+// Visitor that chunk-quantizes (Q4_0 / Q8_0) FullyConnected weights and Gather params.
+// Constants with fewer than skip_length elements are left unchanged.
+class QuantizeWeightsLLM : public luci::CircleNodeMutableVisitor<void>
+{
+public:
+  enum Type
+  {
+    Q4_0,
+    Q8_0
+  };
+
+  QuantizeWeightsLLM(Type type, int32_t skip_length) : _quant_type(type), _skip_length(skip_length)
+  {
+  }
+
+private:
+  void visit(luci::CircleFullyConnected *node) override;
+  void visit(luci::CircleGather *node) override;
+  void visit(luci::CircleNode *) override;
+
+  Type _quant_type;
+  int32_t _skip_length;
+};
+
+} // namespace quantizer
+
+#endif // LUCI_QUANTIZE_WEIGHTS_LLM_H
diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp
index 0022a0e57e8..1d6cea3681a 100644
--- a/compiler/luci/export/src/CircleTensorExporter.cpp
+++ b/compiler/luci/export/src/CircleTensorExporter.cpp
@@ -434,6 +434,29 @@ flatbuffers::Offset encodeOpBufferPack4bit(FlatBufferBuilder &bu
   return CreateBuffer(builder, array_offset);
 }
 
+/**
+ * @brief Export a chunk-quantized ("blocked") constant as a raw byte buffer
+ *
+ * @note  Every element read through at<DT>() is written as one uint8 value
+ */
+template <loco::DataType DT>
+flatbuffers::Offset<circle::Buffer> encodeOpBufferBlocked(FlatBufferBuilder &builder,
+                                                          luci::CircleConst *c)
+{
+  // native type is uint8
+
+  const uint32_t size = c->size<DT>();
+  std::vector<uint8_t> raw_data;
+  raw_data.reserve(size);
+  for (uint32_t i = 0; i < size; ++i)
+  {
+    raw_data.push_back(static_cast<uint8_t>(c->at<DT>(i)));
+  }
+
+  auto array_offset = builder.CreateVector(raw_data.data(), size);
+  return CreateBuffer(builder, array_offset);
+}
+
 template <>
 flatbuffers::Offset encodeOpBuffer(FlatBufferBuilder &builder, luci::CircleConst *c)
 {
@@ -444,6 +467,8 @@ flatbuffers::Offset encodeOpBuffer(FlatBufferBuilder &builder, l
     case loco::DataType::S4:
       return encodeOpBufferPack4bit(builder, c);
     case loco::DataType::S8:
+      if (c->quantparam() == nullptr)
+        return encodeOpBufferBlocked<loco::DataType::S8>(builder, c);
       return encodeOpBufferByDType(builder, c);
     case loco::DataType::S16:
       return encodeOpBufferByDType(builder, c);
@@ -452,6 +477,8 @@ flatbuffers::Offset encodeOpBuffer(FlatBufferBuilder &builder, l
     case loco::DataType::S64:
       return encodeOpBufferByDType(builder, c);
     case loco::DataType::U4:
+      if (c->quantparam() == nullptr)
+        return encodeOpBufferBlocked<loco::DataType::U4>(builder, c);
       return encodeOpBufferPack4bit(builder, c);
     case loco::DataType::U8:
       return encodeOpBufferByDType(builder, c);
diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
index 63eb4fb7e37..628b306b782 100644
--- a/compiler/luci/service/src/CircleShapeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
@@ -19,6 +19,7 @@
 #include "Check.h"
 
 #include "CircleShapeInferenceHelper.h"
+#include "CircleTypeInferenceHelper.h"
 #include "ShapeInfer_StridedSlice.h"
 
 #include
diff --git a/compiler/luci/service/src/CircleTypeInferenceRule.cpp b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
index 78dde1004b5..e21e026629d 100644
--- a/compiler/luci/service/src/CircleTypeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
@@ -174,6 +174,11 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitorparams()) == loco::DataType::U4 ||
+        luci::dtype_get(node->params()) == loco::DataType::S8)
+      return loco::DataType::FLOAT32;
+
     return luci::dtype_get(node->params());
   }