From 621f3f6ead30bb51211a6db46a68328766744c3e Mon Sep 17 00:00:00 2001
From: Hyeongseok Oh
Date: Mon, 19 Aug 2024 18:43:51 +0900
Subject: [PATCH] PoC: Blockwise weight quantization tool for LLM

- Blockwise quantization for LLM: FullyConnected, Gather
- Select the quantization type via the circle-quantizer parameter
  `--quantize_weights_chunk` (Q4_0, Q8_0)
- Skip quantization of small weights via the circle-quantizer parameter
  `--skip_chunkquant_size` (default: 0)
---
 compiler/circle-quantizer/CMakeLists.txt      |   2 +-
 .../circle-quantizer/src/CircleQuantizer.cpp  |  80 +++++++
 compiler/circle-quantizer/src/QuantizeUtil.h  | 222 ++++++++++++++++++
 .../src/QuantizeWeightsLLM.cpp                | 149 ++++++++++++
 .../circle-quantizer/src/QuantizeWeightsLLM.h |  51 ++++
 .../luci/export/src/CircleTensorExporter.cpp  |  27 +++
 .../service/src/CircleShapeInferenceRule.cpp  |   1 +
 .../service/src/CircleTypeInferenceRule.cpp   |   5 +
 8 files changed, 536 insertions(+), 1 deletion(-)
 create mode 100644 compiler/circle-quantizer/src/QuantizeUtil.h
 create mode 100644 compiler/circle-quantizer/src/QuantizeWeightsLLM.cpp
 create mode 100644 compiler/circle-quantizer/src/QuantizeWeightsLLM.h

diff --git a/compiler/circle-quantizer/CMakeLists.txt b/compiler/circle-quantizer/CMakeLists.txt
index 16e41a32746..8903eac7b50 100644
--- a/compiler/circle-quantizer/CMakeLists.txt
+++ b/compiler/circle-quantizer/CMakeLists.txt
@@ -4,7 +4,7 @@ if(NOT Jsoncpp_FOUND)
   return()
 endif(NOT Jsoncpp_FOUND)
 
-set (SOURCES src/CircleQuantizer.cpp)
+set (SOURCES src/CircleQuantizer.cpp src/QuantizeWeightsLLM.cpp)
 
 add_executable(circle-quantizer "${SOURCES}")
 target_include_directories(circle-quantizer PRIVATE ${Jsoncpp_INCLUDE_DIRS})
diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp
index f18642f9000..64ee6b4969e 100644
--- a/compiler/circle-quantizer/src/CircleQuantizer.cpp
+++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp
@@ -14,12 +14,15 @@
  * limitations under the License.
  */
 
+#include "QuantizeWeightsLLM.h"
+
 #include <luci/ImporterEx.h>
 #include <luci/CircleQuantizer.h>
 #include <luci/Service/Validate.h>
 #include <luci/CircleExporter.h>
 #include <luci/CircleFileExpContract.h>
 #include <luci/UserSettings.h>
+#include <loco.h>
 
 #include <arser/arser.h>
 #include <vconone/vconone.h>
@@ -151,6 +154,7 @@ void print_exclusive_options(void)
   std::cout << "    --requantize" << std::endl;
   std::cout << "    --force_quantparam" << std::endl;
   std::cout << "    --quantize_weights" << std::endl;
+  std::cout << "    --quantize_weights_chunk" << std::endl;
   std::cout << "    --quantize_onnx_fq_model" << std::endl;
 }
 
@@ -176,6 +180,8 @@ int entry(int argc, char **argv)
   const std::string fake_quant = "--fake_quantize";
   const std::string qw = "--quantize_weights";
   const std::string cfg = "--config";
+  const std::string qllm = "--quantize_weights_chunk";
+  const std::string skip_qllm = "--skip_chunkquant_size";
 
   const std::string tf_maxpool = "--TF-style_maxpool";
 
@@ -221,6 +227,20 @@ int entry(int argc, char **argv)
     .help("Convert a quantized model to a fake-quantized model. NOTE: This feature will "
           "generate an fp32 model.");
 
+  arser.add_argument(qllm)
+    .nargs(1)
+    .type(arser::DataType::STR)
+    .help("Quantize FullyConnected weights and Gather params with chunk (block) granularity. "
+          "One argument is required: type (Q4_0, Q8_0)");
+
+  arser.add_argument(skip_qllm)
+    .nargs(1)
+    .type(arser::DataType::INT32)
+    .default_value(0)
+    .help("Skip chunk-granularity weight quantization when the weight has "
+          "fewer elements than the specified size. "
+          "One argument is required: size (default: 0)");
+
   arser.add_argument(rq)
     .nargs(2)
     .type(arser::DataType::STR_VEC)
@@ -289,6 +309,7 @@ int entry(int argc, char **argv)
   opt_used += arser[cq] ? 1 : 0;
   opt_used += arser[fake_quant] ? 1 : 0;
   opt_used += arser[qw] ? 1 : 0;
+  opt_used += arser[qllm] ? 1 : 0;
   opt_used += arser.get<bool>(qofm) ? 1 : 0;
   if (opt_used != 1)
   {
@@ -465,6 +486,65 @@ int entry(int argc, char **argv)
   if (arser[fake_quant])
     options->enable(Algorithms::ConvertToFakeQuantizedModel);
 
+  if (arser[qllm])
+  {
+    std::string input_path = arser.get<std::string>("input");
+    std::string output_path = arser.get<std::string>("output");
+    std::string type_str = arser.get<std::string>(qllm);
+    auto skip_length = arser.get<int32_t>(skip_qllm);
+    quantizer::QuantizeWeightsLLM::Type qtype = quantizer::QuantizeWeightsLLM::Type::Q4_0;
+    if (type_str == "Q8_0")
+      qtype = quantizer::QuantizeWeightsLLM::Type::Q8_0;
+    else if (type_str != "Q4_0")
+    {
+      std::cerr << "ERROR: Unsupported chunk quantization type" << std::endl;
+      return 255;
+    }
+    if (skip_length < 0)
+    {
+      std::cerr << "ERROR: Skip element size must not be negative" << std::endl;
+      return 255;
+    }
+
+    // Load model from the file
+    luci::ImporterEx importerex;
+    auto module = importerex.importVerifyModule(input_path);
+    if (module.get() == nullptr)
+      return EXIT_FAILURE;
+
+    for (size_t idx = 0; idx < module->size(); ++idx)
+    {
+      auto graph = module->graph(idx);
+
+      // Weight quantization for LLM
+      quantizer::QuantizeWeightsLLM quantize_visitor(qtype, skip_length);
+      for (auto node : loco::active_nodes(loco::output_nodes(graph)))
+      {
+        auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+        circle_node->accept(&quantize_visitor);
+      }
+
+      if (!luci::validate(graph))
+      {
+        std::cerr << "ERROR: Quantized graph is invalid" << std::endl;
+        return 255;
+      }
+    }
+
+    // Export to output Circle file
+    luci::CircleExporter exporter;
+
+    luci::CircleFileExpContract contract(module.get(), output_path);
+
+    if (!exporter.invoke(&contract))
+    {
+      std::cerr << "ERROR: Failed to export '" << output_path << "'" << std::endl;
+      return 255;
+    }
+
+    return 0;
+  }
+
   if (arser[qw])
   {
     auto values = arser.get<std::vector<std::string>>(qw);
diff --git a/compiler/circle-quantizer/src/QuantizeUtil.h b/compiler/circle-quantizer/src/QuantizeUtil.h
new file mode 100644
index 00000000000..0a636b636cf
--- /dev/null
+++ b/compiler/circle-quantizer/src/QuantizeUtil.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_QUANTIZE_QUANTIZE_UTIL_H
+#define LUCI_QUANTIZE_QUANTIZE_UTIL_H
+
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+// Copy from llama.cpp
+
+typedef uint16_t ggml_fp16_t;
+
+#define QK4_0 32
+typedef struct
+{
+  ggml_fp16_t d;         // delta
+  uint8_t qs[QK4_0 / 2]; // nibbles / quants
+} block_q4_0;
+
+#define QK8_0 32
+typedef struct
+{
+  ggml_fp16_t d;    // delta
+  int8_t qs[QK8_0]; // quants
+} block_q8_0;
+
+union block_q4_0_u {
+  uint8_t u8[sizeof(block_q4_0)];
+  block_q4_0 b;
+};
+
+union block_q8_0_u {
+  uint8_t u8[sizeof(block_q8_0)];
+  block_q8_0 b;
+};
+
+static inline uint32_t fp32_to_bits(float f)
+{
+  union {
+    float as_value;
+    uint32_t as_bits;
+  } fp32;
+  fp32.as_value = f;
+  return fp32.as_bits;
+}
+
+static inline float fp32_from_bits(uint32_t w)
+{
+  union {
+    uint32_t as_bits;
+    float as_value;
+  } fp32;
+  fp32.as_bits = w;
+  return fp32.as_value;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f)
+{
+  const float scale_to_inf = 0x1.0p+112f;
+  const float scale_to_zero = 0x1.0p-110f;
+
+  float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+  const uint32_t w = fp32_to_bits(f);
+  const uint32_t shl1_w = w + w;
+  const uint32_t sign = w & UINT32_C(0x80000000);
+  uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+  if (bias < UINT32_C(0x71000000))
+  {
+    bias = UINT32_C(0x71000000);
+  }
+
+  base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+  const uint32_t bits = fp32_to_bits(base);
+  const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+  const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+  const uint32_t nonsign = exp_bits + mantissa_bits;
+  return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+}
+
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+inline void quantize_row_q4_0_reference(const float *x, block_q4_0 *y, int k)
+{
+  static const int qk = QK4_0;
+
+  assert(k % qk == 0);
+
+  const int nb = k / qk;
+
+  for (int i = 0; i < nb; i++)
+  {
+    float amax = 0.0f; // absolute max
+    float max = 0.0f;
+
+    for (int j = 0; j < qk; j++)
+    {
+      const float v = x[i * qk + j];
+      if (amax < fabsf(v))
+      {
+        amax = fabsf(v);
+        max = v;
+      }
+    }
+
+    const float d = max / -8;
+    const float id = d ? 1.0f / d : 0.0f;
+
+    y[i].d = GGML_FP32_TO_FP16(d);
+
+    for (int j = 0; j < qk / 2; ++j)
+    {
+      const float x0 = x[i * qk + 0 + j] * id;
+      const float x1 = x[i * qk + qk / 2 + j] * id;
+
+      const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
+      const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
+
+      y[i].qs[j] = xi0;
+      y[i].qs[j] |= xi1 << 4;
+    }
+  }
+}
+
+inline size_t ggml_quantize_q4_0(const float *src, void *dst, int n, int k)
+{
+  assert(k % QK4_0 == 0);
+
+  for (int b = 0; b < n; b += k)
+  {
+    block_q4_0 *y = (block_q4_0 *)dst + b / QK4_0;
+
+    quantize_row_q4_0_reference(src + b, y, k);
+  }
+
+  return (n / QK4_0 * sizeof(block_q4_0));
+}
+
+inline void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k)
+{
+  assert(k % QK8_0 == 0);
+  const int nb = k / QK8_0;
+
+  for (int i = 0; i < nb; i++)
+  {
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++)
+    {
+      const float v = x[i * QK8_0 + j];
+      amax = MAX(amax, fabsf(v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f / d : 0.0f;
+
+    y[i].d = GGML_FP32_TO_FP16(d);
+
+    for (int j = 0; j < QK8_0; ++j)
+    {
+      const float x0 = x[i * QK8_0 + j] * id;
+
+      y[i].qs[j] = roundf(x0);
+    }
+  }
+}
+
+inline size_t ggml_quantize_q8_0(const float *src, void *dst, int n, int k)
+{
+  assert(k % QK8_0 == 0);
+
+  for (int b = 0; b < n; b += k)
+  {
+    block_q8_0 *y = (block_q8_0 *)dst + b / QK8_0;
+
+    quantize_row_q8_0_reference(src + b, y, k);
+  }
+
+  return (n / QK8_0 * sizeof(block_q8_0));
+}
+
+#endif // LUCI_QUANTIZE_QUANTIZE_UTIL_H
diff --git a/compiler/circle-quantizer/src/QuantizeWeightsLLM.cpp b/compiler/circle-quantizer/src/QuantizeWeightsLLM.cpp
new file mode 100644
index 00000000000..2209dfb08ac
--- /dev/null
+++ b/compiler/circle-quantizer/src/QuantizeWeightsLLM.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "QuantizeWeightsLLM.h"
+
+#include "QuantizeUtil.h"
+
+#include <luci/Service/Nodes/CircleConst.h>
+
+namespace
+{
+
+bool is_quantized(const luci::CircleConst *node)
+{
+  return node->quantparam() != nullptr || (node->dtype() != loco::DataType::FLOAT32);
+}
+
+size_t elementsize(const luci::CircleConst *node)
+{
+  size_t elems = 1;
+  for (uint32_t i = 0; i < node->rank(); ++i)
+    elems *= node->dim(i).value();
+  return elems;
+}
+
+luci::CircleConst *quantize_q8_block(luci::CircleConst *node)
+{
+  auto new_weights = luci::clone(node);
+
+  // Check block size
+  auto last_dim = node->dim(node->rank() - 1).value();
+  assert(last_dim % QK8_0 == 0);
+
+  // Get num of blocks
+  size_t blocks = 1;
+  for (uint32_t i = 0; i < new_weights->rank(); ++i)
+    blocks *= new_weights->dim(i).value();
+  blocks /= QK8_0;
+
+  // Set data for each block
+  block_q8_0_u block;
+  // Fake data type for resize and write
+  new_weights->dtype(loco::DataType::U8);
+  new_weights->size<loco::DataType::U8>(sizeof(block) * blocks);
+  for (size_t i = 0; i < blocks; ++i)
+  {
+    // Read float data
+    float data[QK8_0];
+    for (size_t j = 0; j < QK8_0; ++j)
+      data[j] = node->at<loco::DataType::FLOAT32>(i * QK8_0 + j);
+
+    ggml_quantize_q8_0(data, &block.b, QK8_0, QK8_0);
+
+    for (size_t j = 0; j < sizeof(block.u8); j++)
+      new_weights->at<loco::DataType::U8>(i * sizeof(block.u8) + j) = block.u8[j];
+  }
+
+  // Set real data type
+  new_weights->dtype(loco::DataType::S8);
+  return new_weights;
+}
+
+luci::CircleConst *quantize_q4_block(luci::CircleConst *node)
+{
+  auto new_weights = luci::clone(node);
+
+  // Check block size
+  auto last_dim = node->dim(node->rank() - 1).value();
+  assert(last_dim % QK4_0 == 0);
+
+  // Get num of blocks
+  size_t blocks = 1;
+  for (uint32_t i = 0; i < new_weights->rank(); ++i)
+    blocks *= new_weights->dim(i).value();
+  blocks /= QK4_0;
+
+  // Set data for each block
+  block_q4_0_u block;
+  // Fake data type for resize and write
+  new_weights->dtype(loco::DataType::U8);
+  new_weights->size<loco::DataType::U8>(sizeof(block) * blocks);
+  for (size_t i = 0; i < blocks; ++i)
+  {
+    // Read float data
+    float data[QK4_0];
+    for (size_t j = 0; j < QK4_0; ++j)
+      data[j] = node->at<loco::DataType::FLOAT32>(i * QK4_0 + j);
+
+    ggml_quantize_q4_0(data, &block.b, QK4_0, QK4_0);
+
+    for (size_t j = 0; j < sizeof(block.u8); j++)
+      new_weights->at<loco::DataType::U8>(i * sizeof(block.u8) + j) = block.u8[j];
+  }
+
+  // Set real data type
+  new_weights->dtype(loco::DataType::U4);
+
+  return new_weights;
+}
+
+} // namespace
+
+namespace quantizer
+{
+
+void QuantizeWeightsLLM::visit(luci::CircleFullyConnected *node)
+{
+  auto weights = loco::must_cast<luci::CircleConst *>(node->weights());
+  if (elementsize(weights) < static_cast<size_t>(_skip_length))
+    return;
+
+  if (!is_quantized(weights))
+  {
+    auto new_weights =
+      _quant_type == Type::Q4_0 ? quantize_q4_block(weights) : quantize_q8_block(weights);
+    node->weights(new_weights);
+  }
+}
+
+void QuantizeWeightsLLM::visit(luci::CircleGather *node)
+{
+  auto input = loco::must_cast<luci::CircleConst *>(node->arg(0));
+  if (elementsize(input) < static_cast<size_t>(_skip_length))
+    return;
+
+  if (!is_quantized(input))
+  {
+    auto new_weights =
+      _quant_type == Type::Q4_0 ? quantize_q4_block(input) : quantize_q8_block(input);
+    node->params(new_weights);
+  }
+}
+
+void QuantizeWeightsLLM::visit(luci::CircleNode *) {}
+
+} // namespace quantizer
diff --git a/compiler/circle-quantizer/src/QuantizeWeightsLLM.h b/compiler/circle-quantizer/src/QuantizeWeightsLLM.h
new file mode 100644
index 00000000000..2b908573875
--- /dev/null
+++ b/compiler/circle-quantizer/src/QuantizeWeightsLLM.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_QUANTIZE_WEIGHTS_LLM_H
+#define LUCI_QUANTIZE_WEIGHTS_LLM_H
+
+#include <luci/IR/CircleNodeVisitor.h>
+
+namespace quantizer
+{
+
+class QuantizeWeightsLLM : public luci::CircleNodeMutableVisitor<void>
+{
+public:
+  enum Type
+  {
+    Q4_0,
+    Q8_0
+  };
+
+public:
+  QuantizeWeightsLLM(Type type, int32_t skip_length) : _quant_type(type), _skip_length(skip_length)
+  {
+  }
+
+private:
+  void visit(luci::CircleFullyConnected *node);
+  void visit(luci::CircleGather *node);
+  void visit(luci::CircleNode *);
+
+private:
+  Type _quant_type;
+  int32_t _skip_length;
+};
+
+} // namespace quantizer
+
+#endif // LUCI_QUANTIZE_WEIGHTS_LLM_H
diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp
index 0022a0e57e8..1d6cea3681a 100644
--- a/compiler/luci/export/src/CircleTensorExporter.cpp
+++ b/compiler/luci/export/src/CircleTensorExporter.cpp
@@ -434,6 +434,29 @@ flatbuffers::Offset<circle::Buffer> encodeOpBufferPack4bit(FlatBufferBuilder &builder,
   return CreateBuffer(builder, array_offset);
 }
 
+template <loco::DataType DT>
+flatbuffers::Offset<circle::Buffer> encodeOpBufferBlocked(FlatBufferBuilder &builder,
+                                                          luci::CircleConst *c)
+{
+  // Blockwise (GGML-style) quantized weights keep their packed blocks as a raw
+  // byte stream; copy the buffer out byte by byte.
+  const uint32_t size = c->size<DT>();
+  std::vector<uint8_t> raw_data;
+  raw_data.reserve(size);
+  for (uint32_t i = 0; i < size; ++i)
+  {
+    raw_data.push_back(static_cast<uint8_t>(c->at<DT>(i)));
+  }
+
+  auto array_offset = builder.CreateVector(raw_data.data(), size);
+  return CreateBuffer(builder, array_offset);
+}
+
 template <>
 flatbuffers::Offset<circle::Buffer> encodeOpBuffer(FlatBufferBuilder &builder, luci::CircleConst *c)
 {
@@ -444,6 +467,8 @@ flatbuffers::Offset<circle::Buffer> encodeOpBuffer(FlatBufferBuilder &builder, luci::CircleConst *c)
     case loco::DataType::S4:
       return encodeOpBufferPack4bit<loco::DataType::S4>(builder, c);
     case loco::DataType::S8:
+      if (c->quantparam() == nullptr)
+        return encodeOpBufferBlocked<loco::DataType::S8>(builder, c);
       return encodeOpBufferByDType<loco::DataType::S8>(builder, c);
     case loco::DataType::S16:
       return encodeOpBufferByDType<loco::DataType::S16>(builder, c);
@@ -452,6 +477,8 @@ flatbuffers::Offset<circle::Buffer> encodeOpBuffer(FlatBufferBuilder &builder, luci::CircleConst *c)
     case loco::DataType::S64:
       return encodeOpBufferByDType<loco::DataType::S64>(builder, c);
     case loco::DataType::U4:
+      if (c->quantparam() == nullptr)
+        return encodeOpBufferBlocked<loco::DataType::U4>(builder, c);
       return encodeOpBufferPack4bit<loco::DataType::U4>(builder, c);
     case loco::DataType::U8:
       return encodeOpBufferByDType<loco::DataType::U8>(builder, c);
diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
index 63eb4fb7e37..628b306b782 100644
--- a/compiler/luci/service/src/CircleShapeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
@@ -19,6 +19,7 @@
 
 #include "Check.h"
 #include "CircleShapeInferenceHelper.h"
+#include "CircleTypeInferenceHelper.h"
 #include "ShapeInfer_StridedSlice.h"
 
 #include <luci/IR/CircleNodes.h>
diff --git a/compiler/luci/service/src/CircleTypeInferenceRule.cpp b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
index 78dde1004b5..e21e026629d 100644
--- a/compiler/luci/service/src/CircleTypeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
@@ -174,6 +174,11 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataType>
 
   loco::DataType visit(const luci::CircleGather *node) final
   {
+    // Gather on blockwise-quantized (Q4_0/Q8_0) params produces dequantized float output
+    if (luci::dtype_get(node->params()) == loco::DataType::U4 ||
+        luci::dtype_get(node->params()) == loco::DataType::S8)
+      return loco::DataType::FLOAT32;
+
     return luci::dtype_get(node->params());
   }
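
Usage note: with this patch, blockwise quantization is requested through the new circle-quantizer options shown above, e.g. `--quantize_weights_chunk Q8_0 --skip_chunkquant_size 1024` (the skip size here is only an example) followed by the input and output circle paths; FullyConnected weights and Gather params with fewer elements than the skip size stay in fp32.

The Q8_0/Q4_0 block layout from QuantizeUtil.h (one fp16 scale plus 32 quantized values per chunk of 32 weights) can also be exercised on its own. Below is a minimal sketch, assuming QuantizeUtil.h is reachable on the include path; the main() harness and the test values are illustrative only and not part of the patch:

  // Quantize a single 32-float chunk with the Q8_0 reference routine copied
  // from llama.cpp and inspect the resulting block.
  #include "QuantizeUtil.h"

  #include <cstdio>

  int main()
  {
    float chunk[QK8_0]; // one block covers 32 consecutive weights
    for (int i = 0; i < QK8_0; ++i)
      chunk[i] = 0.01f * static_cast<float>(i - 16); // arbitrary test data

    block_q8_0 block;
    quantize_row_q8_0_reference(chunk, &block, QK8_0);

    // One block stores a 16-bit scale plus 32 int8 quants: 34 bytes for
    // 32 weights, i.e. 8.5 bits per weight instead of 32.
    std::printf("sizeof(block_q8_0) = %zu\n", sizeof(block_q8_0));
    for (int i = 0; i < QK8_0; ++i)
      std::printf("%d ", static_cast<int>(block.qs[i]));
    std::printf("\n");
    return 0;
  }

Q4_0 works the same way with block_q4_0 and quantize_row_q4_0_reference, except that two 4-bit quants are packed per byte, giving 18 bytes per 32 weights.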