This repository has been archived by the owner on Jul 18, 2024. It is now read-only.
Add support for QuantizedLSTM operation
This change also adds support for dequantization for GNA, as GNAPlugin does not support the Convert operation. TEST=asr-perf-eval runs successfully for encoder 0 and encoder 1.
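Since GNA cannot execute an explicit Convert node, one way to express dequantization is to fold the affine math into an f32 Constant while the graph is being built. The sketch below is only an illustration of that idea under stated assumptions; the function name, parameters, and approach are hypothetical and not taken from this commit.

#include <ngraph/opsets/opset3.hpp>

#include <cstdint>
#include <vector>

// Hypothetical helper: dequantize a quantized (u8) tensor on the host and emit a
// plain f32 Constant, so the generated graph never needs a Convert node.
std::shared_ptr<ngraph::Node> dequantizeToFp32Constant(const std::vector<uint8_t>& quantData,
                                                       const ngraph::Shape& shape,
                                                       float scale, int32_t zeroPoint) {
    std::vector<float> floatData(quantData.size());
    for (size_t i = 0; i < quantData.size(); ++i) {
        // Standard affine dequantization: real = (q - zero_point) * scale
        floatData[i] = (static_cast<int32_t>(quantData[i]) - zeroPoint) * scale;
    }
    return ngraph::opset3::Constant::create(ngraph::element::f32, shape, floatData);
}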
Showing 16 changed files with 717 additions and 31 deletions.
@@ -0,0 +1,40 @@
#pragma once

#include <OperationsBase.hpp>

namespace android {
namespace hardware {
namespace neuralnetworks {
namespace nnhal {

class QuantizedLSTM : public OperationsBase {
public:
    QuantizedLSTM(int operationIndex);
    bool validate() override;
    std::shared_ptr<ngraph::Node> createNode() override;
    void connectOperationToGraph() override;

    std::shared_ptr<ngraph::Node> add(const ngraph::Output<ngraph::Node>& lhs,
                                      const ngraph::Output<ngraph::Node>& rhs);
    std::shared_ptr<ngraph::Node> sub(const ngraph::Output<ngraph::Node>& lhs,
                                      const ngraph::Output<ngraph::Node>& rhs);
    std::shared_ptr<ngraph::Node> mul(const ngraph::Output<ngraph::Node>& lhs,
                                      const ngraph::Output<ngraph::Node>& rhs);
    std::shared_ptr<ngraph::Node> matMul(const ngraph::Output<ngraph::Node>& lhs,
                                         const ngraph::Output<ngraph::Node>& rhs,
                                         bool transpose_lhs, bool transpose_rhs);
    std::shared_ptr<ngraph::Node> clip(const ngraph::Output<ngraph::Node>& data,
                                       float m_clip) const;
    std::shared_ptr<ngraph::Node> applyActivation(const std::shared_ptr<ngraph::Node>& arg,
                                                  int activationFn) const;
    std::shared_ptr<ngraph::Node> LayerNorm(const ngraph::Output<ngraph::Node>& input,
                                            const std::shared_ptr<ngraph::Node>& normalizedweights,
                                            const std::shared_ptr<ngraph::Node>& bias);

    bool isValidInputTensor(uint32_t inputIndex);
};

}  // namespace nnhal
}  // namespace neuralnetworks
}  // namespace hardware
}  // namespace android
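The class declares small arithmetic helpers (add, sub, mul, matMul, clip, applyActivation, LayerNorm) that createNode() presumably composes into the quantized LSTM cell. As a rough illustration of what a layer-normalization helper with this signature could do using standard ngraph opset3 ops, consider the sketch below; it is an assumption-based example, not the commit's actual implementation, and the function name is hypothetical.

#include <ngraph/opsets/opset3.hpp>

// Hedged sketch of a LayerNorm-style helper: normalize over the last axis,
// then apply a per-element scale (normalizedweights) and shift (bias).
std::shared_ptr<ngraph::Node> layerNormSketch(const ngraph::Output<ngraph::Node>& input,
                                              const std::shared_ptr<ngraph::Node>& normalizedweights,
                                              const std::shared_ptr<ngraph::Node>& bias) {
    using namespace ngraph::opset3;
    // Reduce over the last axis (-1), keeping dims so results broadcast back onto the input.
    auto axes = Constant::create(ngraph::element::i32, ngraph::Shape{1}, {-1});
    auto mean = std::make_shared<ReduceMean>(input, axes, true /*keep_dims*/);
    auto centered = std::make_shared<Subtract>(input, mean);
    auto variance = std::make_shared<ReduceMean>(
        std::make_shared<Multiply>(centered, centered), axes, true /*keep_dims*/);
    // Small epsilon to avoid division by zero.
    auto eps = Constant::create(ngraph::element::f32, ngraph::Shape{}, {1e-8f});
    auto stddev = std::make_shared<Sqrt>(std::make_shared<Add>(variance, eps));
    auto normalized = std::make_shared<Divide>(centered, stddev);
    // Scale and shift, matching the weights/bias arguments of LayerNorm() above.
    auto scaled = std::make_shared<Multiply>(normalized, normalizedweights);
    return std::make_shared<Add>(scaled, bias);
}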