From d767a31038afecdeb50e3195b1628df5f0f609dc Mon Sep 17 00:00:00 2001 From: Anisha Kulkarni Date: Wed, 8 Sep 2021 18:09:03 -0700 Subject: [PATCH] Add support for QuantizedLSTM operation This change also adds support for dequantization for GNA as GNAPlugin does not support Convert operation. TEST=asr-perf-eval runs successfully for encoder 0 and encoder 1 --- BUILD.gn | 6 +- BasePreparedModel.cpp | 43 +- ModelManager.cpp | 22 +- ModelManager.h | 7 + gna/GnaPreparedModel.cpp | 16 +- .../include/NgraphNetworkCreator.hpp | 1 + ngraph_creator/include/NgraphNodes.hpp | 5 +- ngraph_creator/include/OperationsFactory.hpp | 1 + .../operations/include/OperationsBase.hpp | 74 ++- .../operations/include/QuantizedLSTM.hpp | 40 ++ .../operations/src/OperationsBase.cpp | 4 + .../operations/src/QuantizedLSTM.cpp | 471 ++++++++++++++++++ ngraph_creator/src/NgraphNetworkCreator.cpp | 46 +- ngraph_creator/src/NgraphNodes.cpp | 9 +- ngraph_creator/src/OperationsFactory.cpp | 2 + utils.h | 1 + 16 files changed, 717 insertions(+), 31 deletions(-) create mode 100644 ngraph_creator/operations/include/QuantizedLSTM.hpp create mode 100644 ngraph_creator/operations/src/QuantizedLSTM.cpp diff --git a/BUILD.gn b/BUILD.gn index 7b73d1ad5..806ca569d 100755 --- a/BUILD.gn +++ b/BUILD.gn @@ -118,6 +118,7 @@ shared_library("intel_nnhal") { "ngraph_creator/operations/src/Pad_V2.cpp", "ngraph_creator/operations/src/Pow.cpp", "ngraph_creator/operations/src/Quantize.cpp", + "ngraph_creator/operations/src/QuantizedLSTM.cpp", "ngraph_creator/operations/src/Reduce_All.cpp", "ngraph_creator/operations/src/Reduce_Any.cpp", "ngraph_creator/operations/src/Reduce_Max.cpp", @@ -194,9 +195,10 @@ shared_library("intel_nnhal") { "nnapi-support", "ngraph", "inference_engine", - "nn-common", + "nn-common", "ssl", - "crypto" + "crypto", + "MKLDNNPlugin" ] lib_dirs = [ "${sysroot}/usr/local/deployment_tools/inference_engine/lib/intel64/", diff --git a/BasePreparedModel.cpp b/BasePreparedModel.cpp index 3c2513aff..58f39f429 100755 --- a/BasePreparedModel.cpp +++ b/BasePreparedModel.cpp @@ -180,6 +180,8 @@ void asyncExecute(const Request& request, MeasureTiming measure, BasePreparedMod operandType == OperandType::TENSOR_QUANT8_SYMM || operandType == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) expectedLength /= 4; // 8bit expected instead of 32bit + else if(operandType == OperandType::TENSOR_QUANT16_SYMM) + expectedLength /= 2; // 16bit expected instead of 32bit if (rActualLength != expectedLength) { ALOGE("%s Invalid length at outIndex(%d) Actual:%d Expected:%d", __func__, outIndex, rActualLength, expectedLength); @@ -203,12 +205,26 @@ void asyncExecute(const Request& request, MeasureTiming measure, BasePreparedMod break; } case OperandType::TENSOR_QUANT8_ASYMM: { - floatToUint8(srcBlob->buffer().as(), (uint8_t*)destPtr, srcBlob->size()); + modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp); + for (int i = 0; i < srcBlob->size() ; i++) { + *((uint8_t*)destPtr + i) = static_cast(zp + (*(srcBlob->buffer().as() + i) / sc)); + } break; } case OperandType::TENSOR_QUANT8_SYMM: + case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: case OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL: { - floatToint8(srcBlob->buffer().as(), (int8_t*)destPtr, srcBlob->size()); + modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp); + for (int i = 0; i < srcBlob->size() ; i++) { + *((int8_t*)destPtr + i) = static_cast(zp + (*(srcBlob->buffer().as() + i) / sc)); + } + break; + } + case OperandType::TENSOR_QUANT16_SYMM: { + modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp);
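+ // Re-quantize the FP32 result blob element-wise into the QUANT16_SYMM output buffer using the operand's scale and zero-point: q = zeroPoint + value / scale (zeroPoint is 0 for symmetric operands).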
+ for (int i = 0; i < srcBlob->size() ; i++) { + *((int16_t*)destPtr + i) = static_cast(zp + (*(srcBlob->buffer().as() + i) / sc)); + } break; } default: @@ -295,9 +311,12 @@ static std::tuple, Timing> executeSynch auto outDims = srcBlob->getTensorDesc().getDims(); if (operandType == OperandType::TENSOR_BOOL8 || operandType == OperandType::TENSOR_QUANT8_ASYMM || + operandType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED || operandType == OperandType::TENSOR_QUANT8_SYMM || operandType == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) expectedLength /= 4; // 8bit expected instead of 32bit + else if(operandType == OperandType::TENSOR_QUANT16_SYMM) + expectedLength /= 2; // 16bit expected instead of 32bit if (rActualLength != expectedLength) { ALOGE("%s Invalid length(%d) at outIndex(%d)", __func__, rActualLength, outIndex); // Notify Insufficient Buffer Length to modelInfo @@ -305,6 +324,8 @@ static std::tuple, Timing> executeSynch return {ErrorStatus::OUTPUT_INSUFFICIENT_SIZE, modelInfo->getOutputShapes(), kNoTiming}; } else modelInfo->updateOutputshapes(i, outDims); + float sc; + int32_t zp; switch (operandType) { case OperandType::TENSOR_INT32: case OperandType::TENSOR_FLOAT32: { @@ -317,12 +338,26 @@ static std::tuple, Timing> executeSynch break; } case OperandType::TENSOR_QUANT8_ASYMM: { - floatToUint8(srcBlob->buffer().as(), (uint8_t*)destPtr, srcBlob->size()); + modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp); + for (int i = 0; i < srcBlob->size() ; i++) { + *((uint8_t*)destPtr + i) = static_cast(zp + (*(srcBlob->buffer().as() + i) / sc)); + } break; } case OperandType::TENSOR_QUANT8_SYMM: + case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: case OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL: { - floatToint8(srcBlob->buffer().as(), (int8_t*)destPtr, srcBlob->size()); + modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp); + for (int i = 0; i < srcBlob->size() ; i++) { + *((int8_t*)destPtr + i) = static_cast(zp + (*(srcBlob->buffer().as() + i) / sc)); + } + break; + } + case OperandType::TENSOR_QUANT16_SYMM: { + modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp); + for (int i = 0; i < srcBlob->size() ; i++) { + *((int16_t*)destPtr + i) = static_cast(zp + (*(srcBlob->buffer().as() + i) / sc)); + } break; } default: diff --git a/ModelManager.cpp b/ModelManager.cpp index 3ec23e1b6..38eee117e 100755 --- a/ModelManager.cpp +++ b/ModelManager.cpp @@ -66,7 +66,10 @@ bool NnapiModelInfo::initializeRunTimeOperandInfo() { case OperandType::TENSOR_QUANT8_ASYMM: case OperandType::TENSOR_QUANT8_SYMM: case OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL: + case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: + case OperandType::TENSOR_QUANT16_SYMM: to.type = from.type; + to.scale = from.scale; break; default: ALOGE("wrong operand type %d", from.type); @@ -284,7 +287,8 @@ Blob::Ptr NnapiModelInfo::GetInOutOperandAsBlob(RunTimeOperandInfo& op, const ui return blob; } } else if (op.type == OperandType::TENSOR_QUANT8_SYMM || - op.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) { + op.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL || + op.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) { ALOGV( "check if tensors of type TENSOR_QUANT8_SYMM/TENSOR_QUANT8_SYMM_PER_CHANNEL " "supported"); @@ -302,6 +306,22 @@ Blob::Ptr NnapiModelInfo::GetInOutOperandAsBlob(RunTimeOperandInfo& op, const ui return blob; } } + else if (op.type == OperandType::TENSOR_QUANT16_SYMM) { + ALOGV("check if tensors of type TENSOR_QUANT16_SYMM supported"); + InferenceEngine::TensorDesc td(InferenceEngine::Precision::I16, 
toDims(op.dimensions), + InferenceEngine::Layout::ANY); + if (buf == nullptr) { + ALOGD("TENSOR_QUANT16_SYMM buf is NULL !!!!!!!!!!!!!!!"); + InferenceEngine::TBlob::Ptr blob = + std::make_shared>(td); + blob->allocate(); + return blob; + } else { + InferenceEngine::TBlob::Ptr blob = + std::make_shared>(td, (int16_t*)buf, len); + return blob; + } + } return nullptr; } diff --git a/ModelManager.h b/ModelManager.h index 8d9feff74..33f1dbe8c 100755 --- a/ModelManager.h +++ b/ModelManager.h @@ -104,6 +104,13 @@ class NnapiModelInfo { return operand.zeroPoint; } + void getOperandScaleZeroPoint(int index, float& scale, int32_t& zp) { + auto operand = getOperand(index); + scale = operand.scale; + zp = operand.zeroPoint; + return; + } + RunTimeOperandInfo& getRuntimeOperand(uint32_t index) { return mOperands[mModel.main.inputIndexes[index]]; } diff --git a/gna/GnaPreparedModel.cpp b/gna/GnaPreparedModel.cpp index d9e8cf50f..b24c2ac5f 100755 --- a/gna/GnaPreparedModel.cpp +++ b/gna/GnaPreparedModel.cpp @@ -39,15 +39,21 @@ bool GnaPreparedModel::initialize(const Model& model) { ALOGE("%s ngraph generation failed", __func__); return false; } - auto ngraph_net = std::make_shared(ngraph_function); + try { + auto ngraph_net = std::make_shared(ngraph_function); #if __ANDROID__ - ngraph_net->serialize("/data/vendor/neuralnetworks/ngraph_ir.xml", + ngraph_net->serialize("/data/vendor/neuralnetworks/ngraph_ir.xml", "/data/vendor/neuralnetworks/ngraph_ir.bin"); #else - ngraph_net->serialize("/tmp/ngraph_ir.xml", "/tmp/ngraph_ir.bin"); + ngraph_net->serialize("/tmp/ngraph_ir.xml", "/tmp/ngraph_ir.bin"); #endif - mPlugin = std::make_shared(ngraph_net); - mPlugin->loadNetwork(); + mPlugin = std::make_shared(ngraph_net); + mPlugin->loadNetwork(); + } catch (const std::exception& ex) { + ALOGE("%s Exception !!! %s", __func__, ex.what()); + return false; + } + ALOGV("Exiting %s", __func__); return true; diff --git a/ngraph_creator/include/NgraphNetworkCreator.hpp b/ngraph_creator/include/NgraphNetworkCreator.hpp index c2ff98d6a..af212abbd 100644 --- a/ngraph_creator/include/NgraphNetworkCreator.hpp +++ b/ngraph_creator/include/NgraphNetworkCreator.hpp @@ -17,6 +17,7 @@ class NgraphNetworkCreator { std::vector> mOperationNodes; std::shared_ptr mNgraphNodes; OperationsFactory mOpFactoryInstance; + const IntelDeviceType mPluginType; bool createInputParams(); bool initializeModel(); diff --git a/ngraph_creator/include/NgraphNodes.hpp b/ngraph_creator/include/NgraphNodes.hpp index a82791838..1f24788dd 100644 --- a/ngraph_creator/include/NgraphNodes.hpp +++ b/ngraph_creator/include/NgraphNodes.hpp @@ -17,7 +17,8 @@ class NgraphNodes { // in the path to current Operand. std::vector mForcedNchw; std::vector> mInputParams; - std::vector> mResultNodes; + std::vector> mResultNodes; + std::vector> mSinkNodes; // mNodeNames are only populated when requested, as only Inputs and Result NodeNames are // required. 
std::map mNodeNames; @@ -30,6 +31,8 @@ class NgraphNodes { void setOutputAtOperandIndex(size_t index, ngraph::Output output); ngraph::Output getOperationOutput(size_t index); void setResultNode(size_t outputIndex, std::shared_ptr resultNode); + void setSinkNode(std::shared_ptr sinkNode); + const std::string& getNodeName(size_t index); void removeInputParameter(std::string name, size_t index); diff --git a/ngraph_creator/include/OperationsFactory.hpp b/ngraph_creator/include/OperationsFactory.hpp index b0cd08fc8..16b2ebcb6 100644 --- a/ngraph_creator/include/OperationsFactory.hpp +++ b/ngraph_creator/include/OperationsFactory.hpp @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include diff --git a/ngraph_creator/operations/include/OperationsBase.hpp b/ngraph_creator/operations/include/OperationsBase.hpp index e637d0827..4ab83ffca 100644 --- a/ngraph_creator/operations/include/OperationsBase.hpp +++ b/ngraph_creator/operations/include/OperationsBase.hpp @@ -37,6 +37,7 @@ class OperationsBase { // override createNodeForPlugin in case sPluginType specific implementation is required virtual std::shared_ptr createNodeForPlugin(); void addResultNode(size_t index, std::shared_ptr resultNode); + void addSinkNode(std::shared_ptr sinkNode); // helper functions bool checkOperandType(uint32_t operandIndex, const int32_t expectedOperandType, @@ -46,12 +47,30 @@ class OperationsBase { const vec getInputOperandDimensions(uint32_t inputIndex); bool isValidInputTensor(uint32_t inputIndex); + template + bool deQuantize(const T* inputData, const uint32_t& len, const float scale, + const int32_t zeroPoint, float* outputData) { + int32_t value; + for (int i = 0; i < len; ++i) { + value = *(inputData + i); + outputData[i] = static_cast(scale * (value - zeroPoint)); + } + return true; + } + std::shared_ptr getInputNode(uint32_t inputIndex, bool dequantize = true) { std::shared_ptr input; auto operandIndex = sModelInfo->getOperationInput(mNnapiOperationIndex, inputIndex); auto operandType = sModelInfo->getOperandType(operandIndex); + float scale; + int32_t zp; if (sModelInfo->isOperandLifeTimeConst(operandIndex)) { auto operandDims = getInputOperandDimensions(inputIndex); + std::vector f_operandValues; + + if (sPluginType == IntelDeviceType::GNA) { + sModelInfo->getOperandScaleZeroPoint(operandIndex, scale, zp); + } ngraph::element::Type elementType; switch (operandType) { case OperandType::TENSOR_FLOAT32: { @@ -61,9 +80,16 @@ class OperationsBase { break; } case OperandType::TENSOR_INT32: { - elementType = ngraph::element::i32; auto operandValues = sModelInfo->GetConstVecOperand(operandIndex); - input = createConstNode(elementType, toNgraphShape(operandDims), operandValues); + if (sPluginType == IntelDeviceType::GNA) { + elementType = ngraph::element::f32; + f_operandValues.resize(operandValues.size()); + deQuantize(operandValues.data(), operandValues.size(), scale, zp, f_operandValues.data()); + } + else { + elementType = ngraph::element::i32; + input = createConstNode(elementType, toNgraphShape(operandDims), operandValues); + } break; } case OperandType::TENSOR_BOOL8: { @@ -73,16 +99,44 @@ class OperationsBase { break; } case OperandType::TENSOR_QUANT8_ASYMM: { - elementType = ngraph::element::u8; auto operandValues = sModelInfo->GetConstVecOperand(operandIndex); - input = createConstNode(elementType, toNgraphShape(operandDims), operandValues); + if (sPluginType == IntelDeviceType::GNA) { + elementType = ngraph::element::f32; + f_operandValues.resize(operandValues.size()); + 
deQuantize(operandValues.data(), operandValues.size(), scale, zp, f_operandValues.data()); + } + else { + elementType = ngraph::element::u8; + input = createConstNode(elementType, toNgraphShape(operandDims), operandValues); + } break; } case OperandType::TENSOR_QUANT8_SYMM: + case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: case OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL: { - elementType = ngraph::element::i8; auto operandValues = sModelInfo->GetConstVecOperand(operandIndex); - input = createConstNode(elementType, toNgraphShape(operandDims), operandValues); + if (sPluginType == IntelDeviceType::GNA) { + elementType = ngraph::element::f32; + f_operandValues.resize(operandValues.size()); + deQuantize(operandValues.data(), operandValues.size(), scale, zp, f_operandValues.data()); + } + else { + elementType = ngraph::element::i8; + input = createConstNode(elementType, toNgraphShape(operandDims), operandValues); + } + break; + } + case OperandType::TENSOR_QUANT16_SYMM: { + auto operandValues = sModelInfo->GetConstVecOperand(operandIndex); + if (sPluginType == IntelDeviceType::GNA) { + elementType = ngraph::element::f32; + f_operandValues.resize(operandValues.size()); + deQuantize(operandValues.data(), operandValues.size(), scale, zp, f_operandValues.data()); + } + else { + elementType = ngraph::element::i16; + input = createConstNode(elementType, toNgraphShape(operandDims), operandValues); + } break; } default: { @@ -91,12 +145,14 @@ class OperationsBase { return nullptr; } } - + if (sPluginType == IntelDeviceType::GNA && operandType != OperandType::TENSOR_FLOAT32) { + input = createConstNode(elementType, toNgraphShape(operandDims), f_operandValues); + } } else { input = mNgraphNodes->getOperationOutput(operandIndex).get_node_shared_ptr(); } - - if (operandType == OperandType::TENSOR_QUANT8_ASYMM && dequantize) { + if (operandType != OperandType::TENSOR_FLOAT32 && dequantize + && sPluginType != IntelDeviceType::GNA && !sModelInfo->isOperandLifeTimeTemp(operandIndex)) { input = DequantizeNode(input, operandIndex, ngraph::element::f32); } diff --git a/ngraph_creator/operations/include/QuantizedLSTM.hpp b/ngraph_creator/operations/include/QuantizedLSTM.hpp new file mode 100644 index 000000000..944ee478b --- /dev/null +++ b/ngraph_creator/operations/include/QuantizedLSTM.hpp @@ -0,0 +1,40 @@ +#pragma once + +#include + +namespace android { +namespace hardware { +namespace neuralnetworks { +namespace nnhal { + +class QuantizedLSTM : public OperationsBase { +public: + QuantizedLSTM(int operationIndex); + bool validate() override; + std::shared_ptr createNode() override; + void connectOperationToGraph() override; + + std::shared_ptr add(const ngraph::Output& lhs, + const ngraph::Output& rhs); + std::shared_ptr sub(const ngraph::Output& lhs, + const ngraph::Output& rhs); + std::shared_ptr mul(const ngraph::Output& lhs, + const ngraph::Output& rhs); + std::shared_ptr matMul(const ngraph::Output& lhs, + const ngraph::Output& rhs, + bool transpose_lhs, bool transpose_rhs); + std::shared_ptr clip(const ngraph::Output& data, + float m_clip) const; + std::shared_ptr applyActivation(const std::shared_ptr& arg, + int activationFn) const; + std::shared_ptr LayerNorm(const ngraph::Output& input, + const std::shared_ptr& normalizedweights, + const std::shared_ptr& bias); + + bool isValidInputTensor(uint32_t inputIndex); +}; + +} // namespace nnhal +} // namespace neuralnetworks +} // namespace hardware +} // namespace android \ No newline at end of file diff --git a/ngraph_creator/operations/src/OperationsBase.cpp 
b/ngraph_creator/operations/src/OperationsBase.cpp index b7c66d785..dc96cdb7b 100755 --- a/ngraph_creator/operations/src/OperationsBase.cpp +++ b/ngraph_creator/operations/src/OperationsBase.cpp @@ -66,6 +66,10 @@ void OperationsBase::addResultNode(size_t index, std::shared_ptr r mNgraphNodes->setResultNode(index, resultNode); } +void OperationsBase::addSinkNode(std::shared_ptr sinkNode) { + mNgraphNodes->setSinkNode(sinkNode); +} + OperationsBase::OperationsBase(int operationIndex) : mNnapiOperationIndex(operationIndex) { mDefaultOutputIndex = 0; } diff --git a/ngraph_creator/operations/src/QuantizedLSTM.cpp b/ngraph_creator/operations/src/QuantizedLSTM.cpp new file mode 100644 index 000000000..157a27793 --- /dev/null +++ b/ngraph_creator/operations/src/QuantizedLSTM.cpp @@ -0,0 +1,471 @@ +//#define LOG_NDEBUG 0 +#include +#define LOG_TAG "Quantized_LSTM" + +namespace android { +namespace hardware { +namespace neuralnetworks { +namespace nnhal { + +#define ACTIVATION_FUNCTION_NONE 0 +#define ACTIVATION_FUNCTION_RELU 1 +#define ACTIVATION_FUNCTION_RELU6 3 +#define ACTIVATION_FUNCTION_TANH 4 +#define ACTIVATION_FUNCTION_SIGMOID 6 + +QuantizedLSTM::QuantizedLSTM(int operationIndex) : OperationsBase(operationIndex) { + mDefaultOutputIndex = sModelInfo->getOperationOutput(mNnapiOperationIndex, 0); +} + +bool QuantizedLSTM::validate() { + // Check all Output types + if (!checkOutputOperandType(0, (int32_t)OperandType::TENSOR_QUANT8_ASYMM_SIGNED)) return false; + if (!checkOutputOperandType(1, (int32_t)OperandType::TENSOR_QUANT16_SYMM)) return false; + if (!checkOutputOperandType(2, (int32_t)OperandType::TENSOR_QUANT8_ASYMM_SIGNED)) return false; + + const auto& inputsSize = sModelInfo->getOperationInputsSize(mNnapiOperationIndex); + const auto& outputsSize = sModelInfo->getOperationOutputsSize(mNnapiOperationIndex); + + if (inputsSize != 32) { + return false; + } + + if (outputsSize != 3) return false; + + // check 0, 18, 19 input values + if (!checkInputOperandType(0, (int32_t)OperandType::TENSOR_QUANT8_ASYMM_SIGNED)) return false; + if (!checkInputOperandType(18, (int32_t)OperandType::TENSOR_QUANT8_ASYMM_SIGNED)) return false; + if (!checkInputOperandType(19, (int32_t)OperandType::TENSOR_QUANT16_SYMM)) return false; + + // check input type for 2 to 4, 6 to 8 + for (int i = 2; i <= 4; i++) { + if (!checkInputOperandType(i, (int32_t)OperandType::TENSOR_QUANT8_SYMM)) return false; + } + for (int i = 6; i <= 8; i++) { + if (!checkInputOperandType(i, (int32_t)OperandType::TENSOR_QUANT8_SYMM)) return false; + } + + // check input type for 13,14,15 + for (int i = 13; i <= 15; i++) { + if (!checkInputOperandType(i, (int32_t)OperandType::TENSOR_INT32)) return false; + } + + if (!sModelInfo->isOmittedInput(mNnapiOperationIndex, 1) && + !sModelInfo->isOmittedInput(mNnapiOperationIndex, 5) && + !sModelInfo->isOmittedInput(mNnapiOperationIndex, 12)) { + // CIFG diabled, check input types + if (!checkInputOperandType(1, (int32_t)OperandType::TENSOR_QUANT8_SYMM)) return false; + if (!checkInputOperandType(5, (int32_t)OperandType::TENSOR_QUANT8_SYMM)) return false; + if (!checkInputOperandType(12, (int32_t)OperandType::TENSOR_INT32)) return false; + } + + if (!sModelInfo->isOmittedInput(mNnapiOperationIndex, 9) && + !sModelInfo->isOmittedInput(mNnapiOperationIndex, 10) && + !sModelInfo->isOmittedInput(mNnapiOperationIndex, 11)) { + // peephole enabled, check input types + if (!checkInputOperandType(9, (int32_t)OperandType::TENSOR_QUANT16_SYMM)) return false; + if (!checkInputOperandType(10, 
(int32_t)OperandType::TENSOR_QUANT16_SYMM)) return false; + if (!checkInputOperandType(11, (int32_t)OperandType::TENSOR_QUANT16_SYMM)) return false; + } + + if (!sModelInfo->isOmittedInput(mNnapiOperationIndex, 20) && + !sModelInfo->isOmittedInput(mNnapiOperationIndex, 21) && + !sModelInfo->isOmittedInput(mNnapiOperationIndex, 22) && + !sModelInfo->isOmittedInput(mNnapiOperationIndex, 23)) { + // Layer Normalization present + if (!checkInputOperandType(20, (int32_t)OperandType::TENSOR_QUANT16_SYMM)) return false; + if (!checkInputOperandType(21, (int32_t)OperandType::TENSOR_QUANT16_SYMM)) return false; + if (!checkInputOperandType(22, (int32_t)OperandType::TENSOR_QUANT16_SYMM)) return false; + if (!checkInputOperandType(23, (int32_t)OperandType::TENSOR_QUANT16_SYMM)) return false; + } + + ALOGV("%s PASSED", __func__); + return true; +} + +void QuantizedLSTM::connectOperationToGraph() { createNode(); } + +std::shared_ptr QuantizedLSTM::createNode() { + + const auto& inputsSize = sModelInfo->getOperationInputsSize(mNnapiOperationIndex); + + bool isCIFGenabled = false, isPeepholeUsed = false, isProjectionUsed = false, + isLayerNormUsed = false, isCifgDimsEmpty = true; + + // checking if CIFG enabled + if (sModelInfo->isOmittedInput(mNnapiOperationIndex, 1) && + sModelInfo->isOmittedInput(mNnapiOperationIndex, 5) && + sModelInfo->isOmittedInput(mNnapiOperationIndex, 12)) { + isCIFGenabled = true; + } else { + if (isValidInputTensor(1) && isValidInputTensor(5) && isValidInputTensor(12)) + isCIFGenabled = false; + else + isCIFGenabled = true; + } + + // checking if peephole enabled + if (sModelInfo->isOmittedInput(mNnapiOperationIndex, 9) && + sModelInfo->isOmittedInput(mNnapiOperationIndex, 10) && + sModelInfo->isOmittedInput(mNnapiOperationIndex, 11)) { + isPeepholeUsed = false; + } else { + if (!isCIFGenabled && !isValidInputTensor(9) && isValidInputTensor(10) && + isValidInputTensor(11)) { + isCIFGenabled = true; + isCifgDimsEmpty = false; + } + if (isCIFGenabled) { + if (isValidInputTensor(10) && isValidInputTensor(11)) + isPeepholeUsed = true; + else + isPeepholeUsed = false; + } else { + if (isValidInputTensor(9) && isValidInputTensor(10) && isValidInputTensor(11)) + isPeepholeUsed = true; + else + isPeepholeUsed = false; + } + } + + // checking if projection enabled + if (sModelInfo->isOmittedInput(mNnapiOperationIndex, 16)) { + isProjectionUsed = false; + } else { + if (isValidInputTensor(16)) + isProjectionUsed = true; + else + isProjectionUsed = false; + } + + // checking if layer normalization enabled + if (sModelInfo->isOmittedInput(mNnapiOperationIndex, 20) && + sModelInfo->isOmittedInput(mNnapiOperationIndex, 21) && + sModelInfo->isOmittedInput(mNnapiOperationIndex, 22) && + sModelInfo->isOmittedInput(mNnapiOperationIndex, 23)) { + isLayerNormUsed = false; + } else { + if (isCIFGenabled) { + if (isValidInputTensor(21) && isValidInputTensor(22) && isValidInputTensor(23)) + isLayerNormUsed = true; + else + isLayerNormUsed = false; + } else { + if (isValidInputTensor(20) && isValidInputTensor(21) && isValidInputTensor(22) && + isValidInputTensor(23)) + isLayerNormUsed = true; + else + isLayerNormUsed = false; + } + } + + std::shared_ptr inputNode, input2input_weights, input2forget_weights, + input2cell_weights, input2output_weights, recurrent2input_weights, recurrent2forget_weights, + recurrent2cell_weights, recurrent2output_weights, cell2input_weights, cell2forget_weights, + cell2output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, + projection_weights, 
projection_bias; + uint32_t activationFn; + float cell_state_clipping, proj_clipping; + + const auto& inputNode_dims = getInputOperandDimensions(0); + const auto& initial_hidden_state_dims = getInputOperandDimensions(18); + const auto& initial_cell_state_dims = getInputOperandDimensions(19); + + auto batch_size = inputNode_dims[0]; + auto input_size = inputNode_dims[1]; + auto num_units = initial_cell_state_dims[1]; + auto output_size = initial_hidden_state_dims[1]; + + // Creating input nodes + inputNode = getInputNode(0); + const auto& elementType = inputNode->get_element_type(); + ngraph::element::Type cellElementType = ngraph::element::f32; + // W_{xi}, W_{xf}, W_{xc}, W_{xo} + if (isCIFGenabled) { + if (!isCifgDimsEmpty) removeInputNode(1); + } else { + input2input_weights = getInputNode(1); + } + input2forget_weights = getInputNode(2); + input2cell_weights = getInputNode(3); + input2output_weights = getInputNode(4); + + // W_{hi}, W_{hf}, W_{hc}, W_{ho} + if (isCIFGenabled) { + if (!isCifgDimsEmpty) removeInputNode(5); + } else { + recurrent2input_weights = getInputNode(5); + } + recurrent2forget_weights = getInputNode(6); + recurrent2cell_weights = getInputNode(7); + recurrent2output_weights = getInputNode(8); + + std::vector init_hidden(output_size, 0.0f); + std::vector init_cell(num_units, 0.0f); + static int assign_read_count = 0; + auto constant_hidden = std::make_shared(ngraph::element::f32, ngraph::Shape{1, output_size}, + init_hidden); + auto constant_cell = std::make_shared(ngraph::element::f32, ngraph::Shape{1, num_units}, + init_cell); + + auto read_value_hidden = std::make_shared(constant_hidden, "variable_hidden_" + std::to_string(assign_read_count)); + auto read_value_cell = std::make_shared(constant_cell, "variable_cell"+ std::to_string(assign_read_count)); + assign_read_count++; + + // W_{ci}, W_{cf}, W_{co} + if (isPeepholeUsed) { + if (isCIFGenabled) + cell2input_weights = + createConstNode(cellElementType, ngraph::Shape{num_units}, convertToVector(0)); + else + cell2input_weights = getInputNode(9); + cell2forget_weights = getInputNode(10); + cell2output_weights = getInputNode(11); + } else { + cell2input_weights = + createConstNode(cellElementType, ngraph::Shape{1, num_units}, convertToVector(0)); + cell2forget_weights = + createConstNode(cellElementType, ngraph::Shape{1, num_units}, convertToVector(0)); + cell2output_weights = + createConstNode(cellElementType, ngraph::Shape{1, num_units}, convertToVector(0)); + } + + // b_i, b_f, b_c, b_o + if (isCIFGenabled) { + if (!isCifgDimsEmpty) removeInputNode(12); + } else { + input_gate_bias = getInputNode(12); + } + forget_gate_bias = getInputNode(13); + cell_bias = getInputNode(14); + output_gate_bias = getInputNode(15); + + // W_{proj}, b_{proj} + if (isProjectionUsed) { + projection_weights = getInputNode(16); + if (isValidInputTensor(17)) + projection_bias = getInputNode(17); + else + projection_bias = + createConstNode(elementType, ngraph::Shape{output_size}, convertToVector(0)); + } + + cell_state_clipping = sModelInfo->ParseOperationInput(mNnapiOperationIndex, 24); + + if (isProjectionUsed) + proj_clipping = sModelInfo->ParseOperationInput(mNnapiOperationIndex, 25); + + std::shared_ptr i_t, f_t, c_t, o_t; + + std::shared_ptr input_layer_norm_weights, forget_layer_norm_weights, + cell_layer_norm_weights, output_layer_norm_weights; + if (isLayerNormUsed) { + if (!isCIFGenabled) input_layer_norm_weights = getInputNode(20); + forget_layer_norm_weights = getInputNode(21); + cell_layer_norm_weights = getInputNode(22); 
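+ // Inputs 20-23 hold the per-gate layer-normalization weights; input 20 (input gate) is skipped when CIFG is enabled.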
+ output_layer_norm_weights = getInputNode(23); + } + + // i_t = W_{xi}x_t+W_{hi}h_{t-1}+W_{ci}C_{t-1} + if (!isCIFGenabled) + i_t = add(add(matMul(inputNode, input2input_weights, false, true), + matMul(read_value_hidden, recurrent2input_weights, false, true)), + mul(cell2input_weights, read_value_cell)); + + // f_t = W_{xf}x_t+W_{hf}h_{t-1}+W_{cf}C_{t-1} + f_t = add(add(matMul(inputNode, input2forget_weights, false, true), + matMul(read_value_hidden, recurrent2forget_weights, false, true)), + mul(cell2forget_weights, read_value_cell)); + // c_t = W_{xc}x_t+W_{hc}h_{t-1} + c_t = add(matMul(inputNode, input2cell_weights, false, true), + matMul(read_value_hidden, recurrent2cell_weights, false, true)); + // o_t = W_{xo}x_t+W_{ho}h_{t-1} + o_t = add(matMul(inputNode, input2output_weights, false, true), + matMul(read_value_hidden, recurrent2output_weights, false, true)); + + /* ################# Update Forget Gate ################# */ + if (isLayerNormUsed) { + f_t = LayerNorm(f_t, forget_layer_norm_weights, forget_gate_bias); + } else { + // W_{xf}x_t + W_{hf}h_{t-1} + W_{cf}C_{t-1} + b_f + f_t = add(f_t, forget_gate_bias); + } + // sigma(W_{xf}x_t + W_{hf}h_{t-1} + W_{cf}C_{t-1} + b_f) + f_t = applyActivation(f_t, ACTIVATION_FUNCTION_SIGMOID); + + /* ################# Update Input Gate ################# */ + if (isCIFGenabled) { + auto constNode = createConstNode(elementType, f_t->get_shape(), convertToVector(1.f)); + // Couple input with forget gate: 1 - i_f + i_t = sub(constNode, f_t); + } else { + if (isLayerNormUsed) { + i_t = LayerNorm(i_t, input_layer_norm_weights, input_gate_bias); + } else { + // W_{xi}x_t + W_{hi}h_{t-1} + W_{ci}C_{t-1} + b_i + i_t = add(i_t, input_gate_bias); + } + // sigma(W_{xi}x_t + W_{hi}h_{t-1} + W_{ci}C_{t-1} + b_i) + i_t = applyActivation(i_t, ACTIVATION_FUNCTION_SIGMOID); + } + + /* ################# Update Cell Gate ################# */ + + if (isLayerNormUsed) { + c_t = LayerNorm(c_t, cell_layer_norm_weights, cell_bias); + } else { + // W_{xc}x_t+W_{hc}h_{t-1}+b_c + c_t = add(c_t, cell_bias); + } + // g(W_{xc}x_t+W_{hc}h_{t-1}+b_c) + c_t = applyActivation(c_t, ACTIVATION_FUNCTION_TANH); + + // ft (.) Ct-1 + it (.) ct + auto C = add(mul(f_t, read_value_cell), mul(i_t, c_t)); + // clip(ft (.) Ct-1 + it (.) 
ct, t_{cell}) + C = clip(C, cell_state_clipping); + + /* ################# Update Output Gate ################# */ + + // W_{xo}x_t+W_{ho}h_{t-1}+W_{co}C_t + o_t = add(o_t, mul(cell2output_weights, C)); + if (isLayerNormUsed) { + o_t = LayerNorm(o_t, output_layer_norm_weights, output_gate_bias); + } else { + // W_{xo}x_t+W_{ho}h_{t-1}+W_{co}C_t+b_o + o_t = add(o_t, output_gate_bias); + } + + // sigma(W_{xo}x_t+W_{ho}h_{t-1}+W_{co}C_t+b_o) + o_t = applyActivation(o_t, ACTIVATION_FUNCTION_SIGMOID); + + std::shared_ptr H; + if (isProjectionUsed) { + // o_t odot g(C_t) + auto dotProd = mul(o_t, applyActivation(C, ACTIVATION_FUNCTION_TANH)); + // W_{proj}(o_t odot g(C_t)) + auto projWeightsProduct = matMul(projection_weights, dotProd, false, true); + // W_{proj}(o_t odot g(C_t))+b_{proj} + auto projBiasAdd = add(transpose(NC_CN, projWeightsProduct), projection_bias); + // clip(W_{proj}(o_t odot g(C_t))+b_{proj}, t_{proj}) + H = clip(projBiasAdd, proj_clipping); + } else { + // o_t odot g(C_t) + H = mul(o_t, applyActivation(C, ACTIVATION_FUNCTION_TANH)); + } + + std::vector> QLstmOutputs(3, nullptr); + QLstmOutputs[0] = H; + QLstmOutputs[1] = C; + QLstmOutputs[2] = H; + + auto assign_hidden = std::make_shared(H, read_value_hidden->get_variable_id()); + auto assign_cell = std::make_shared(C, read_value_cell->get_variable_id()); + assign_hidden->add_control_dependency(read_value_hidden); + assign_cell->add_control_dependency(read_value_cell); + addSinkNode(assign_hidden); + addSinkNode(assign_cell); + + for (int i = 0; i < 3; i++) { + auto outputIndex = sModelInfo->getOperationOutput(mNnapiOperationIndex, i); + mNgraphNodes->setOutputAtOperandIndex(outputIndex, QLstmOutputs[i]); + + const auto op = sModelInfo->getOperand(outputIndex); + if (op.lifetime == V1_3::OperandLifeTime::SUBGRAPH_OUTPUT) { + addResultNode(outputIndex, QLstmOutputs[i]); + } + } + + return nullptr; +} + +std::shared_ptr QuantizedLSTM::add(const ngraph::Output& lhs, + const ngraph::Output& rhs) { + return {make_shared(lhs, rhs, ngraph::op::AutoBroadcastType::NUMPY)}; +} + +std::shared_ptr QuantizedLSTM::sub(const ngraph::Output& lhs, + const ngraph::Output& rhs) { + return {make_shared(lhs, rhs, ngraph::op::AutoBroadcastType::NUMPY)}; +} + +std::shared_ptr QuantizedLSTM::mul(const ngraph::Output& lhs, + const ngraph::Output& rhs) { + return {make_shared(lhs, rhs, ngraph::op::AutoBroadcastType::NUMPY)}; +} + +std::shared_ptr QuantizedLSTM::matMul(const ngraph::Output& lhs, + const ngraph::Output& rhs, + bool transpose_lhs, bool transpose_rhs) { + return {make_shared(lhs, rhs, transpose_lhs, transpose_rhs)}; +} + +std::shared_ptr QuantizedLSTM::clip(const ngraph::Output& data, + float m_clip) const { + if (m_clip == 0.f) { + return data.get_node_shared_ptr(); + } + return make_shared(data, -m_clip, m_clip); +} + +std::shared_ptr QuantizedLSTM::applyActivation(const std::shared_ptr& arg, + int activationFn) const { + switch (activationFn) { + case ACTIVATION_FUNCTION_RELU: + return std::make_shared(arg); + break; + case ACTIVATION_FUNCTION_RELU6: + return std::make_shared(arg, 0, 6); + break; + case ACTIVATION_FUNCTION_TANH: + return std::make_shared(arg); + break; + case ACTIVATION_FUNCTION_SIGMOID: + return std::make_shared(arg); + break; + default: + return std::make_shared(arg); + } +} + +std::shared_ptr QuantizedLSTM::LayerNorm( + const ngraph::Output& input, + const std::shared_ptr& normalizationweights, + const std::shared_ptr& bias) { + // LayerNormalization + auto normalizationConstant = 
createConstNode(ngraph::element::f32, {}, convertToVector(1e-8f)); + auto axis = ngraph::op::Constant::create(ngraph::element::i32, {}, {-1}); + auto mean = std::make_shared(input, axis, true); + // x_i - mean_i + auto diff = sub(input, mean); + // (x_i - mean_i) ** 2 + auto multiply = mul(diff, diff); + // mean((x_i - mean_i) ** 2) + auto var = std::make_shared(multiply, axis, true); + // var_i + epsilon + auto add_var = add(var, normalizationConstant); + // sqrt(var_i + epsilon) + auto sqrt = std::make_shared(add_var); + // (x_i - mean_i) / sqrt(var_i + epsilon) + auto stddev_inv = std::make_shared(diff, sqrt); + // x_i_normalized * gamma + auto mul_norm_weights = mul(stddev_inv, normalizationweights); + // x_i_normalized * gamma + beta + auto output = add(mul_norm_weights, bias); + + return output; +} + +bool QuantizedLSTM::isValidInputTensor(uint32_t inputIndex) { + const auto& dims = getInputOperandDimensions(inputIndex); + if (dims.empty()) return false; + + if (dims[0] == 0) return false; + + return true; +} + +} // namespace nnhal +} // namespace neuralnetworks +} // namespace hardware +} // namespace android diff --git a/ngraph_creator/src/NgraphNetworkCreator.cpp b/ngraph_creator/src/NgraphNetworkCreator.cpp index c55ada48c..7535801ff 100644 --- a/ngraph_creator/src/NgraphNetworkCreator.cpp +++ b/ngraph_creator/src/NgraphNetworkCreator.cpp @@ -10,6 +10,7 @@ namespace nnhal { NgraphNetworkCreator::NgraphNetworkCreator(std::shared_ptr modelInfo, IntelDeviceType deviceType) : mModelInfo(modelInfo), + mPluginType(deviceType), mNgraphNodes(std::make_shared(mModelInfo->getOperandsSize(), mModelInfo->getModelOutputsSize())), mOpFactoryInstance(deviceType, mModelInfo, mNgraphNodes) { @@ -49,8 +50,14 @@ bool NgraphNetworkCreator::createInputParams() { break; case OperandType::INT32: case OperandType::TENSOR_INT32: - inputParam = std::make_shared( - ngraph::element::i32, ngraph::Shape(dims.begin(), dims.end())); + if (mPluginType == IntelDeviceType::GNA) { + inputParam = std::make_shared( + ngraph::element::f32, ngraph::Shape(dims.begin(), dims.end())); + } + else { + inputParam = std::make_shared( + ngraph::element::i32, ngraph::Shape(dims.begin(), dims.end())); + } ALOGV("createInputParams created inputIndex %d, type %d", i, nnapiOperand.type); break; @@ -62,18 +69,43 @@ bool NgraphNetworkCreator::createInputParams() { nnapiOperand.type); break; case OperandType::TENSOR_QUANT8_ASYMM: - inputParam = std::make_shared( - ngraph::element::u8, ngraph::Shape(dims.begin(), dims.end())); + if (mPluginType == IntelDeviceType::GNA) { + inputParam = std::make_shared( + ngraph::element::f32, ngraph::Shape(dims.begin(), dims.end())); + } + else { + inputParam = std::make_shared( + ngraph::element::u8, ngraph::Shape(dims.begin(), dims.end())); + } ALOGV("createInputParams created inputIndex %d, type %d", i, nnapiOperand.type); break; case OperandType::TENSOR_QUANT8_SYMM: - case OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL: - inputParam = std::make_shared( - ngraph::element::i8, ngraph::Shape(dims.begin(), dims.end())); + case OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL: + case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: + if (mPluginType == IntelDeviceType::GNA) { + inputParam = std::make_shared( + ngraph::element::f32, ngraph::Shape(dims.begin(), dims.end())); + } + else { + inputParam = std::make_shared( + ngraph::element::i8, ngraph::Shape(dims.begin(), dims.end())); + } ALOGV("createInputParams created inputIndex %d, type %d", i, nnapiOperand.type); break; + case OperandType::TENSOR_QUANT16_SYMM: + if 
(mPluginType == IntelDeviceType::GNA) { + inputParam = std::make_shared( + ngraph::element::f32, ngraph::Shape(dims.begin(), dims.end())); + } + else { + inputParam = std::make_shared( + ngraph::element::i16, ngraph::Shape(dims.begin(), dims.end())); + } + ALOGE("createInputParams created inputIndex %d, type %d", i, + nnapiOperand.type); + break; default: ALOGE("createInputParams Failure at inputIndex %d, type %d", i, nnapiOperand.type); diff --git a/ngraph_creator/src/NgraphNodes.cpp b/ngraph_creator/src/NgraphNodes.cpp index f0aa8f734..16600bd6a 100644 --- a/ngraph_creator/src/NgraphNodes.cpp +++ b/ngraph_creator/src/NgraphNodes.cpp @@ -28,7 +28,11 @@ ngraph::Output NgraphNodes::getOperationOutput(size_t index) { void NgraphNodes::setResultNode(size_t outputIndex, std::shared_ptr resultNode) { ALOGD("setResultNode %zu", outputIndex); - mResultNodes.push_back(resultNode); + mResultNodes.push_back(std::make_shared(resultNode)); +} + +void NgraphNodes::setSinkNode(std::shared_ptr sinkNode) { + mSinkNodes.push_back(sinkNode); } const std::string& NgraphNodes::getNodeName(size_t index) { @@ -50,7 +54,8 @@ void NgraphNodes::removeInputParameter(std::string name, size_t index) { } std::shared_ptr NgraphNodes::generateGraph() { - return std::make_shared(mResultNodes, mInputParams); + ngraph::SinkVector sinks {mSinkNodes}; + return std::make_shared(mResultNodes, sinks, mInputParams); } void NgraphNodes::setInvalidNode(size_t index) { mNodeNames[index] = ""; } diff --git a/ngraph_creator/src/OperationsFactory.cpp b/ngraph_creator/src/OperationsFactory.cpp index f72f9ffb6..d5c13d625 100755 --- a/ngraph_creator/src/OperationsFactory.cpp +++ b/ngraph_creator/src/OperationsFactory.cpp @@ -109,6 +109,8 @@ std::shared_ptr OperationsFactory::getOperation( return std::make_shared(operationIndex); case OperationType::QUANTIZE: return std::make_shared(operationIndex); + case OperationType::QUANTIZED_LSTM: + return std::make_shared(operationIndex); case OperationType::REDUCE_ALL: return std::make_shared(operationIndex); case OperationType::REDUCE_ANY: diff --git a/utils.h b/utils.h index b7eaf57a3..313781ae6 100755 --- a/utils.h +++ b/utils.h @@ -24,6 +24,7 @@ #include #include #include +#include #include "Driver.h" #include "IENetwork.h" // May be move these out of utils??
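A minimal standalone sketch (not part of the patch) of the scale/zero-point arithmetic this change relies on: quantized constants and inputs are expanded to FP32 for GNA with real = scale * (q - zeroPoint), mirroring OperationsBase::deQuantize(), and FP32 results are written back to quantized output buffers with q = zeroPoint + real / scale, mirroring the output-copy loops in BasePreparedModel.cpp. The helper names and the example scale below are illustrative only.

```cpp
#include <cstdint>
#include <vector>

// real = scale * (q - zeroPoint), mirroring OperationsBase::deQuantize().
template <typename T>
std::vector<float> dequantize(const std::vector<T>& q, float scale, int32_t zeroPoint) {
    std::vector<float> out(q.size());
    for (size_t i = 0; i < q.size(); ++i)
        out[i] = scale * (static_cast<int32_t>(q[i]) - zeroPoint);
    return out;
}

// q = zeroPoint + real / scale, mirroring the quantized output-copy loops.
template <typename T>
std::vector<T> quantize(const std::vector<float>& real, float scale, int32_t zeroPoint) {
    std::vector<T> out(real.size());
    for (size_t i = 0; i < real.size(); ++i)
        out[i] = static_cast<T>(zeroPoint + real[i] / scale);
    return out;
}

int main() {
    // TENSOR_QUANT16_SYMM example: zeroPoint is always 0, scale is per-operand
    // (1/2048 is just an illustrative cell-state scale).
    const float scale = 1.0f / 2048.0f;
    std::vector<int16_t> cellState = {512, -1024, 2048};
    std::vector<float> asFloat = dequantize(cellState, scale, /*zeroPoint=*/0);
    std::vector<int16_t> roundTrip = quantize<int16_t>(asFloat, scale, 0);
    return roundTrip == cellState ? 0 : 1;  // exact round trip for these values
}
```

For TENSOR_QUANT16_SYMM operands the zero point is always 0, so the round trip is exact whenever the real value is an exact multiple of the scale.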