This repository has been archived by the owner on Jul 18, 2024. It is now read-only.

Commit

Add support for QuantizedLSTM operation
This change also adds support for dequantization for GNA, as the GNAPlugin does not support the Convert operation.

TEST=asr-perf-eval runs successfully for encoder 0 and encoder 1
anikulk committed Sep 12, 2021
1 parent 05893b1 commit d767a31
Showing 16 changed files with 717 additions and 31 deletions.
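Background for the diffs below: the GNA path works around the missing Convert support by dequantizing quantized constant operands on the host before they enter the ngraph graph, using the usual affine mapping real = scale * (q - zeroPoint). The following is a minimal standalone sketch of that mapping; it mirrors the deQuantize<T> helper this commit adds to OperationsBase.hpp, but the function and variable names here are illustrative, not part of the driver.

```cpp
#include <cstdint>
#include <vector>

// Affine dequantization: real = scale * (quantized - zeroPoint).
// Mirrors the deQuantize<T> template added to OperationsBase.hpp in this commit.
template <typename T>
std::vector<float> dequantize(const std::vector<T>& input, float scale, int32_t zeroPoint) {
    std::vector<float> output(input.size());
    for (size_t i = 0; i < input.size(); ++i) {
        output[i] = scale * (static_cast<int32_t>(input[i]) - zeroPoint);
    }
    return output;
}

int main() {
    // Example TENSOR_QUANT8_ASYMM data with scale 0.5 and zero point 128.
    std::vector<uint8_t> q = {128, 130, 126};
    std::vector<float> f = dequantize(q, 0.5f, 128);  // -> {0.0, 1.0, -1.0}
    return f.size() == 3 ? 0 : 1;
}
```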
6 changes: 4 additions & 2 deletions BUILD.gn
@@ -118,6 +118,7 @@ shared_library("intel_nnhal") {
"ngraph_creator/operations/src/Pad_V2.cpp",
"ngraph_creator/operations/src/Pow.cpp",
"ngraph_creator/operations/src/Quantize.cpp",
"ngraph_creator/operations/src/QuantizedLSTM.cpp",
"ngraph_creator/operations/src/Reduce_All.cpp",
"ngraph_creator/operations/src/Reduce_Any.cpp",
"ngraph_creator/operations/src/Reduce_Max.cpp",
@@ -194,9 +195,10 @@ shared_library("intel_nnhal") {
"nnapi-support",
"ngraph",
"inference_engine",
"nn-common",
"nn-common",
"ssl",
"crypto"
"crypto",
"MKLDNNPlugin"
]
lib_dirs = [
"${sysroot}/usr/local/deployment_tools/inference_engine/lib/intel64/",
43 changes: 39 additions & 4 deletions BasePreparedModel.cpp
@@ -180,6 +180,8 @@ void asyncExecute(const Request& request, MeasureTiming measure, BasePreparedMod
operandType == OperandType::TENSOR_QUANT8_SYMM ||
operandType == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL)
expectedLength /= 4; // 8bit expected instead of 32bit
else if(operandType == OperandType::TENSOR_QUANT16_SYMM)
expectedLength /= 2; // 16bit expected instead of 32bit
if (rActualLength != expectedLength) {
ALOGE("%s Invalid length at outIndex(%d) Actual:%d Expected:%d", __func__, outIndex,
rActualLength, expectedLength);
@@ -203,12 +205,26 @@ void asyncExecute(const Request& request, MeasureTiming measure, BasePreparedMod
break;
}
case OperandType::TENSOR_QUANT8_ASYMM: {
floatToUint8(srcBlob->buffer().as<float*>(), (uint8_t*)destPtr, srcBlob->size());
modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp);
for (int i = 0; i < srcBlob->size() ; i++) {
*((uint8_t*)destPtr + i) = static_cast<uint8_t>(zp + (*(srcBlob->buffer().as<float*>() + i) / sc));
}
break;
}
case OperandType::TENSOR_QUANT8_SYMM:
case OperandType::TENSOR_QUANT8_ASYMM_SIGNED:
case OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL: {
floatToint8(srcBlob->buffer().as<float*>(), (int8_t*)destPtr, srcBlob->size());
modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp);
for (int i = 0; i < srcBlob->size() ; i++) {
*((int8_t*)destPtr + i) = static_cast<int8_t>(zp + (*(srcBlob->buffer().as<float*>() + i) / sc));
}
break;
}
case OperandType::TENSOR_QUANT16_SYMM: {
modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp);
for (int i = 0; i < srcBlob->size() ; i++) {
*((int16_t*)destPtr + i) = static_cast<int16_t>(zp + (*(srcBlob->buffer().as<float*>() + i) / sc));
}
break;
}
default:
@@ -295,16 +311,21 @@ static std::tuple<ErrorStatus, hidl_vec<V1_2::OutputShape>, Timing> executeSynch
auto outDims = srcBlob->getTensorDesc().getDims();
if (operandType == OperandType::TENSOR_BOOL8 ||
operandType == OperandType::TENSOR_QUANT8_ASYMM ||
operandType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED ||
operandType == OperandType::TENSOR_QUANT8_SYMM ||
operandType == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL)
expectedLength /= 4; // 8bit expected instead of 32bit
else if(operandType == OperandType::TENSOR_QUANT16_SYMM)
expectedLength /= 2; // 16bit expected instead of 32bit
if (rActualLength != expectedLength) {
ALOGE("%s Invalid length(%d) at outIndex(%d)", __func__, rActualLength, outIndex);
// Notify Insufficient Buffer Length to modelInfo
modelInfo->updateOutputshapes(i, outDims, false);
return {ErrorStatus::OUTPUT_INSUFFICIENT_SIZE, modelInfo->getOutputShapes(), kNoTiming};
} else
modelInfo->updateOutputshapes(i, outDims);
float sc;
int32_t zp;
switch (operandType) {
case OperandType::TENSOR_INT32:
case OperandType::TENSOR_FLOAT32: {
@@ -317,12 +338,26 @@ static std::tuple<ErrorStatus, hidl_vec<V1_2::OutputShape>, Timing> executeSynch
break;
}
case OperandType::TENSOR_QUANT8_ASYMM: {
floatToUint8(srcBlob->buffer().as<float*>(), (uint8_t*)destPtr, srcBlob->size());
modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp);
for (int i = 0; i < srcBlob->size() ; i++) {
*((uint8_t*)destPtr + i) = static_cast<uint8_t>(zp + (*(srcBlob->buffer().as<float*>() + i) / sc));
}
break;
}
case OperandType::TENSOR_QUANT8_SYMM:
case OperandType::TENSOR_QUANT8_ASYMM_SIGNED:
case OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL: {
floatToint8(srcBlob->buffer().as<float*>(), (int8_t*)destPtr, srcBlob->size());
modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp);
for (int i = 0; i < srcBlob->size() ; i++) {
*((int8_t*)destPtr + i) = static_cast<int8_t>(zp + (*(srcBlob->buffer().as<float*>() + i) / sc));
}
break;
}
case OperandType::TENSOR_QUANT16_SYMM: {
modelInfo->getOperandScaleZeroPoint(outIndex, sc, zp);
for (int i = 0; i < srcBlob->size() ; i++) {
*((int16_t*)destPtr + i) = static_cast<int16_t>(zp + (*(srcBlob->buffer().as<float*>() + i) / sc));
}
break;
}
default:
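On the output side the driver now does the inverse: inference produces float blobs, and the per-element loops added above in asyncExecute and executeSynchronously requantize them into the operand's storage type as q = zeroPoint + real / scale, using the scale and zero point fetched through getOperandScaleZeroPoint. A standalone sketch of that loop follows; the helper name and the main() driver are illustrative only, and the cast truncates rather than rounds, exactly as the static_cast in the diff does.

```cpp
#include <cstdint>
#include <vector>

// Requantization applied to output blobs: q = zeroPoint + real / scale,
// truncated by the cast to the operand's storage type. This matches the
// loops added above for TENSOR_QUANT8_* and TENSOR_QUANT16_SYMM outputs.
template <typename Q>
void requantize(const float* src, Q* dst, size_t count, float scale, int32_t zeroPoint) {
    for (size_t i = 0; i < count; ++i) {
        dst[i] = static_cast<Q>(zeroPoint + src[i] / scale);
    }
}

int main() {
    std::vector<float> f = {0.0f, 1.0f, -1.0f};
    std::vector<uint8_t> q8(f.size());
    std::vector<int16_t> q16(f.size());
    requantize(f.data(), q8.data(), f.size(), 0.5f, 128);  // -> {128, 130, 126}
    requantize(f.data(), q16.data(), f.size(), 0.5f, 0);   // -> {0, 2, -2}
    return 0;
}
```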
22 changes: 21 additions & 1 deletion ModelManager.cpp
@@ -66,7 +66,10 @@ bool NnapiModelInfo::initializeRunTimeOperandInfo() {
case OperandType::TENSOR_QUANT8_ASYMM:
case OperandType::TENSOR_QUANT8_SYMM:
case OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL:
case OperandType::TENSOR_QUANT8_ASYMM_SIGNED:
case OperandType::TENSOR_QUANT16_SYMM:
to.type = from.type;
to.scale = from.scale;
break;
default:
ALOGE("wrong operand type %d", from.type);
@@ -284,7 +287,8 @@ Blob::Ptr NnapiModelInfo::GetInOutOperandAsBlob(RunTimeOperandInfo& op, const ui
return blob;
}
} else if (op.type == OperandType::TENSOR_QUANT8_SYMM ||
op.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
op.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL ||
op.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
ALOGV(
"check if tensors of type TENSOR_QUANT8_SYMM/TENSOR_QUANT8_SYMM_PER_CHANNEL "
"supported");
@@ -302,6 +306,22 @@
return blob;
}
}
else if (op.type == OperandType::TENSOR_QUANT16_SYMM) {
ALOGV("check if tensors of type TENSOR_QUANT16_SYMM supported");
InferenceEngine::TensorDesc td(InferenceEngine::Precision::I16, toDims(op.dimensions),
InferenceEngine::Layout::ANY);
if (buf == nullptr) {
ALOGD("TENSOR_QUANT16_SYMM buf is NULL !!!!!!!!!!!!!!!");
InferenceEngine::TBlob<int16_t>::Ptr blob =
std::make_shared<InferenceEngine::TBlob<int16_t>>(td);
blob->allocate();
return blob;
} else {
InferenceEngine::TBlob<int16_t>::Ptr blob =
std::make_shared<InferenceEngine::TBlob<int16_t>>(td, (int16_t*)buf, len);
return blob;
}
}
return nullptr;
}

7 changes: 7 additions & 0 deletions ModelManager.h
@@ -104,6 +104,13 @@ class NnapiModelInfo {
return operand.zeroPoint;
}

void getOperandScaleZeroPoint(int index, float& scale, int32_t& zp) {
auto operand = getOperand(index);
scale = operand.scale;
zp = operand.zeroPoint;
return;
}

RunTimeOperandInfo& getRuntimeOperand(uint32_t index) {
return mOperands[mModel.main.inputIndexes[index]];
}
16 changes: 11 additions & 5 deletions gna/GnaPreparedModel.cpp
@@ -39,15 +39,21 @@ bool GnaPreparedModel::initialize(const Model& model) {
ALOGE("%s ngraph generation failed", __func__);
return false;
}
auto ngraph_net = std::make_shared<InferenceEngine::CNNNetwork>(ngraph_function);
try {
auto ngraph_net = std::make_shared<InferenceEngine::CNNNetwork>(ngraph_function);
#if __ANDROID__
ngraph_net->serialize("/data/vendor/neuralnetworks/ngraph_ir.xml",
ngraph_net->serialize("/data/vendor/neuralnetworks/ngraph_ir.xml",
"/data/vendor/neuralnetworks/ngraph_ir.bin");
#else
ngraph_net->serialize("/tmp/ngraph_ir.xml", "/tmp/ngraph_ir.bin");
ngraph_net->serialize("/tmp/ngraph_ir.xml", "/tmp/ngraph_ir.bin");
#endif
mPlugin = std::make_shared<IENetwork>(ngraph_net);
mPlugin->loadNetwork();
mPlugin = std::make_shared<IENetwork>(ngraph_net);
mPlugin->loadNetwork();
} catch (const std::exception& ex) {
ALOGE("%s Exception !!! %s", __func__, ex.what());
return false;
}


ALOGV("Exiting %s", __func__);
return true;
1 change: 1 addition & 0 deletions ngraph_creator/include/NgraphNetworkCreator.hpp
@@ -17,6 +17,7 @@ class NgraphNetworkCreator {
std::vector<std::shared_ptr<OperationsBase>> mOperationNodes;
std::shared_ptr<NgraphNodes> mNgraphNodes;
OperationsFactory mOpFactoryInstance;
const IntelDeviceType mPluginType;
bool createInputParams();
bool initializeModel();

5 changes: 4 additions & 1 deletion ngraph_creator/include/NgraphNodes.hpp
@@ -17,7 +17,8 @@ class NgraphNodes {
// in the path to current Operand.
std::vector<bool> mForcedNchw;
std::vector<std::shared_ptr<ngraph::opset3::Parameter>> mInputParams;
std::vector<std::shared_ptr<ngraph::Node>> mResultNodes;
std::vector<std::shared_ptr<ngraph::op::Result>> mResultNodes;
std::vector<std::shared_ptr<ngraph::op::Sink>> mSinkNodes;
// mNodeNames are only populated when requested, as only Inputs and Result NodeNames are
// required.
std::map<int, std::string> mNodeNames;
@@ -30,6 +31,8 @@
void setOutputAtOperandIndex(size_t index, ngraph::Output<ngraph::Node> output);
ngraph::Output<ngraph::Node> getOperationOutput(size_t index);
void setResultNode(size_t outputIndex, std::shared_ptr<ngraph::Node> resultNode);
void setSinkNode(std::shared_ptr<ngraph::op::Sink> sinkNode);


const std::string& getNodeName(size_t index);
void removeInputParameter(std::string name, size_t index);
1 change: 1 addition & 0 deletions ngraph_creator/include/OperationsFactory.hpp
@@ -46,6 +46,7 @@
#include <Pad_V2.hpp>
#include <Pow.hpp>
#include <Quantize.hpp>
#include <QuantizedLSTM.hpp>
#include <RNN.hpp>
#include <ROI_Align.hpp>
#include <ROI_Pooling.hpp>
74 changes: 65 additions & 9 deletions ngraph_creator/operations/include/OperationsBase.hpp
@@ -37,6 +37,7 @@ class OperationsBase {
// override createNodeForPlugin in case sPluginType specific implementation is required
virtual std::shared_ptr<ngraph::Node> createNodeForPlugin();
void addResultNode(size_t index, std::shared_ptr<ngraph::Node> resultNode);
void addSinkNode(std::shared_ptr<ngraph::op::Sink> sinkNode);

// helper functions
bool checkOperandType(uint32_t operandIndex, const int32_t expectedOperandType,
@@ -46,12 +47,30 @@
const vec<uint32_t> getInputOperandDimensions(uint32_t inputIndex);
bool isValidInputTensor(uint32_t inputIndex);

template<typename T>
bool deQuantize(const T* inputData, const uint32_t& len, const float scale,
const int32_t zeroPoint, float* outputData) {
int32_t value;
for (int i = 0; i < len; ++i) {
value = *(inputData + i);
outputData[i] = static_cast<float>(scale * (value - zeroPoint));
}
return true;
}

std::shared_ptr<ngraph::Node> getInputNode(uint32_t inputIndex, bool dequantize = true) {
std::shared_ptr<ngraph::Node> input;
auto operandIndex = sModelInfo->getOperationInput(mNnapiOperationIndex, inputIndex);
auto operandType = sModelInfo->getOperandType(operandIndex);
float scale;
int32_t zp;
if (sModelInfo->isOperandLifeTimeConst(operandIndex)) {
auto operandDims = getInputOperandDimensions(inputIndex);
std::vector<float> f_operandValues;

if (sPluginType == IntelDeviceType::GNA) {
sModelInfo->getOperandScaleZeroPoint(operandIndex, scale, zp);
}
ngraph::element::Type elementType;
switch (operandType) {
case OperandType::TENSOR_FLOAT32: {
@@ -61,9 +80,16 @@
break;
}
case OperandType::TENSOR_INT32: {
elementType = ngraph::element::i32;
auto operandValues = sModelInfo->GetConstVecOperand<int>(operandIndex);
input = createConstNode(elementType, toNgraphShape(operandDims), operandValues);
if (sPluginType == IntelDeviceType::GNA) {
elementType = ngraph::element::f32;
f_operandValues.resize(operandValues.size());
deQuantize(operandValues.data(), operandValues.size(), scale, zp, f_operandValues.data());
}
else {
elementType = ngraph::element::i32;
input = createConstNode(elementType, toNgraphShape(operandDims), operandValues);
}
break;
}
case OperandType::TENSOR_BOOL8: {
Expand All @@ -73,16 +99,44 @@ class OperationsBase {
break;
}
case OperandType::TENSOR_QUANT8_ASYMM: {
elementType = ngraph::element::u8;
auto operandValues = sModelInfo->GetConstVecOperand<uint8_t>(operandIndex);
input = createConstNode(elementType, toNgraphShape(operandDims), operandValues);
if (sPluginType == IntelDeviceType::GNA) {
elementType = ngraph::element::f32;
f_operandValues.resize(operandValues.size());
deQuantize(operandValues.data(), operandValues.size(), scale, zp, f_operandValues.data());
}
else {
elementType = ngraph::element::u8;
input = createConstNode(elementType, toNgraphShape(operandDims), operandValues);
}
break;
}
case OperandType::TENSOR_QUANT8_SYMM:
case OperandType::TENSOR_QUANT8_ASYMM_SIGNED:
case OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL: {
elementType = ngraph::element::i8;
auto operandValues = sModelInfo->GetConstVecOperand<int8_t>(operandIndex);
input = createConstNode(elementType, toNgraphShape(operandDims), operandValues);
if (sPluginType == IntelDeviceType::GNA) {
elementType = ngraph::element::f32;
f_operandValues.resize(operandValues.size());
deQuantize(operandValues.data(), operandValues.size(), scale, zp, f_operandValues.data());
}
else {
elementType = ngraph::element::i8;
input = createConstNode(elementType, toNgraphShape(operandDims), operandValues);
}
break;
}
case OperandType::TENSOR_QUANT16_SYMM: {
auto operandValues = sModelInfo->GetConstVecOperand<int16_t>(operandIndex);
if (sPluginType == IntelDeviceType::GNA) {
elementType = ngraph::element::f32;
f_operandValues.resize(operandValues.size());
deQuantize(operandValues.data(), operandValues.size(), scale, zp, f_operandValues.data());
}
else {
elementType = ngraph::element::i16;
input = createConstNode(elementType, toNgraphShape(operandDims), operandValues);
}
break;
}
default: {
@@ -91,12 +145,14 @@
return nullptr;
}
}

if (sPluginType == IntelDeviceType::GNA && operandType != OperandType::TENSOR_FLOAT32) {
input = createConstNode(elementType, toNgraphShape(operandDims), f_operandValues);
}
} else {
input = mNgraphNodes->getOperationOutput(operandIndex).get_node_shared_ptr();
}

if (operandType == OperandType::TENSOR_QUANT8_ASYMM && dequantize) {
if (operandType != OperandType::TENSOR_FLOAT32 && dequantize
&& sPluginType != IntelDeviceType::GNA && !sModelInfo->isOperandLifeTimeTemp(operandIndex)) {
input = DequantizeNode(input, operandIndex, ngraph::element::f32);
}

40 changes: 40 additions & 0 deletions ngraph_creator/operations/include/QuantizedLSTM.hpp
@@ -0,0 +1,40 @@
#pragma once

#include <OperationsBase.hpp>

namespace android {
namespace hardware {
namespace neuralnetworks {
namespace nnhal {

class QuantizedLSTM : public OperationsBase {
public:
QuantizedLSTM(int operationIndex);
bool validate() override;
std::shared_ptr<ngraph::Node> createNode() override;
void connectOperationToGraph() override;

std::shared_ptr<ngraph::Node> add(const ngraph::Output<ngraph::Node>& lhs,
const ngraph::Output<ngraph::Node>& rhs);
std::shared_ptr<ngraph::Node> sub(const ngraph::Output<ngraph::Node>& lhs,
const ngraph::Output<ngraph::Node>& rhs);
std::shared_ptr<ngraph::Node> mul(const ngraph::Output<ngraph::Node>& lhs,
const ngraph::Output<ngraph::Node>& rhs);
std::shared_ptr<ngraph::Node> matMul(const ngraph::Output<ngraph::Node>& lhs,
const ngraph::Output<ngraph::Node>& rhs,
bool transpose_lhs, bool transpose_rhs);
std::shared_ptr<ngraph::Node> clip(const ngraph::Output<ngraph::Node>& data,
float m_clip) const;
std::shared_ptr<ngraph::Node> applyActivation(const std::shared_ptr<ngraph::Node>& arg,
int activationFn) const;
std::shared_ptr<ngraph::Node> LayerNorm(const ngraph::Output<ngraph::Node>& input,
const std::shared_ptr<ngraph::Node>& normalizedweights,
const std::shared_ptr<ngraph::Node>& bias);

bool isValidInputTensor(uint32_t inputIndex);
};

} // namespace nnhal
} // namespace neuralnetworks
} // namespace hardware
} // namespace android
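QuantizedLSTM.cpp itself is not shown above, but the helpers declared in this header (matMul, add, mul, clip, applyActivation, LayerNorm) suggest the cell is composed from primitive ngraph ops. As a hedged reference only, assuming the standard layer-normalized LSTM recurrence that NNAPI's quantized LSTM operations follow (not confirmed by this header alone), the gates would be:

```latex
i_t = \sigma\big(\mathrm{LN}(W_i x_t + R_i h_{t-1};\,\gamma_i,\beta_i)\big), \qquad
f_t = \sigma\big(\mathrm{LN}(W_f x_t + R_f h_{t-1};\,\gamma_f,\beta_f)\big)

\tilde{c}_t = \tanh\big(\mathrm{LN}(W_c x_t + R_c h_{t-1};\,\gamma_c,\beta_c)\big), \qquad
c_t = \mathrm{clip}\big(f_t \odot c_{t-1} + i_t \odot \tilde{c}_t,\; t_{\mathrm{cell}}\big)

o_t = \sigma\big(\mathrm{LN}(W_o x_t + R_o h_{t-1};\,\gamma_o,\beta_o)\big), \qquad
h_t = o_t \odot \tanh(c_t)
```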