diff --git a/nntrainer/npu/qnn/LLaMAPackage/Makefile b/nntrainer/npu/qnn/LLaMAPackage/Makefile new file mode 100644 index 000000000..2e86f996f --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/Makefile @@ -0,0 +1,360 @@ + +#============================================================================= +# Copyright (c) 2023 Qualcomm Technologies, Inc. +# All Rights Reserved. +# Confidential and Proprietary - Qualcomm Technologies, Inc. +#============================================================================= + +# users should provide locations for QNN_INCLUDE and HEXAGON_SDK_ROOT +# export HEXAGON_SDK_ROOT = /path/to/hexagon-sdk + +# check all setup prerequisites if the command goal is not clean +ifneq ($(MAKECMDGOALS),clean) +ifndef QNN_INCLUDE +$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") +QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN +endif +ifeq ($(wildcard $(QNN_INCLUDE)),) +$(error "ERROR: QNN_INCLUDE path is not set. QNN include paths must be set to obtain BE headers necessary to compile the package") +endif +ifndef QNN_TARGET_LIB +$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") +QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android +endif +ifeq ($(wildcard $(QNN_TARGET_LIB)),) +ifeq ($(MAKECMDGOALS),htp_aarch64) +$(error "ERROR: QNN_TARGET_LIB is needed to compile package for aarch64") +else ifeq ($(MAKECMDGOALS),all) +$(info "WARNING:QNN_TARGET_LIB may need to be defined to compile packages") +endif +endif + +ifndef HEXAGON_SDK_ROOT +$(error "ERROR: HEXAGON_SDK_ROOT is not set. Hexagon-SDK path must be set to the latest hexagon-sdk-x.y.z") +endif + +ifeq ($(wildcard $(HEXAGON_SDK_ROOT)),) +$(error "ERROR: HEXAGON_SDK_ROOT is not set correctly. 
Please set HEXAGON_SDK_ROOT to latest hexagon-sdk-X.Y.Z path") +endif + +HEXAGON_SDK_BASE := $(dir $(HEXAGON_SDK_ROOT)) + +ifndef HEXAGON_TOOLS_ROOT +HEXAGON_TOOLS_ROOT = $(DEFAULT_HEXAGON_TOOLS_ROOT) +endif + +$(info "HEXAGON_TOOLS_ROOT is [${HEXAGON_TOOLS_ROOT}]" ) + +ifndef V +V = v75 +endif +$(info "V is [${V}]" ) +ifndef BUILD +BUILD = $(DEFAULT_BUILD) +endif + +$(info "BUILD is [${BUILD}]" ) + +QHL_DIR = $(HEXAGON_SDK_ROOT)/libs/qhl +QHL_HVX_DIR = $(HEXAGON_SDK_ROOT)/libs/qhl_hvx +COMPLETE_TOOLS_VERSION = $(shell basename $(HEXAGON_TOOLS_ROOT)) +TEMP_VAR = $(subst ., ,$(COMPLETE_TOOLS_VERSION)) +TOOLS_VERSION = $(word 1,$(TEMP_VAR))$(word 2,$(TEMP_VAR)) +BUILD_DIR = hexagon_$(BUILD)_toolv$(TOOLS_VERSION)_$(V) +PREBUILT_DIR = hexagon_toolv$(TOOLS_VERSION)_v65 + +$(info "TOOLS_VERSION is [${TOOLS_VERSION}]" ) + + +QHL_DIR_BIN = $(QHL_DIR)/$(BUILD_DIR) +QHL_HVX_DIR_BIN = $(QHL_HVX_DIR)/$(BUILD_DIR) + +QHL_INC_DIRS := $(QHL_DIR)/inc/qhmath $(QHL_DIR)/inc/qhcomplex $(QHL_DIR)/inc/qhdsp $(QHL_DIR)/inc/qhblas +# QHL_LIBS = $(QHL_DIR_BIN)/libqhdsp.a $(QHL_DIR_BIN)/libqhcomplex.a $(QHL_DIR_BIN)/libqhmath.a $(QHL_DIR_BIN)/libqhblas.a + +QHL_HVX_INC_DIRS := $(QHL_HVX_DIR)/inc/internal $(QHL_HVX_DIR)/inc/qhdsp_hvx $(QHL_HVX_DIR)/inc/qhblas_hvx +# QHL_HVX_LIBS = $(QHL_HVX_DIR_BIN)/libqhdsp_hvx.a $(QHL_HVX_DIR_BIN)/libqhblas_hvx.a $(QHL_DIR_BIN)/libqhmath.a $(QHL_DIR_BIN)/libqhcomplex.a + +WORKER_POOL_INC := $(HEXAGON_SDK_ROOT)/libs/worker_pool/inc/ $(HEXAGON_SDK_ROOT)/incs/stddef/ $(HEXAGON_SDK_ROOT)/incs/ +WORKER_POOL_LIB := $(HEXAGON_SDK_ROOT)/libs/worker_pool/prebuilt/$(PREBUILT_DIR)/libworker_pool.a + +$(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]") +# Users should note that the tools version may change between hexagon sdk versions +# Following combination of SDK and Tool version is supported +HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_BASE)/HexagonSDK/ +HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_BASE)/HexagonSDK/ +HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_BASE)/HexagonSDK/ +HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_BASE)/HexagonSDK/ +#Updated to point to latest sdk to match with libQnnHtp.so +HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_BASE)/HexagonSDK/ +HEXAGON_TOOLS_VERSION_V68 := 8.4.09 +HEXAGON_TOOLS_VERSION_V69 := 8.5.03 +HEXAGON_TOOLS_VERSION_V73 := 8.7.06 +HEXAGON_TOOLS_VERSION_V75 := 8.7.06 +#Updated to point to latest sdk to match with libQnnHtp.so +HEXAGON_TOOLS_VERSION_X86 := 8.7.06 + +ifndef ANDROID_NDK_ROOT +ifeq ($(MAKECMDGOALS),htp_aarch64) +$(error "ERROR: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") +else ifeq ($(MAKECMDGOALS),all) +$(info "WARNING: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") +endif +endif + +ifndef PACKAGE_NAME +export +PACKAGE_NAME := $(notdir $(shell pwd)) +$(info "INFO: No package name defined. 
Using current directory name: $(PACKAGE_NAME) as the package name") +endif + +WORK := build +SRC_DIR := src +OP_SRC_DIR := src/ops +OP_INCLUDE_DIR := ./include +OP_INCLUDES = #$(wildcard $(OP_INCLUDE_DIR)/*.h) user defined if any op specific headers are needed, add -I to common flags +LIBRARY_NAME := libQnn$(PACKAGE_NAME).so +SUPPORTED_TARGETS = x86_64-linux-clang hexagon-v68 hexagon-v69 hexagon-v73 hexagon-v75 aarch64-android + +INCLUDES = $(addprefix -I,$(QHL_INC_DIRS)) $(addprefix -I,$(QHL_HVX_INC_DIRS)) $(addprefix -I,$(WORKER_POOL_INC)) -I$(HEXAGON_SDK_BASE)/HexagonSDK/libs/qhl_hvx/inc/qhmath_hvx/ + + +COMMON_CXX_FLAGS = -std=c++17 -I$(QNN_INCLUDE) -fPIC -Wall -Wreorder -Wno-missing-braces -fno-builtin -Wno-unused-function +COMMON_CXX_FLAGS += -Werror -Wno-format -Wno-unused-command-line-argument -fvisibility=default -stdlib=libc++ +COMMON_CXX_FLAGS += -DQNN_API="__attribute__((visibility(\"default\")))" -D__QAIC_HEADER_EXPORT="__attribute__((visibility(\"default\")))" + +X86_LIBNATIVE_RELEASE_DIR := $(HEXAGON_SDK_ROOT_X86)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_X86)/Tools + +# Ensure hexagon sdk tool version can be retrieved +ifeq ($(wildcard $(X86_LIBNATIVE_RELEASE_DIR)/.),) +$(error "Cannot retrieve hexagon tools from: $(X86_LIBNATIVE_RELEASE_DIR). \ + \ + Please check that hexagon tools version is correct. Expected: $(HEXAGON_TOOLS_VERSION_X86)") +endif + +#Check tools for hexagon_v75 are present. +ifeq ($(MAKECMDGOALS),htp_v75) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V75)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V75 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V75)") +endif +endif + +#Check tools for hexagon_v68 are present. +ifeq ($(MAKECMDGOALS),htp_v68) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V68)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V68 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V68)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v69) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V69)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V69 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V69)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v73) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V73)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V73 is set incorrectly. 
Cannot retrieve $(HEXAGON_SDK_ROOT_V73)") +endif +endif + +endif +OP_SOURCES = $(wildcard $(OP_SRC_DIR)/*.cpp) +OTHER_SOURCES = $(wildcard $(SRC_DIR)/*.cpp) +HFILES = $(wildcard $(QNN_INCLUDE)/*.h) +HFILES += $(wildcard $(QNN_INCLUDE)/HTP/*.h) +HFILES += $(wildcard $(QNN_INCLUDE)/HTP/core/*.h) +OP_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OP_SOURCES))) +OTHER_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OTHER_SOURCES))) + +#======= Assembly ======== +OP_SOURCES_ASM_X86 += $(wildcard $(OP_SRC_DIR)/x86_asm/*.S) +OP_OBJS_ASM_X86 += $(subst /x86_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_X86)))) +OP_SOURCES_ASM_V68 += $(wildcard $(OP_SRC_DIR)/v68_asm/*.S) +OP_OBJS_ASM_V68 += $(subst /v68_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V68)))) +OP_SOURCES_ASM_V69 += $(wildcard $(OP_SRC_DIR)/v69_asm/*.S) +OP_OBJS_ASM_V69 += $(subst /v69_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V69)))) +OP_SOURCES_ASM_V73 += $(wildcard $(OP_SRC_DIR)/v73_asm/*.S) +OP_OBJS_ASM_V73 += $(subst /v73_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V73)))) +OP_SOURCES_ASM_V75 += $(wildcard $(OP_SRC_DIR)/v75_asm/*.S) +OP_OBJS_ASM_V75 += $(subst /v75_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V75)))) +OP_SOURCES_ASM_ANDROID += $(wildcard $(OP_SRC_DIR)/android_asm/*.S) +OP_OBJS_ASM_ANDROID += $(subst /android_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_ANDROID)))) + +$(info "ASSEMBLIES : $(OP_SOURCES_ASM_ANDROID), $(OP_SOURCES_ASM_V75), $(OP_SOURCES_ASM_V68)") + +all: htp_v68 htp_x86 htp_aarch64 + +#============================================================================================================ +# Setup compiler, compiler instructions and linker for x86 +X86_CXX ?= clang++ +X86_LDFLAGS:= -Wl,--whole-archive -L$(X86_LIBNATIVE_RELEASE_DIR)/libnative/lib -lnative -Wl,--no-whole-archive -lpthread +X86_C_FLAGS := -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX +X86_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(X86_C_FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof +linux_objs = +#============================================================================================================ +# Setup compiler, compiler instructions and linker for hexagon +HEXAGON_CXX_FLAGS := $(COMMON_CXX_FLAGS) -mhvx -mhvx-length=128B -mhmx -DUSE_OS_QURT -O2 -Wno-reorder -DPREPARE_DISABLED +HEXAGON_CXX_FLAGS += -I$(HEXAGON_SDK_ROOT_V75)/incs -I$(HEXAGON_SDK_ROOT_V75)/incs/stddef + +HEXAGON_CXX_FLAGS_V68 := $(HEXAGON_CXX_FLAGS) -mv68 -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/qurt -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/posix +HEXAGON_CXX_FLAGS_V69 := $(HEXAGON_CXX_FLAGS) -mv69 -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/qurt -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/posix +HEXAGON_CXX_FLAGS_V73 := $(HEXAGON_CXX_FLAGS) -mv73 -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/qurt -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/posix +HEXAGON_CXX_FLAGS_V75 := $(HEXAGON_CXX_FLAGS) -mv75 -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/qurt -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/posix + +HEXAGON_CXX_V68 := $(HEXAGON_SDK_ROOT_V68)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V68)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V69 := $(HEXAGON_SDK_ROOT_V69)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V69)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V73 := 
$(HEXAGON_SDK_ROOT_V73)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V73)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V75 := $(HEXAGON_SDK_ROOT_V75)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V75)/Tools/bin/hexagon-clang++ + +HEX_LDFLAGS = +hexagon_objs = +#============================================================================================================ +# Setup compiler, compiler instructions and linker for aarch64 +AARCH64_C__FLAGS = -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -DANDROID +AARCH64_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(AARCH64_C__FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -Wno-unused-variable -Wno-unused-parameter -Wno-missing-braces -Wno-sign-compare -Wno-unused-private-field -Wno-unused-variable -Wno-ignored-qualifiers -Wno-missing-field-initializers +ARM_CLANG_OPTS =--target=aarch64-none-linux-android21 --sysroot=$(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/sysroot -stdlib=libc++ -static-libstdc++ +AARCH64_CXX = $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin/clang++ $(ARM_CLANG_OPTS) +AARCH64_LDFLAGS = -L$(QNN_TARGET_LIB) -lQnnHtp -lQnnHtpPrepare +aarch64_objs = +#============================================================================================================ +# Setup targets and goals + +htp_x86: X86_BUILD + +htp_v68: HEXAGON_BUILD_V68 + +htp_v69: HEXAGON_BUILD_V69 + +htp_v73: HEXAGON_BUILD_V73 + +htp_v75: HEXAGON_BUILD_V75 + +htp_aarch64: AARCH64_BUILD + +AARCH64_BUILD: $(WORK)/aarch64-android/$(LIBRARY_NAME) + +HEXAGON_BUILD_V68: $(WORK)/hexagon-v68/$(LIBRARY_NAME) + +HEXAGON_BUILD_V69: $(WORK)/hexagon-v69/$(LIBRARY_NAME) + +HEXAGON_BUILD_V73: $(WORK)/hexagon-v73/$(LIBRARY_NAME) + +HEXAGON_BUILD_V75: $(WORK)/hexagon-v75/$(LIBRARY_NAME) + +X86_BUILD: $(WORK)/x86_64-linux-clang/$(LIBRARY_NAME) + + +define build_objs = +ifneq ($(filter $(2),$(SUPPORTED_TARGETS)),) +$(2)_objs += $(foreach x,$(1),$(WORK)/$(2)/$(x)) +else +$$(error "Unknown target option provided: $(2): Supported targets are: $(SUPPORTED_TARGETS)") +endif +endef + +$(eval $(call build_objs,$(OTHER_OBJS),x86_64-linux-clang)) +$(eval $(call build_objs,$(OP_OBJS),x86_64-linux-clang)) +$(eval $(call build_objs,$(OP_OBJS_ASM_X86),x86_64-linux-clang)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v68)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v68)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V68),hexagon-v68)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v69)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v69)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V69),hexagon-v69)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v73)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v73)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V73),hexagon-v73)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v75)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v75)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v75)) +$(eval $(call build_objs,$(OTHER_OBJS),aarch64-android)) +$(eval $(call build_objs,$(OP_OBJS),aarch64-android)) +$(eval $(call build_objs,$(OP_OBJS_ASM_ANDROID),aarch64-android)) + +# x86 +$(WORK)/x86_64-linux-clang $(WORK)/hexagon-v68 $(WORK)/hexagon-v69 $(WORK)/hexagon-v73 $(WORK)/hexagon-v75 $(WORK)/aarch64-android: + @mkdir -p $@/ops + +$(WORK)/x86_64-linux-clang/%.o: $(SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) 
-DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -DREFERENCE_OP -c $< -o $@ + +$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/x86_asm/%.S | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/x86_64-linux-clang/$(LIBRARY_NAME): $(x86_64-linux-clang_objs) | $(HFILES) + $(X86_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(X86_LDFLAGS) + +# v68 +$(WORK)/hexagon-v68/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/v68_asm/%.S | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v68/$(LIBRARY_NAME): $(hexagon-v68_objs) | $(HFILES) + $(HEXAGON_CXX_V68) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +# v69 +$(WORK)/hexagon-v69/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/v69_asm/%.S | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/$(LIBRARY_NAME): $(hexagon-v69_objs) | $(HFILES) + $(HEXAGON_CXX_V69) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +# v73 +$(WORK)/hexagon-v73/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD $(INCLUDES) -DHVX_OP -c $< -o $@ + +$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/v73_asm/%.S | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v73/$(LIBRARY_NAME): $(hexagon-v73_objs) | $(HFILES) + $(HEXAGON_CXX_V73) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) $(WORKER_POOL_LIB) + +#v75 +$(WORK)/hexagon-v75/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD $(INCLUDES) -DHVX_OP -c $< -o $@ + +$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/v75_asm/%.S | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v75/$(LIBRARY_NAME): $(hexagon-v75_objs) | $(HFILES) + $(HEXAGON_CXX_V75) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) $(WORKER_POOL_LIB) + +# aarch64 +$(WORK)/aarch64-android/%.o: $(SRC_DIR)/%.cpp | $(WORK)/aarch64-android + $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/aarch64-android + $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -DREFERENCE_OP -c $< -o $@ + +$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/android_asm/%.S | $(WORK)/aarch64-android + 
$(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/aarch64-android/$(LIBRARY_NAME): $(aarch64-android_objs) | $(HFILES) + $(AARCH64_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(AARCH64_LDFLAGS) + +clean: + -rm -rf $(WORK) + +.PHONY: all clean + diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/LLaMAPackageInterface.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/LLaMAPackageInterface.cpp new file mode 100644 index 000000000..0b264e151 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/LLaMAPackageInterface.cpp @@ -0,0 +1,400 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/QnnHtpCommon.h" +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "HTP/core/unique_types.h" +#include "QnnOpPackage.h" +#include "QnnSdkBuildId.h" + +DEFINE_UNIQ_TY() +BEGIN_PKG_OPS_OPTS_LIST() + +/** Note that the order of declarations given here defines the order in which + * ops and graph optimizations are registered to the HTP Core. Append the latest + * OpName at the bottom + */ +DECLARE_PKG_OPS_OPTS_LIST(PKG_IRoPE) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMALinear) +DECLARE_PKG_OPS_OPTS_LIST(PKG_SplitInput) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAReLU) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMASuperSiLU) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAQuantize) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAMul) +DECLARE_PKG_OPS_OPTS_LIST(PKG_KVCache) +DECLARE_PKG_OPS_OPTS_LIST(PKG_Attention) +DECLARE_PKG_OPS_OPTS_LIST(PKG_QLayerNorm) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAAdd) +DECLARE_PKG_OPS_OPTS_LIST(PKG_CausalMask) +DECLARE_PKG_OPS_OPTS_LIST(PKG_HeadMatmul) +DECLARE_PKG_OPS_OPTS_LIST(PKG_RoPE) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMADequantize) +DECLARE_PKG_OPS_OPTS_LIST(PKG_WNop) +DECLARE_PKG_OPS_OPTS_LIST(PKG_MergeOutput) +DECLARE_PKG_OPS_OPTS_LIST(PKG_RMSNorm) +DECLARE_PKG_OPS_OPTS_LIST(PKG_SiLU) + +END_PKG_OPS_OPTS_LIST() + +// op package info +static constexpr auto sg_packageName = + THIS_PKG_NAME_STR; // package name passed in as compile flag + +static std::array sg_opNames{ + {"IRoPE", "LLaMALinear", "SplitInput", "LLaMAReLU", "LLaMASuperSiLU", + "LLaMAQuantize", "LLaMAMul", "KVCache", "Attention", "QLayerNorm", + "LLaMAAdd", "CausalMask", "HeadMatmul", "RoPE", "LLaMADequantize", "WNop", + "MergeOutput", "RMSNorm", "SiLU"}}; + +static Qnn_ApiVersion_t sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT; +static QnnOpPackage_Info_t sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; + +// global data +static QnnOpPackage_GlobalInfrastructure_t sg_globalInfra = + nullptr; // global infrastructure not in use for now +static bool sg_packageInitialized = false; + +/* + * user provided logging call back function + * currently only supported on linux x86-64 and nonrpc versions + * typedef void (*QnnLog_Callback_t)(const char* fmt, + * QnnLog_Level_t level, + * uint64_t timestamp, + * va_list args); + * usage: if(sg_logInitialized && level <= sg_maxLogLevel) + * sg_logCallback(fmt, level, timestamp, args); + * + * for cross rpc versions, skel side user provided logging call back function + * can be defined as part of op packages. 
maximal log level sg_maxLogLevel + * can be set by Qnn_ErrorHandle_t LLaMAPackageLogSetLevel(QnnLog_Level_t + * maxLogLevel) + */ +/* + * for alternative logging method provided by HTP core, please refer to log.h + */ +static QnnLog_Callback_t sg_logCallback = + nullptr; // user provided call back function pointer for logging +static QnnLog_Level_t sg_maxLogLevel = + (QnnLog_Level_t)0; // maximal log level used in user provided logging +static bool sg_logInitialized = + false; // tracks whether user provided logging method has been initialized + +/* + * op initialization + * needs to be global in the package + * one initialization per package before any op definitions + * syntax: INIT_PACKAGE_OP_DEF() + */ +INIT_PACKAGE_OP_DEF() + +/* + * optimization initialization + * needs to be global in the package + * one initialization per package before any optimization definitions + * syntax: INIT_PACKAGE_OPTIMIZATION_DEF() + */ +INIT_PACKAGE_OPTIMIZATION_DEF() + +/* + * op parameter order initialization + * needs to be global in the package + * one initialization per package before any op parameter order definitions + * syntax: INIT_PACKAGE_PARAM_ORDER_DEF() + */ +INIT_PACKAGE_PARAM_ORDER_DEF() + +/* + * axis parameter name list + * optional + * needs to be global in the package + * one list per package + * for listing axis parameter names passed into Qnn_AddNode API + * HTP backend auto-adjusts values in axis parameters based on HTP backfilling + * note: HTP backend backfills tensor dimensions to 4 dimensions + * syntax: LIST_PACKAGE_AXIS_PARAMS(...) + * e.g. LIST_PACKAGE_AXIS_PARAMS("Axis", "AXIS", "axis") + */ +// LIST_PACKAGE_AXIS_PARAMS() + +/* + * per-channel quantized op name list + * optional + * needs to be global in the package + * one list per package + * for listing op names which support per-channel quantization + * per-axis quantization info of an op is embeded in axisScaleOffsetEncoding + * inside Qnn_Tensor_t types + * HTP backend only supports per-channel scale ops + * i.e. along last dimension, offset is always zero + * if an op name is marked as having per-channel scale support, and in + * QNN_AddNode, at least one input, parameter, or output has + * QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET type: + * then: + * HTP backend will pass to op implementation function the following: + * output(s), input(s), parameter(s), + * outputPerChannelScale(s), inputPerChannelScale(s), + * paramPerChannelScale(s) + * + * optimization rules can be used to remove extra perChannelScale tensors + * + * syntax: LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) + * e.g. LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(sg_op1Name, sg_op2Name) + */ + +// LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + +/* + * Declare and define the special intialize function for HTP Backend to load + */ +INIT_PKG_CORE_INIT_FUNC() + +/* op package API's */ + +Qnn_ErrorHandle_t +LLaMAPackageInit(QnnOpPackage_GlobalInfrastructure_t infrastructure) { + if (sg_packageInitialized) + return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; + + /* + * op parameter order registration + * registers all defined op parameter orders in the package + * syntax: REGISTER_PACKAGE_PARAM_ORDERS() + */ + REGISTER_PACKAGE_PARAM_ORDERS() + + /* + * op axis parameter name registration + * registers all axis parameter names in the package + * used with LIST_PACKAGE_AXIS_PARAMS(...) 
+ * syntax: REGISTER_PACKAGE_AXIS_PARAMS() + */ + REGISTER_PACKAGE_AXIS_PARAMS() + + /* + * per-channel scale op name registration + * registers all per-channel scale op names in the package + * used with LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) + * syntax: REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + */ + REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + + sg_globalInfra = infrastructure; + sg_packageInitialized = true; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t LLaMAPackageGetInfo(const QnnOpPackage_Info_t **info) { + if (!sg_packageInitialized) + return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + if (!info) + return QNN_OP_PACKAGE_ERROR_INVALID_INFO; + + sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; + sg_packageInfo.packageName = sg_packageName; + sg_packageInfo.operationNames = sg_opNames.data(); + sg_packageInfo.numOperations = sg_opNames.size(); + sg_packageInfo.sdkBuildId = QNN_SDK_BUILD_ID; + sg_packageInfo.sdkApiVersion = &sg_sdkApiVersion; + + *info = &sg_packageInfo; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t LLaMAPackageLogInitialize(QnnLog_Callback_t callback, + QnnLog_Level_t maxLogLevel) { + if (sg_logInitialized) + return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; + if (!callback) + return QNN_LOG_ERROR_INVALID_ARGUMENT; + if (maxLogLevel < QNN_LOG_LEVEL_ERROR) + return QNN_LOG_ERROR_INVALID_ARGUMENT; + sg_logCallback = callback; + sg_maxLogLevel = maxLogLevel; + sg_logInitialized = true; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t LLaMAPackageLogSetLevel(QnnLog_Level_t maxLogLevel) { + if (maxLogLevel < QNN_LOG_LEVEL_ERROR) + return QNN_LOG_ERROR_INVALID_ARGUMENT; + sg_maxLogLevel = maxLogLevel; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t LLaMAPackageLogTerminate() { + if (!sg_logInitialized) + return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + sg_logCallback = nullptr; + sg_maxLogLevel = (QnnLog_Level_t)0; + sg_logInitialized = false; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t LLaMAPackageValidateOpConfig(Qnn_OpConfig_t opConfig) { + if (std::string(sg_packageName) != opConfig.v1.packageName) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* auto-generated validation code below + * Check if op config type matches any registered ops + * If a match is found, check number of inputs, outputs and params + */ + if (std::string(opConfig.v1.typeName) == "IRoPE") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 4 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMALinear") { + if (opConfig.v1.numOfParams != 4 || opConfig.v1.numOfInputs != 3 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "SplitInput") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 2) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMAReLU") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMASuperSiLU") { + if (opConfig.v1.numOfParams != 3 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMAQuantize") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 1 
|| + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMAMul") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "KVCache") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "Attention") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 5 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "QLayerNorm") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 3 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMAAdd") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "CausalMask") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "HeadMatmul") { + if (opConfig.v1.numOfParams != 2 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "RoPE") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 4 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMADequantize") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 1 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "WNop") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 2) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "MergeOutput") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 4 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "RMSNorm") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "SiLU") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* + * additional validation code here + * */ + + return QNN_SUCCESS; +} + +/* The following three functions in this comment are not called by HTP backend + *for now, no auto-generated implementations are created. Users should see + *example for full function signatures. 
(version 1.3.0) Qnn_ErrorHandle_t + *LLaMAPackageCreateKernels (QnnOpPackage_GraphInfrastructure_t + * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_Kernel_t** + *kernels, uint32_t* numKernels) (version 1.3.0) Qnn_ErrorHandle_t + *LLaMAPackageFreeKernels (QnnOpPackage_Kernel_t* kernels) + * + * (version 1.4.0) Qnn_ErrorHandle_t LLaMAPackageCreateOpImpl + *(QnnOpPackage_GraphInfrastructure_t graphInfrastructure, QnnOpPackage_Node_t + *node, QnnOpPackage_OpImpl_t* opImpl) (version 1.4.0) Qnn_ErrorHandle_t + *LLaMAPackageFreeOpImpl (QnnOpPackage_OpImpl_t opImpl) + */ + +Qnn_ErrorHandle_t LLaMAPackageTerminate() { + if (!sg_packageInitialized) + return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + + sg_globalInfra = nullptr; + sg_packageInitialized = false; + return QNN_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif + +/* latest version */ +Qnn_ErrorHandle_t +LLaMAPackageInterfaceProvider(QnnOpPackage_Interface_t *interface) { + if (!interface) + return QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT; + interface->interfaceVersion = {1, 4, 0}; + interface->v1_4.init = LLaMAPackageInit; + interface->v1_4.terminate = LLaMAPackageTerminate; + interface->v1_4.getInfo = LLaMAPackageGetInfo; + interface->v1_4.validateOpConfig = LLaMAPackageValidateOpConfig; + interface->v1_4.createOpImpl = nullptr; + interface->v1_4.freeOpImpl = nullptr; + interface->v1_4.logInitialize = LLaMAPackageLogInitialize; + interface->v1_4.logSetLevel = LLaMAPackageLogSetLevel; + interface->v1_4.logTerminate = LLaMAPackageLogTerminate; + return QNN_SUCCESS; +} + +#ifdef __cplusplus +} +#endif diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/CausalMask.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/CausalMask.cpp new file mode 100644 index 000000000..3b0d84f5d --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/CausalMask.cpp @@ -0,0 +1,146 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +#define MASK_INFINITY 1e15 + +BEGIN_PKG_OP_DEFINITION(PKG_CausalMask); + +// op execute function declarations +template +GraphStatus causalmaskImpl(TensorType &out_0, const TensorType &in_0); + +// forward declaration of sample cost function +static float causalmaskCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((causalmaskImpl), "CausalMask") + */ +DEF_PACKAGE_OP((causalmaskImpl), "CausalMask") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((causalmaskImpl), + * "CausalMask", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((causalmaskImpl), + * "CausalMask", causalmaskCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +template +GraphStatus causalmaskImpl(TensorType &out_0, const TensorType &in_0) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
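+   *
+   * In the reference implementation below, out_0 takes in_0's dims. When
+   * w_in > 1, every element with d > w has MASK_INFINITY (1e15) subtracted,
+   * pushing the strictly upper-triangular part of each (w, d) slice to a
+   * large negative value; when w_in == 1 the input is copied through
+   * unchanged (note the memcpy assumes 4-byte float32 elements).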
+ */ + out_0.set_dims(in_0); + + int old_dim = 0; + + // NHSD + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + // S > 1 => mask + if (w_in > 1) { + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // CausalMask + for (Idx d = 0; d < d_in; d++) { + + float in_value = in_0(b, h, w, d); + + if (d > w + old_dim) + out_0(b, h, w, d) = in_value - MASK_INFINITY; + else + out_0(b, h, w, d) = in_value; + } + } + } + } + } else { + auto in_ptr = in_0.raw_data_const(); + auto out_ptr = out_0.raw_data(); + memcpy(out_ptr, in_ptr, b_in * h_in * w_in * d_in * 4); + } + + return GraphStatus::Success; +} + +__attribute__((unused)) static float causalmaskCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_CausalMask); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/HeadMatmul.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/HeadMatmul.cpp new file mode 100644 index 000000000..eeb83c00f --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/HeadMatmul.cpp @@ -0,0 +1,164 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_HeadMatmul); + +static Qnn_Scalar_t sg_opDefaultTranspose_In0Scalar = { + .dataType = Qnn_DataType_t::QNN_DATATYPE_BOOL_8, .bool8Value = false}; +static Qnn_Param_t sg_opDefaultTranspose_In0 = { + .paramType = QNN_PARAMTYPE_SCALAR, + .scalarParam = sg_opDefaultTranspose_In0Scalar}; +static Qnn_Scalar_t sg_opDefaultTranspose_In1Scalar = { + .dataType = Qnn_DataType_t::QNN_DATATYPE_BOOL_8, .bool8Value = false}; +static Qnn_Param_t sg_opDefaultTranspose_In1 = { + .paramType = QNN_PARAMTYPE_SCALAR, + .scalarParam = sg_opDefaultTranspose_In1Scalar}; + +// op execute function declarations +template +GraphStatus headmatmulImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, + const QuantUint16Tensor &transpose_in0, + const QuantUint16Tensor &transpose_in1); + +// forward declaration of sample cost function +static float headmatmulCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((headmatmulImpl), "HeadMatmul") + */ +DEF_PACKAGE_OP((headmatmulImpl), "HeadMatmul") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((headmatmulImpl), + * "HeadMatmul", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((headmatmulImpl), + * "HeadMatmul", headmatmulCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ +DEF_PACKAGE_PARAM_ORDER("HeadMatmul", "transpose_in0", false, + &sg_opDefaultTranspose_In0, "transpose_in1", false, + &sg_opDefaultTranspose_In1) + +/* execute functions for ops */ + +template +GraphStatus headmatmulImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, + const QuantUint16Tensor &transpose_in0, + const QuantUint16Tensor &transpose_in1) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + + auto transpose_in0_ = transpose_in0(0, 0, 0, 0); + auto transpose_in1_ = transpose_in1(0, 0, 0, 0); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + auto [b_in2, h_in2, w_in2, d_in2] = in_1.dims(); + + if (transpose_in0_ && transpose_in1_) { + + // Q KT head matmul + const size_t dims[] = {b_in, w_in, h_in, h_in}; + out_0.set_dims(dims); + debuglog("HeadMatmul execute... dims=(%zdx%zdx%zdx%zd)", out_0.dim(0), + out_0.dim(1), out_0.dim(2), out_0.dim(3)); + + } else if (transpose_in0_) { + + } else if (transpose_in1_) { + + // QKT V head matmul + const size_t dims[] = {b_in, w_in, h_in, d_in2}; + out_0.set_dims(dims); + debuglog("HeadMatmul execute... dims=(%zdx%zdx%zdx%zd)", out_0.dim(0), + out_0.dim(1), out_0.dim(2), out_0.dim(3)); + + // Todo out matrix needs transpose, we directly calculate the final + // dimensions. 
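+    // Note: only the two transpose combinations handled above (both flags
+    // set, and transpose_in1 only) configure out_0's dims; the
+    // transpose_in0-only and no-transpose branches below are currently empty.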
+ + } else { + } + + return GraphStatus::Success; +} + +__attribute__((unused)) static float headmatmulCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_HeadMatmul); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/KVCache.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/KVCache.cpp new file mode 100644 index 000000000..6407a0e39 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/KVCache.cpp @@ -0,0 +1,300 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_KVCache); + +// op execute function declarations +template +GraphStatus kvcacheImpl(TensorType &out_0, const TensorType &in_0, + const TensorType1 &seq_pos, const Tensor &hidden_dim); + +// forward declaration of sample cost function +static float kvcacheCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((kvcacheImpl), "KVCache") + */ +DEF_PACKAGE_OP((kvcacheImpl), "KVCache") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((kvcacheImpl), "KVCache", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((kvcacheImpl), "KVCache", kvcacheCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) 
+ * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ +DEF_PACKAGE_PARAM_ORDER("KVCache", "hidden_dim", true, nullptr) + +/* execute functions for ops */ + +// #ifndef REFERENCE_OP + +// #include "qhmath_hvx.h" +// #include "hvx_internal.h" +// #include +// #include + +// #define BLOCK_SIZE (8*1024/VLEN) /* vector chunks */ +// #define L2FETCH_AHEAD (BLOCK_SIZE) +// #define ONE 0x3F800000 +// #define M_ONE 0xAF800000 + +// int32_t hvx_memcpy_af(float *restrict input, float *restrict output, uint32_t +// size) +// { +// HVX_Vector *input_v_ptr; +// HVX_UVector *output_v_ptr; +// HVX_Vector slinep; +// HVX_Vector slinec; +// HVX_Vector sline; +// int32_t block, l2fetch_block; +// int32_t leftover = size & 31; +// int32_t vectors_in_rounddown = size / 32; +// int32_t leftover_size = leftover * sizeof(float); + +// /* Check input arguments. Return error status if some argument has +// invalid value */ if ((input == 0) || (output == 0) || (size == 0)) +// { +// return -1; +// } + +// input_v_ptr = (HVX_Vector *) input; +// output_v_ptr = (HVX_UVector *) output; + +// /* +// * If input data is not aligned to HVX vector size, compose aligned +// vectors +// * from data loaded in slinep and slinec +// */ +// slinep = *input_v_ptr++; + +// /* +// * Handle number of whole vectors in input data. +// * Don't process last vector in order to avoid out-of-boundary load. +// */ +// for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) +// { +// block = Q6_R_min_RR(i, BLOCK_SIZE); +// l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + +// if (l2fetch_block > 0) +// { +// l2fetch(input_v_ptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, +// 0); +// } + +// /* Process one vector at a time */ +// for (int32_t j = 0; j < block; ++j) +// { +// slinec = *input_v_ptr++; + +// /* Compose vector of input data from slinec and slinep */ +// sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + +// /* Store results to the output buffer and convert from qf32 to sf +// */ +// *((HVX_UVector *)(output_v_ptr++)) = sline; + +// /* Prepare slinep for next iteration */ +// slinep = slinec; +// } +// } + +// /* Handle last whole vector from input data */ +// if (vectors_in_rounddown > 0) +// { +// slinec = is_aligned(input_v_ptr, VLEN) && leftover == 0 ? 
slinep : +// *input_v_ptr++; sline = Q6_V_valign_VVR(slinec, slinep, (size_t) +// input); + +// /* Convert from qf32 to sf, store output and go to handle leftover */ +// *((HVX_UVector *)(output_v_ptr++)) = sline; + +// slinep = slinec; +// } + +// /* Handle leftover elements */ +// if (leftover > 0) +// { +// slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) +// ? slinep +// : *input_v_ptr++); + +// sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + +// /* Store output */ +// vstu_variable(output_v_ptr, leftover_size, sline); +// } + +// return 0; +// } + +// template +// GraphStatus kvcacheImpl(TensorType& out_0, +// const TensorType& in_0, +// const TensorType1 &seq_pos, +// const Tensor& hidden_dim) + +// { +// /* +// * add code here +// * */ +// /* +// * To have good performance and stability, it is required to avoid heap +// memory +// * allocation in this function. The heap memory allocation includes but not +// * limited to calling malloc, operator new, constructing STL container +// objects +// * like std::vector with default allocator, and adding items like calling +// * std::vector::push_back to STL container objects with default allocator. +// * +// * Please check in SDK documentation for more information. +// */ + +// out_0.set_dims(in_0); +// auto [b_in, h_in, w_in, d_in] = in_0.dims(); + +// uint32_t seq_pos_ = seq_pos(0,0,0,0); +// // uint32_t hidden_dim_ = hidden_dim(0,0,0,0); + +// // // const size_t dims[] = {b_in, h_in, seq_pos_+1, hidden_dim_}; +// // // out_0.set_dims(dims); + +// // NSHD + +// auto in_ptr = (float*)in_0.raw_data_const(); +// auto out_ptr = (float*)out_0.raw_data(); + +// out_ptr += seq_pos_ * h_in * w_in * d_in; + +// hvx_memcpy_af(out_ptr, in_ptr, h_in * w_in * d_in); + +// return GraphStatus::Success; +// } + +// #else + +template +GraphStatus kvcacheImpl(TensorType &out_0, const TensorType &in_0, + const TensorType1 &seq_pos, const Tensor &hidden_dim) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
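+   *
+   * The implementation below grows the cache along dimension 1: out_0 is
+   * sized to {b, h + seq_pos, w, d} and the incoming h_in rows are copied
+   * starting at element offset seq_pos * w * d, with the element size chosen
+   * from the tensor dtype (QUInt8 -> 1 byte, Float16 -> 2, Float32 -> 4).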
+ */ + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + uint32_t seq_pos_ = seq_pos(0, 0, 0, 0); + const size_t dims[] = {b_in, h_in + seq_pos_, w_in, d_in}; + + out_0.set_dims(dims); + + // uint32_t hidden_dim_ = hidden_dim(0,0,0,0); + + // // const size_t dims[] = {b_in, h_in, seq_pos_+1, hidden_dim_}; + // // out_0.set_dims(dims); + + // NSHD + + DType dtype = in_0.get_dtype(); + + const uint8_t *in_ptr = (uint8_t *)in_0.raw_data_const(); + uint8_t *out_ptr = (uint8_t *)out_0.raw_data(); + + if (dtype == DType::QUInt8) { + + out_ptr += seq_pos_ * w_in * d_in; + memcpy(out_ptr, in_ptr, h_in * w_in * d_in * sizeof(uint8_t)); + + } else if (dtype == DType::Float16) { + + out_ptr += seq_pos_ * w_in * d_in * sizeof(float) / 2; + memcpy(out_ptr, in_ptr, h_in * w_in * d_in * sizeof(float) / 2); + } else if (dtype == DType::Float32) { + + out_ptr += seq_pos_ * w_in * d_in * sizeof(float); + memcpy(out_ptr, in_ptr, h_in * w_in * d_in * sizeof(float)); + } + + return GraphStatus::Success; +} + +// #endif + +__attribute__((unused)) static float kvcacheCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_KVCache); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAAdd.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAAdd.cpp new file mode 100644 index 000000000..ee8491416 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAAdd.cpp @@ -0,0 +1,254 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_LLaMAAdd); + +// op execute function declarations +template +GraphStatus llamaaddImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1); + +// forward declaration of sample cost function +static float llamaaddCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((llamaaddImpl), "LLaMAAdd") + */ +DEF_PACKAGE_OP((llamaaddImpl), "LLaMAAdd") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((llamaaddImpl), + * "LLaMAAdd", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((llamaaddImpl), + * "LLaMAAdd", llamaaddCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +int32_t hvx_add_af(float *restrict input, float *restrict input2, + float *restrict output, uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)input2; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + + // HVX_Vector v128 = Q6_Vb_vsplat_R(0x80808080u); + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + sline2p = *iptr2++; + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + // Our add consider uint8->int8 bugs from QNN. + // sline2 = Q6_Vb_vsub_VbVb(sline2, v128); + *optr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sline1, sline2)); + + sline1p = sline1c; + sline2p = sline2c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? 
sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + // sline2 = Q6_Vb_vsub_VbVb(sline2, v128); + *optr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sline1, sline2)); + } + + // Handle leftover elements. + if (leftover_size > 0) { + sline1c = (is_in_one_chunk(iptr, leftover_size, VLEN) ? sline1p : *iptr++); + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = + (is_in_one_chunk(iptr2, leftover_size, VLEN) ? sline2p : *iptr2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + // sline2 = Q6_Vb_vsub_VbVb(sline2, v128); + vstu_variable(optr, leftover_size, + Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sline1, sline2))); + } + + return 0; +} + +template +GraphStatus llamaaddImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + + out_0.set_dims(in_0); + + auto in_ptr = (float *)in_0.raw_data_const(); + auto in2_ptr = (float *)in_1.raw_data_const(); + auto out_ptr = (float *)out_0.raw_data(); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + size_t size = b_in * h_in * w_in * d_in; + + hvx_add_af(in_ptr, in2_ptr, out_ptr, size); + + return GraphStatus::Success; +} + +#else + +template +GraphStatus llamaaddImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
+ */ + out_0.set_dims(in_0); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // mul + for (Idx d = 0; d < d_in; d++) { + float inval = in_0(b, h, w, d); + float inval2 = in_1(b, h, w, d); + float outval = inval + inval2; + + out_0(b, h, w, d) = outval; + } + } + } + } + + return GraphStatus::Success; +} + +#endif + +__attribute__((unused)) static float llamaaddCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_LLaMAAdd); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMALinear.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMALinear.cpp new file mode 100644 index 000000000..ba9ba5d95 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMALinear.cpp @@ -0,0 +1,209 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_LLaMALinear); + +// op execute function declarations +template +GraphStatus llamalinearImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, const TensorType &in_2, + const PlainFloatTensor &in_scale, + const PlainFloatTensor &weight_scale, + const PlainFloatTensor &bias_scale, + const PlainFloatTensor &output_scale); + +// forward declaration of sample cost function +static float llamalinearCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((llamalinearImpl), "LLaMALinear") + */ +DEF_PACKAGE_OP((llamalinearImpl), "LLaMALinear") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((llamalinearImpl), + * "LLaMALinear", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((llamalinearImpl), + * "LLaMALinear", llamalinearCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ +DEF_PACKAGE_PARAM_ORDER("LLaMALinear", "in_scale", true, nullptr, + "weight_scale", true, nullptr, "bias_scale", true, + nullptr, "output_scale", true, nullptr) + +/* execute functions for ops */ + +float Round(float num) { + float floor_num = floor(num); + float ceil_num = ceil(num); + + if (num - floor_num < ceil_num - num) { + return floor_num; + } else { + return ceil_num; + } +} + +template +GraphStatus llamalinearImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, const TensorType &in_2, + const PlainFloatTensor &in_scale, + const PlainFloatTensor &weight_scale, + const PlainFloatTensor &bias_scale, + const PlainFloatTensor &output_scale) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
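+   *
+   * As a minimal illustrative sketch of the pattern this file follows (the
+   * scratch size below is hypothetical, not part of the generated template):
+   * prefer fixed-size stack storage and the tensors' own raw buffers over
+   * heap-backed containers, e.g.
+   *
+   *   float scratch[128];                               // stack, no allocation
+   *   auto *src = (const uint8_t *)in_0.raw_data_const();
+   *   auto *dst = (int8_t *)out_0.raw_data();           // write in place
+   *
+   * rather than constructing std::vector<float> scratch(n); in the hot path.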
+   */
+  // Assume the input tensor is 4-dimensional, in NHWC format
+  int batch_size = in_0.dims()[0];
+  int height = in_0.dims()[1];
+  int width = in_0.dims()[2];
+  int in_features = in_0.dims()[3];  // number of input channels
+  int out_features = in_1.dims()[3]; // number of output features (i.e. output channels)
+
+  // Check that the input tensor shapes match
+  if (in_1.dims()[0] != 1 || in_1.dims()[1] != 1 ||
+      in_1.dims()[2] != in_features || in_2.dims()[3] != out_features) {
+    return GraphStatus::ErrorFatal;
+  }
+
+  // Fetch the quantization scales
+  float w_scale = weight_scale(0, 0, 0, 0);
+  float i_scale = in_scale(0, 0, 0, 0);
+  float b_scale = bias_scale(0, 0, 0, 0);
+  float o_scale = output_scale(0, 0, 0, 0);
+
+  // Initialize the output tensor
+
+  size_t dims[] = {static_cast<size_t>(batch_size), static_cast<size_t>(height),
+                   static_cast<size_t>(width),
+                   static_cast<size_t>(out_features)};
+  out_0.set_dims(dims);
+
+  // only support float bias now.
+  auto in0_ptr = (uint8_t *)in_0.raw_data_const();
+  auto in1_ptr = (uint8_t *)in_1.raw_data_const();
+  auto in2_ptr = (uint8_t *)in_2.raw_data_const();
+  auto out_ptr = (int8_t *)out_0.raw_data();
+
+  // Perform the quantized Linear multiplication
+  for (int b = 0; b < batch_size; ++b) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        for (int n = 0; n < out_features; ++n) {
+          float acc = 0;
+          for (int k = 0; k < in_features; ++k) {
+            int in_index = b * height * width * in_features +
+                           h * width * in_features + w * in_features + k;
+            int weight_index = k * out_features + n;
+            acc +=
+              ((static_cast<float>(in0_ptr[in_index]) - 128) * i_scale) *
+              ((static_cast<float>(in1_ptr[weight_index]) - 128) * w_scale);
+          }
+          // Add the dequantized bias
+          float result = acc;
+          result += (static_cast<float>(in2_ptr[n]) - 128) * b_scale;
+          // Clamp the result to the quantized 8-bit range
+          int out_index = b * height * width * out_features +
+                          h * width * out_features + w * out_features + n;
+
+          result = Round(result / o_scale);
+
+          long v = lroundf(result);
+
+          if (v > 127)
+            v = 127;
+
+          if (v < -128)
+            v = -128;
+
+          if (out_0.get_dtype() == DType::QUInt8)
+            v += 128;
+
+          out_ptr[out_index] = static_cast<int8_t>(v);
+        }
+      }
+    }
+  }
+
+  return GraphStatus::Success;
+}
+
+__attribute__((unused)) static float llamalinearCostFunc(const Op *op) {
+  /*
+   * add code here
+   * */
+
+  float cost = 0.0; // add cost computation here
+  return cost;
+}
+
+/* At the bottom of the op file, call END_PKG_OP_DEFINITION(<name>),
+   where <name> is as BEGIN_PKG_OP_DEFINITION
+*/
+END_PKG_OP_DEFINITION(PKG_LLaMALinear);
diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAMul.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAMul.cpp
new file mode 100644
index 000000000..3deef796c
--- /dev/null
+++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAMul.cpp
@@ -0,0 +1,344 @@
+//==============================================================================
+// Auto Generated Code for LLaMAPackage
+//==============================================================================
+
+#include "HTP/core/constraints.h"
+#include "HTP/core/op_package_feature_support.h"
+#include "HTP/core/op_register_ext.h"
+#include "HTP/core/optimize.h"
+#include "HTP/core/simple_reg.h"
+#include "QnnOpPackage.h"
+
+BEGIN_PKG_OP_DEFINITION(PKG_LLaMAMul);
+
+// op execute function declarations
+template <typename TensorType>
+GraphStatus llamamulImpl(TensorType &out_0, const TensorType &in_0,
+                         const TensorType &in_1);
+
+// forward declaration of sample cost function
+static float llamamulCostFunc(const Op *op);
+
+/*
+ * method 1 for defining op, using default cost value (i.e. GLACIAL) and default
+ * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g.
+ * DEF_PACKAGE_OP((llamamulImpl), "LLaMAMul") + */ +DEF_PACKAGE_OP((llamamulImpl), "LLaMAMul") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((llamamulImpl), + * "LLaMAMul", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((llamamulImpl), + * "LLaMAMul", llamamulCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +int32_t hvx_mul_af(float *restrict input, float *restrict input2, + float *restrict output, uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)input2; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + sline2p = *iptr2++; + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if 
(l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + *optr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline1, sline2)); + + sline1p = sline1c; + sline2p = sline2c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + *optr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline1, sline2)); + } + + // Handle leftover elements. + if (leftover_size > 0) { + sline1c = (is_in_one_chunk(iptr, leftover_size, VLEN) ? sline1p : *iptr++); + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = + (is_in_one_chunk(iptr2, leftover_size, VLEN) ? sline2p : *iptr2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + vstu_variable(optr, leftover_size, + Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline1, sline2))); + } + + return 0; +} + +int32_t hvx_mul_ahf(__fp16 *restrict input, __fp16 *restrict input2, + __fp16 *restrict output, uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)input2; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 64; + int32_t leftover_size = leftover * sizeof(__fp16); + + sline1p = *iptr++; + sline2p = *iptr2++; + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + *optr++ = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(sline1, sline2)); + + sline1p = sline1c; + sline2p = sline2c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + *optr++ = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(sline1, sline2)); + } + + // Handle leftover elements. + if (leftover_size > 0) { + sline1c = (is_in_one_chunk(iptr, leftover_size, VLEN) ? sline1p : *iptr++); + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = + (is_in_one_chunk(iptr2, leftover_size, VLEN) ? 
sline2p : *iptr2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + vstu_variable(optr, leftover_size, + Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(sline1, sline2))); + } + + return 0; +} + +template +GraphStatus llamamulImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + out_0.set_dims(in_0); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + size_t size = b_in * h_in * w_in * d_in; + + DType dtype = in_0.get_dtype(); + + if (dtype == DType::Float16) { + auto in_ptr = (__fp16 *)in_0.raw_data_const(); + auto in2_ptr = (__fp16 *)in_1.raw_data_const(); + auto out_ptr = (__fp16 *)out_0.raw_data(); + + hvx_mul_ahf(in_ptr, in2_ptr, out_ptr, size); + + } else { + auto in_ptr = (float *)in_0.raw_data_const(); + auto in2_ptr = (float *)in_1.raw_data_const(); + auto out_ptr = (float *)out_0.raw_data(); + + hvx_mul_af(in_ptr, in2_ptr, out_ptr, size); + } + + return GraphStatus::Success; +} + +#else + +template +GraphStatus llamamulImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
+ */ + out_0.set_dims(in_0); + + DType dtype = in_0.get_dtype(); + + auto out_ptr = (__fp16 *)out_0.raw_data(); + auto in_ptr = (__fp16 *)in_0.raw_data_const(); + auto in_ptr2 = (__fp16 *)in_1.raw_data_const(); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // mul + for (Idx d = 0; d < d_in; d++) { + + if (dtype == DType::Float16) { + + __fp16 inval = *in_ptr++; + __fp16 inval2 = *in_ptr2++; + __fp16 outval = inval * inval2; + + *out_ptr++ = outval; + } + + if (dtype == DType::Float32) { + float inval = in_0(b, h, w, d); + float inval2 = in_1(b, h, w, d); + float outval = inval * inval2; + + out_0(b, h, w, d) = outval; + } + } + } + } + } + + return GraphStatus::Success; +} + +#endif + +__attribute__((unused)) static float llamamulCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_LLaMAMul); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAReLU.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAReLU.cpp new file mode 100644 index 000000000..9e7d48465 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAReLU.cpp @@ -0,0 +1,297 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_LLaMAReLU); + +// op execute function declarations +template +GraphStatus llamareluImpl(TensorType &out_0, const TensorType &in_0); + +// forward declaration of sample cost function +static float llamareluCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((llamareluImpl), "LLaMAReLU") + */ +DEF_PACKAGE_OP((llamareluImpl), "LLaMAReLU") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((llamareluImpl), + * "LLaMAReLU", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((llamareluImpl), + * "LLaMAReLU", llamareluCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +// #ifndef REFERENCE_OP + +// #include "qhmath_hvx.h" +// #include "hvx_internal.h" +// #include +// #include + +// #define BLOCK_SIZE (8*1024/VLEN) /* vector chunks */ +// #define L2FETCH_AHEAD (BLOCK_SIZE) +// #define ONE 0x3F800000 +// #define M_ONE 0xAF800000 + +// int32_t hvx_relu_au8(uint8_t *restrict input, uint8_t *restrict output, +// uint32_t size) +// { +// HVX_Vector *input_v_ptr; +// HVX_UVector *output_v_ptr; +// HVX_Vector slinep; +// HVX_Vector slinec; +// HVX_Vector sline; +// int32_t block, l2fetch_block; +// int32_t leftover = size & 128; +// int32_t vectors_in_rounddown = size / 128; +// int32_t leftover_size = leftover * sizeof(uint8_t); + +// /* Check input arguments. Return error status if some argument has +// invalid value */ if ((input == 0) || (output == 0) || (size == 0)) +// { +// return -1; +// } + +// input_v_ptr = (HVX_Vector *) input; +// output_v_ptr = (HVX_UVector *) output; + +// HVX_Vector vO = Q6_Vb_vsplat_R(0x80808080u); + +// /* +// * If input data is not aligned to HVX vector size, compose aligned +// vectors +// * from data loaded in slinep and slinec +// */ +// slinep = *input_v_ptr++; + +// /* +// * Handle number of whole vectors in input data. +// * Don't process last vector in order to avoid out-of-boundary load. 
+// */ +// for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) +// { +// block = Q6_R_min_RR(i, BLOCK_SIZE); +// l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + +// if (l2fetch_block > 0) +// { +// l2fetch(input_v_ptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, +// 0); +// } + +// /* Process one vector at a time */ +// for (int32_t j = 0; j < block; ++j) +// { +// slinec = *input_v_ptr++; + +// /* Compose vector of input data from slinec and slinep */ +// sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + +// /* Store results to the output buffer and convert from qf32 to sf +// */ +// *((HVX_UVector *)(output_v_ptr++)) = Q6_Vub_vmax_VubVub(vO, +// sline); + +// /* Prepare slinep for next iteration */ +// slinep = slinec; +// } +// } + +// /* Handle last whole vector from input data */ +// if (vectors_in_rounddown > 0) +// { +// slinec = is_aligned(input_v_ptr, VLEN) && leftover == 0 ? slinep : +// *input_v_ptr++; sline = Q6_V_valign_VVR(slinec, slinep, (size_t) +// input); + +// /* Convert from qf32 to sf, store output and go to handle leftover */ +// *((HVX_UVector *)(output_v_ptr++)) = Q6_Vub_vmax_VubVub(vO, sline); + +// slinep = slinec; +// } + +// /* Handle leftover elements */ +// if (leftover > 0) +// { +// slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) +// ? slinep +// : *input_v_ptr++); + +// sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + +// /* Store output */ +// vstu_variable(output_v_ptr, leftover_size, Q6_Vub_vmax_VubVub(vO, +// sline)); +// } + +// return 0; +// } + +// template +// GraphStatus llamareluImpl(TensorType& out_0, +// const TensorType& in_0) + +// { +// /* +// * add code here +// * */ +// /* +// * To have good performance and stability, it is required to avoid heap +// memory +// * allocation in this function. The heap memory allocation includes but not +// * limited to calling malloc, operator new, constructing STL container +// objects +// * like std::vector with default allocator, and adding items like calling +// * std::vector::push_back to STL container objects with default allocator. +// * +// * Please check in SDK documentation for more information. 
+// */ + +// out_0.set_dims(in_0); + +// const auto [bIn, hIn, wIn, dIn] = in_0.dims(); + +// auto in_ptr = (uint8_t*)in_0.raw_data_const(); +// auto out_ptr = (uint8_t*)out_0.raw_data(); + +// hvx_relu_au8(out_ptr, in_ptr, bIn * hIn * wIn * dIn * sizeof (uint8_t)); + +// return GraphStatus::Success; +// } +// #else +template +GraphStatus llamareluImpl(TensorType &out_0, const TensorType &in_0) + +{ + out_0.set_dims(in_0); + // NHWC + + if (in_0.get_dtype() == DType::QUInt8) { + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // SiLU + for (Idx d = 0; d < d_in; d++) { + uint8_t inval = in_0(b, h, w, d); + if (inval < 0) + inval = 0; + + out_0(b, h, w, d) = inval; + } + } + } + } + } else if (in_0.get_dtype() == DType::Float16) { + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + auto out_ptr = (__fp16 *)out_0.raw_data(); + auto in_ptr = (__fp16 *)in_0.raw_data_const(); + + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + + for (Idx d = 0; d < d_in; d++) { + __fp16 inval = *in_ptr++; + if (inval < 0) + inval = 0; + + *out_ptr++ = inval; + } + } + } + } + } else if (in_0.get_dtype() == DType::Float32) { + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + for (Idx d = 0; d < d_in; d++) { + float inval = in_0(b, h, w, d); + if (inval < 0) + inval = 0; + + out_0(b, h, w, d) = inval; + } + } + } + } + } + + return GraphStatus::Success; +} + +// #endif + +__attribute__((unused)) static float llamareluCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_LLaMAReLU); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMASuperSiLU.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMASuperSiLU.cpp new file mode 100644 index 000000000..7e27c5061 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMASuperSiLU.cpp @@ -0,0 +1,1368 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_LLaMASuperSiLU); + +// op execute function declarations +template +GraphStatus llamasupersiluImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, + const PlainFloatTensor &a_scale, + const PlainFloatTensor &b_scale, + const PlainFloatTensor &o_scale); + +// forward declaration of sample cost function +static float llamasupersiluCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((llamasupersiluImpl), "LLaMASuperSiLU") + */ +DEF_PACKAGE_OP((llamasupersiluImpl), "LLaMASuperSiLU") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) 
can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((llamasupersiluImpl), + * "LLaMASuperSiLU", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. + * DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((llamasupersiluImpl), + * "LLaMASuperSiLU", llamasupersiluCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ +DEF_PACKAGE_PARAM_ORDER("LLaMASuperSiLU", "a_scale", true, nullptr, "b_scale", + true, nullptr, "o_scale", true, nullptr) + +/* execute functions for ops */ + +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +#define FP16_MANTISA 10 +#define FP16_EXPONENT_MASK 0x1f +#define FP16_EXPONENT_BIAS 0xf +#define FP16_MANTISA_MASK 0x000003ff +#define FP16_SIGN 15 +#define FP16_NEG_1 0xbc00 +#define ROUND_2_SCALE 22 +#define ROUND_SCALSE ((1 << ROUND_2_SCALE) * 1.0f) + +static inline int32_t float_to_fp16s(float input) { + union { + int32_t i; + __fp16 f[2]; + } fp32 = {.f = {(__fp16)input, (__fp16)input}}; + return fp32.i; +} + +static HVX_INLINE_ALWAYS uint32_t float_to_bits(float x) { + union { + float f; + uint32_t i; + } fp32 = {.f = x}; + return fp32.i; +} + +static const float fp16_c0_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.13239719960243818, + 0.2216255210749415, + 0.3447664743728659, + 0.48137452032585476, + 0.5716299228719798, + 0.5547323231605259, + 0.5046287748870234, + 
0.4999985574626892, + 0.5000036514755082, + 0.49475652448004626, + 0.4441393352532763, + 0.428500379952032, + 0.5173297285470642, + 0.6541461039833616, + 0.7783931007462818, + 0.8678015179911097, +}; +static const float fp16_c1_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.05928005756790343, + 0.11063222460270064, + 0.1932879057003057, + 0.30302440212086995, + 0.3922924462181049, + 0.36546332659415875, + 0.2644148210990377, + 0.24989020912329707, + 0.2498532691910313, + 0.2661055781198988, + 0.36728015359480604, + 0.39215270010450015, + 0.3041825601732039, + 0.1940762094668647, + 0.11061794856987572, + 0.059174800917353595, +}; +static const float fp16_c2_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.010145494303219278, + 0.02123968384425681, + 0.04207468332514667, + 0.07519946712591977, + 0.10840620196267145, + 0.09270738184406795, + 0.015322371881818012, + -0.0009948273994921822, + 0.0011544907060402412, + -0.017040517565094934, + -0.09379878876657094, + -0.10835043868732394, + -0.07558705272699548, + -0.04228875316413285, + -0.021235740718738055, + -0.010124599879590107, +}; +static const float fp16_c3_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0007841223015974933, + 0.001850453397354219, + 0.004187899308371771, + 0.008640952434084206, + 0.01414741414964877, + 0.010117749275618, + -0.01654848996354919, + -0.02395108399453624, + -0.024199111971064446, + -0.015783556879607072, + 0.010407672131558174, + 0.014137608186323335, + 0.008698510795258909, + 0.004213708431213342, + 0.0018499827774393985, + 0.0007822799742289481, +}; +static const float fp16_c4_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.3031641204975905e-05, + 6.150442488966733e-05, + 0.00015997783736818624, + 0.00038491646239693526, + 0.0007283649599237781, + 0.00034439150914392054, + -0.003142246198646662, + -0.004120389580321761, + 0.004246050162553198, + 0.0030162727520777893, + -0.00037312974308425725, + -0.0007277242855014247, + -0.00038811687679772674, + -0.0001611434776868886, + -6.14837984586862e-05, + -2.297076123375133e-05, +}; + +int32_t hvx_supersilu_ahf(uint8_t *restrict input, uint8_t *restrict input2, + uint8_t *restrict output, float a_scale, + float b_scale, float o_scale, uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)input2; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + + int32_t block, l2fetch_block; + int32_t leftover = size & 128; + int32_t vectors_in_rounddown = size / 128; + // int32_t leftover_size = leftover * sizeof(__fp16); + + sline1p = *iptr++; + sline2p = *iptr2++; + + // dequantize + uint32_t convert = 0x00800080; + HVX_Vector convert_vector = Q6_V_vsplat_R(convert); + + HVX_Vector a_scale_vec = Q6_V_vsplat_R(float_to_fp16s(a_scale)); + HVX_Vector b_scale_vec = Q6_V_vsplat_R(float_to_fp16s(b_scale)); + HVX_Vector zero_v_sf = Q6_V_vzero(); + + // silu + HVX_Vector input_min_v_hf; + HVX_Vector input_shifted_v_hf; + HVX_Vector 
input_scaled_v; + HVX_VectorPair input_vp_qf32; + // HVX_Vector input_v_qf16; + HVX_Vector mask_idx1_v, mask_idx2_v; + HVX_Vector const16_0_v_hf; + HVX_Vector zero_v_hf, one_v_hf; + HVX_Vector tmp_v; + HVX_Vector idx1_v, idx2_v; + HVX_Vector scale_v; + HVX_DV output_dv; + HVX_DV c0_coeff_dv; + HVX_VectorPair c0_coeff_vp; + HVX_Vector c0_coeff_v; + HVX_DV c1_coeff_dv; + HVX_VectorPair c1_coeff_vp; + HVX_Vector c1_coeff_v; + HVX_DV c2_coeff_dv; + HVX_VectorPair c2_coeff_vp; + HVX_Vector c2_coeff_v; + HVX_DV c3_coeff_dv; + HVX_VectorPair c3_coeff_vp; + HVX_Vector c3_coeff_v; + HVX_DV c4_coeff_dv; + HVX_VectorPair c4_coeff_vp; + HVX_Vector c4_coeff_v; + + scale_v = Q6_Vh_vsplat_R(0x3bfe); + + /* Vector of ones used as mpy neutral element in conversions from hf vector to + * qf32 vector pair */ + one_v_hf = Q6_Vh_vsplat_R(0x3c00); + + /* + * Vector of zeroes used as neutral element in hf to qf16 conversions. + * NOTE: Some of conversions (i.e conversion of scale factor and coefficients) + * can be avoided in real-time, but this is not done in order to don't + * sacrify code readibility in expense of insignificant performance + * improvement. + */ + zero_v_hf = Q6_V_vzero(); + + /* Mask for extracting only 4 bits of mantissa */ + mask_idx1_v = Q6_Vh_vsplat_R(0x000F); + + mask_idx2_v = Q6_V_vsplat_R(0x00001010); + + /* 16.0 in IEEE 16-bit floating-point representation */ + const16_0_v_hf = Q6_Vh_vsplat_R(0x4c00); + + /* + * Prepare vector of input_min values, that is used later in shifting input + * range. input_min is low boundary of specified input range. + */ + input_min_v_hf = Q6_Vh_vsplat_R(0xc800); + + /* Convert scale factor from hf to q16. Use the same vector for both formats + */ + scale_v = Q6_Vqf16_vadd_VhfVhf(scale_v, zero_v_hf); + + /* Load coefficients */ + c0_coeff_v = *((HVX_Vector *)(fp16_c0_coeffs)); + c1_coeff_v = *((HVX_Vector *)(fp16_c1_coeffs)); + c2_coeff_v = *((HVX_Vector *)(fp16_c2_coeffs)); + c3_coeff_v = *((HVX_Vector *)(fp16_c3_coeffs)); + c4_coeff_v = *((HVX_Vector *)(fp16_c4_coeffs)); + + /* Convert coefficients from hf to qf32 format. Use the same vector for both + * representations */ + c0_coeff_v = Q6_Vqf32_vadd_VsfVsf(c0_coeff_v, zero_v_hf); + c1_coeff_v = Q6_Vqf32_vadd_VsfVsf(c1_coeff_v, zero_v_hf); + c2_coeff_v = Q6_Vqf32_vadd_VsfVsf(c2_coeff_v, zero_v_hf); + c3_coeff_v = Q6_Vqf32_vadd_VsfVsf(c3_coeff_v, zero_v_hf); + c4_coeff_v = Q6_Vqf32_vadd_VsfVsf(c4_coeff_v, zero_v_hf); + + /* Split 32-bit coefficients to lower and upper part in order to obtain them + * later with VLUT16. */ + c0_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c0_coeff_v); + c1_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c1_coeff_v); + c2_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c2_coeff_v); + c3_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c3_coeff_v); + c4_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c4_coeff_v); + + // quantize + HVX_Vector low_level_vec, high_level_vec, o_scale_vec, es_vec, + round_scale_vec; + HVX_Vector uintconvert = Q6_V_vsplat_R(0x80808080); + HVX_Vector vmb = Q6_V_vsplat_R(0x40004000); + + float post_scale_flt = a_scale * b_scale * o_scale; + int scexp = flt_getexp(post_scale_flt); + int rsh = min_i32(-scexp, 7); // e.g. 
0.11 -> 0.88, rsh = 3 + float rsh_fac = flt_power2(rsh); + + int adj_bias = roundf_i32(128 * rsh_fac); + adj_bias = Q6_R_combine_RlRl(adj_bias, adj_bias); + + HVX_Vector vadj = Q6_V_vsplat_R(adj_bias); + + float es = 0.5; + low_level_vec = Q6_V_vsplat_R(float_to_fp16s(-128.0f)); + high_level_vec = Q6_V_vsplat_R(float_to_fp16s(127.0f)); + o_scale_vec = + Q6_V_vsplat_R(float_to_fp16s(post_scale_flt * rsh_fac * (1 << 15))); + // one_vec = Q6_V_vsplat_R(float_to_fp16s(1.0f)); + // o_scale_vec = Q6_Vqf16_vadd_VhfVhf(o_scale_vec, zero_v_hf); + es_vec = Q6_V_vsplat_R(float_to_fp16s(es)); + round_scale_vec = Q6_V_vsplat_R(float_to_bits(ROUND_SCALSE)); + + es_vec = Q6_Vqf16_vadd_VhfVhf(es_vec, zero_v_sf); + round_scale_vec = Q6_Vqf32_vadd_VsfVsf(round_scale_vec, zero_v_sf); + + HVX_Vector expmask = Q6_Vh_vsplat_R(FP16_EXPONENT_MASK); + HVX_Vector expbias = Q6_Vh_vsplat_R(FP16_EXPONENT_BIAS); + HVX_Vector manmask = Q6_Vh_vsplat_R(FP16_MANTISA_MASK); + HVX_Vector exp23 = Q6_Vh_vsplat_R(23 - 1); + HVX_Vector exp0 = Q6_Vh_vsplat_R(0 - 1); + HVX_Vector negone = Q6_Vh_vsplat_R(FP16_NEG_1); + HVX_Vector zero = Q6_V_vzero(); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + HVX_Vector sline1_high; + HVX_Vector sline1_low; + // HVX_Vector sline2_high; + // HVX_Vector sline2_low; + + { + // dequantize sline1 qf16 + HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline1, zero_v_sf); + + temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), convert_vector); + HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + sline1_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), a_scale_vec); + sline1_low = Q6_Vhf_equals_Vqf16(sline1_low); + sline1_high = + Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), a_scale_vec); + sline1_high = Q6_Vhf_equals_Vqf16(sline1_high); + } + + // { + // // dequantize sline2 qf16 + // HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline2, zero_v_sf); + + // temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + // HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), convert_vector); + // HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + // sline2_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), + // b_scale_vec); sline2_low = Q6_Vhf_equals_Vqf16(sline2_low); + // sline2_high = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), + // b_scale_vec); sline2_high = Q6_Vhf_equals_Vqf16(sline2_high); + // } + + { + // silu sline1_low + tmp_v = Q6_Vh_vdeal_Vh(sline1_low); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. 
Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry + * information about integer index. Use the same input_scaled_v vector + * for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's + * method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_low, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + // output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), + // output_dv.V.lo); output_dv.V.hi = + // Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_low = Q6_Vhf_equals_Wqf32(output_dv.VV); + 
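+
+        // The trailing x * sigmoid(x) multiply stays commented out in this
+        // path, presumably because the "uint8 mul" block below forms the
+        // integer product (a - 128) * (b - 128), which already carries both
+        // x and the gate operand; only sigmoid(x) is carried forward here.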
} + + { + // silu sline1_high + tmp_v = Q6_Vh_vdeal_Vh(sline1_high); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry + * information about integer index. Use the same input_scaled_v vector + * for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's + * method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_high, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + 
output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + // output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), + // output_dv.V.lo); output_dv.V.hi = + // Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_high = Q6_Vhf_equals_Wqf32(output_dv.VV); + } + + HVX_Vector sline_high; + HVX_Vector sline_low; + + // { + // // mul + // sline_high = Q6_Vqf16_vmpy_VhfVhf(sline1_high, sline2_high); + // sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, sline2_low); + + // sline_high = Q6_Vhf_equals_Vqf16(sline_high); + // sline_low = Q6_Vhf_equals_Vqf16(sline_low); + // } + + HVX_VectorPair mul_output; + { + // uint8 mul + // (a-128)*(b-128) = a*b - 128 (a+b) + 128*128 + HVX_VectorPair prod1 = + Q6_Wuh_vmpyacc_WuhVubVub(Q6_W_vcombine_VV(vmb, vmb), sline1, sline2); + HVX_VectorPair prod2 = + Q6_Wh_vmpa_WubRub(Q6_W_vcombine_VV(sline2, sline1), 0x80808080); + mul_output = Q6_Wh_vsub_WhWh(prod1, prod2); + + mul_output = + Q6_W_vshuff_VVR(Q6_V_hi_W(mul_output), Q6_V_lo_W(mul_output), -2); + + // sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, + // Q6_Vhf_equals_Vh(Q6_V_lo_W(mul_output))); sline_high = + // Q6_Vqf16_vmpy_VhfVhf(sline1_high, + // Q6_Vhf_equals_Vh(Q6_V_hi_W(mul_output))); + } + + { + // scaling quantize + sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, o_scale_vec); + sline_low = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_low)); + sline_low = Q6_Vh_vadd_VhVh_sat( + Q6_Vh_vmpy_VhVh_s1_rnd_sat(Q6_V_lo_W(mul_output), sline_low), vadj); + + sline_high = Q6_Vqf16_vmpy_VhfVhf(sline1_high, o_scale_vec); + sline_high = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_high)); + sline_high = Q6_Vh_vadd_VhVh_sat( + Q6_Vh_vmpy_VhVh_s1_rnd_sat(sline_high, Q6_V_hi_W(mul_output)), vadj); + + HVX_Vector sout = Q6_Vub_vasr_VhVhR_rnd_sat(sline_high, sline_low, rsh); + sout = Q6_Vb_vdeal_Vb(sout); + *optr++ = sout; + } + + // { + // // quantize + // HVX_Vector sout1 = Q6_Vqf16_vmpy_Vqf16Vhf(sline_low, o_scale_vec); + // sout1 = Q6_Vqf16_vadd_Vqf16Vqf16(sout1, es_vec); + // sout1 = Q6_Vhf_equals_Vqf16(sout1); + // sout1 = Q6_Vhf_vmin_VhfVhf(sout1, high_level_vec); + // sout1 = Q6_Vhf_vmax_VhfVhf(sout1, low_level_vec); + // HVX_VectorPair sout1_pair = Q6_Wqf32_vmpy_VhfVhf(sout1, one_vec); + // HVX_Vector sout1_low = Q6_Vsf_equals_Vqf32( + // Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(sout1_pair), round_scale_vec)); + // HVX_Vector sout1_high = + // Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(sout1_pair), + // round_scale_vec)); + + // sout1_pair = Q6_W_vshuff_VVR(sout1_high, sout1_low, -4); + // sout1_low = Q6_V_lo_W(sout1_pair); + // sout1_high = Q6_V_hi_W(sout1_pair); + + // // { + // // HVX_Vector exp = Q6_Vh_vasr_VhR(sout1, FP16_MANTISA); + // // exp = Q6_V_vand_VV(exp, expmask); + // // exp = Q6_Vh_vsub_VhVh(exp, expbias); + + // // HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + // // HVX_Vector manzero = Q6_V_vand_VV(sout1, man); + + // // HVX_Vector sign = Q6_Vh_vasr_VhR(sout1, FP16_SIGN); + // // HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + // // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + // // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + // // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + // // HVX_Vector 
exppos_signneg = Q6_Vh_vadd_VhVh(sout1, man); + // // man = Q6_V_vnot_V(man); + // // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout1, man); + // // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // // HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout1, 1); + // // HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // // // exp >= 0 + // // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); + // // tsout1 = Q6_V_vmux_QVV(maneqzero, sout1, tsout1); + + // // // exp < 0 (-1, 1) + // // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout1, negone); + // // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // // sout1 = Q6_V_vmux_QVV(expgte23, sout1, tsout1); + // // } + + // sout1_low = Q6_Vw_equals_Vsf(sout1_low); + // sout1_low = Q6_Vw_vasr_VwR(sout1_low, ROUND_2_SCALE); + // sout1_high = Q6_Vw_equals_Vsf(sout1_high); + // sout1_high = Q6_Vw_vasr_VwR(sout1_high, ROUND_2_SCALE); + + // HVX_Vector sout2 = Q6_Vqf16_vmpy_Vqf16Vhf(sline_high, o_scale_vec); + // sout2 = Q6_Vqf16_vadd_Vqf16Vqf16(sout2, es_vec); + // sout2 = Q6_Vhf_equals_Vqf16(sout2); + // sout2 = Q6_Vhf_vmin_VhfVhf(sout2, high_level_vec); + // sout2 = Q6_Vhf_vmax_VhfVhf(sout2, low_level_vec); + // HVX_VectorPair sout2_pair = Q6_Wqf32_vmpy_VhfVhf(sout2, one_vec); + // HVX_Vector sout2_low = Q6_Vsf_equals_Vqf32( + // Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(sout2_pair), round_scale_vec)); + // HVX_Vector sout2_high = + // Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(sout2_pair), + // round_scale_vec)); + + // sout2_pair = Q6_W_vshuff_VVR(sout2_high, sout2_low, -4); + // sout2_low = Q6_V_lo_W(sout2_pair); + // sout2_high = Q6_V_hi_W(sout2_pair); + + // // { + // // HVX_Vector exp = Q6_Vh_vasr_VhR(sout2, FP16_MANTISA); + // // exp = Q6_V_vand_VV(exp, expmask); + // // exp = Q6_Vh_vsub_VhVh(exp, expbias); + + // // HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + // // HVX_Vector manzero = Q6_V_vand_VV(sout2, man); + + // // HVX_Vector sign = Q6_Vh_vasr_VhR(sout2, FP16_SIGN); + // // HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + // // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + // // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + // // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + // // HVX_Vector exppos_signneg = Q6_Vh_vadd_VhVh(sout2, man); + // // man = Q6_V_vnot_V(man); + // // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout2, man); + // // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // // HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout2, 1); + // // HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // // // exp >= 0 + // // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); + // // tsout1 = Q6_V_vmux_QVV(maneqzero, sout2, tsout1); + + // // // exp < 0 (-1, 1) + // // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout2, negone); + // // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // // sout2 = Q6_V_vmux_QVV(expgte23, sout2, tsout1); + // // } + + // sout2_low = Q6_Vw_equals_Vsf(sout2_low); + // sout2_low = Q6_Vw_vasr_VwR(sout2_low, ROUND_2_SCALE); + // sout2_high = Q6_Vw_equals_Vsf(sout2_high); + // sout2_high = Q6_Vw_vasr_VwR(sout2_high, ROUND_2_SCALE); + + // HVX_Vector reql_h = Q6_Vh_vpack_VwVw_sat(sout1_high, sout1_low); + // HVX_Vector reqh_h = Q6_Vh_vpack_VwVw_sat(sout2_high, sout2_low); + // HVX_Vector req_b = Q6_Vb_vpack_VhVh_sat(reqh_h, reql_h); + + // *optr++ = Q6_Vb_vadd_VbVb(req_b, 
uintconvert); + // } + + sline1p = sline1c; + sline2p = sline2c; + } + } + + if (vectors_in_rounddown > 0) { + + o_scale_vec = Q6_V_vsplat_R(float_to_fp16s(o_scale)); + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + HVX_Vector sline1_high; + HVX_Vector sline1_low; + HVX_Vector sline2_high; + HVX_Vector sline2_low; + + { + // dequantize sline1 qf16 + HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline1, zero_v_sf); + + temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), convert_vector); + HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + sline1_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), a_scale_vec); + sline1_low = Q6_Vhf_equals_Vqf16(sline1_low); + sline1_high = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), a_scale_vec); + sline1_high = Q6_Vhf_equals_Vqf16(sline1_high); + } + + { + // dequantize sline2 qf16 + HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline2, zero_v_sf); + + temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), convert_vector); + HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + sline2_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), b_scale_vec); + sline2_low = Q6_Vhf_equals_Vqf16(sline2_low); + sline2_high = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), b_scale_vec); + sline2_high = Q6_Vhf_equals_Vqf16(sline2_high); + } + + { + // silu sline1_low + tmp_v = Q6_Vh_vdeal_Vh(sline1_low); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry information + * about integer index. 
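+    * (Worked example, for illustration: a scaled value of 5.75 maps to 21.75
+    * after adding 16.0; in half precision 21.75 = 1.359375 * 2^4, and the top
+    * 4 mantissa bits of 0.359375 = 0b0101110... are 0b0101 = 5 = floor(5.75),
+    * i.e. exactly the segment index. The vlsr by 6 below drops the remaining
+    * 6 mantissa bits so these 4 bits land in the LSBs before masking.)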
Use the same input_scaled_v vector for hf and qf16 + * representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_low, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_low = Q6_Vhf_equals_Wqf32(output_dv.VV); + } + + { + // silu sline1_high + tmp_v = Q6_Vh_vdeal_Vh(sline1_high); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + 
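+      // (Illustrative scalar sketch of what this silu block computes, under
+      // the assumption that the c0..c4 tables hold per-segment coefficients of
+      // a 16-segment polynomial fit of sigmoid over [input_min, input_max]:
+      //   t   = (x - input_min) * scale;               // scale ~ 16 / (max - min)
+      //   i   = (int)t;                                // segment index
+      //   sig = c0[i] + x*(c1[i] + x*(c2[i] + x*(c3[i] + x*c4[i])));  // Horner
+      //   y   = x * sig;                               // SiLU = x * sigmoid(x)
+      // The HVX code below performs the same steps with VLUT16 gathers and
+      // qf32 arithmetic.)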
input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry information + * about integer index. Use the same input_scaled_v vector for hf and qf16 + * representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_high, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, 
Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_high = Q6_Vhf_equals_Wqf32(output_dv.VV); + } + + HVX_Vector sline_high; + HVX_Vector sline_low; + + { + // mul + sline_high = Q6_Vqf16_vmpy_VhfVhf(sline1_high, sline2_high); + sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, sline2_low); + + sline_high = Q6_Vhf_equals_Vqf16(sline_high); + sline_low = Q6_Vhf_equals_Vqf16(sline_low); + } + + { + // quantize + HVX_Vector sout1 = Q6_Vqf16_vmpy_VhfVhf(sline_low, o_scale_vec); + sout1 = Q6_Vqf16_vadd_Vqf16Vqf16(sout1, es_vec); + sout1 = Q6_Vhf_equals_Vqf16(sout1); + sout1 = Q6_Vhf_vmin_VhfVhf(sout1, high_level_vec); + sout1 = Q6_Vhf_vmax_VhfVhf(sout1, low_level_vec); + + { + HVX_Vector exp = Q6_Vh_vasr_VhR(sout1, FP16_MANTISA); + exp = Q6_V_vand_VV(exp, expmask); + exp = Q6_Vh_vsub_VhVh(exp, expbias); + + HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + HVX_Vector manzero = Q6_V_vand_VV(sout1, man); + + HVX_Vector sign = Q6_Vh_vasr_VhR(sout1, FP16_SIGN); + HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + HVX_Vector exppos_signneg = Q6_Vh_vadd_VhVh(sout1, man); + man = Q6_V_vnot_V(man); + HVX_Vector exppos_signpos = Q6_V_vand_VV(sout1, man); + exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout1, 1); + HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // exp >= 0 + HVX_Vector tsout1 = + Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + tsout1 = Q6_V_vmux_QVV(maneqzero, sout1, tsout1); + + // exp < 0 (-1, 1) + HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout1, negone); + tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + sout1 = Q6_V_vmux_QVV(expgte23, sout1, tsout1); + } + + sout1 = Q6_Vh_equals_Vhf(sout1); + + HVX_Vector sout2 = Q6_Vqf16_vmpy_VhfVhf(sline_high, o_scale_vec); + sout2 = Q6_Vqf16_vadd_Vqf16Vqf16(sout2, es_vec); + sout2 = Q6_Vhf_equals_Vqf16(sout2); + sout2 = Q6_Vhf_vmin_VhfVhf(sout2, high_level_vec); + sout2 = Q6_Vhf_vmax_VhfVhf(sout2, low_level_vec); + + { + HVX_Vector exp = Q6_Vh_vasr_VhR(sout2, FP16_MANTISA); + exp = Q6_V_vand_VV(exp, expmask); + exp = Q6_Vh_vsub_VhVh(exp, expbias); + + HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + HVX_Vector manzero = Q6_V_vand_VV(sout2, man); + + HVX_Vector sign = Q6_Vh_vasr_VhR(sout2, FP16_SIGN); + HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + HVX_Vector exppos_signneg = Q6_Vh_vadd_VhVh(sout2, man); + man = Q6_V_vnot_V(man); + HVX_Vector exppos_signpos = Q6_V_vand_VV(sout2, man); + exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout2, 1); + HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // exp >= 0 + HVX_Vector tsout1 = + Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + tsout1 = Q6_V_vmux_QVV(maneqzero, sout2, tsout1); + + // exp < 0 (-1, 1) + HVX_Vector tsout2 = 
Q6_V_vmux_QVV(iszero, sout2, negone); + tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + sout2 = Q6_V_vmux_QVV(expgte23, sout2, tsout1); + } + + sout2 = Q6_Vh_equals_Vhf(sout2); + + HVX_Vector reql_h = Q6_Vb_vpack_VhVh_sat(sout2, sout1); + *optr++ = Q6_Vb_vadd_VbVb(reql_h, uintconvert); + } + } + + // // Handle leftover elements. + // if (leftover_size > 0) { + // sline1c = (is_in_one_chunk(iptr, leftover_size, VLEN) + // ? sline1p + // : *iptr++); + // sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + // sline2c = (is_in_one_chunk(iptr2, leftover_size, VLEN) + // ? sline2p + // : *iptr2++); + // sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + // vstu_variable(optr, leftover_size, + // Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(sline1, sline2))); + // } + + return 0; +} + +template +GraphStatus llamasupersiluImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, + const PlainFloatTensor &a_scale, + const PlainFloatTensor &b_scale, + const PlainFloatTensor &o_scale) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + out_0.set_dims(in_0); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + size_t size = b_in * h_in * w_in * d_in; + + float a_scale_ = a_scale(0, 0, 0, 0); + float b_scale_ = b_scale(0, 0, 0, 0); + float o_scale_ = o_scale(0, 0, 0, 0); + + auto in_ptr = (uint8_t *)in_0.raw_data_const(); + auto in_ptr2 = (uint8_t *)in_1.raw_data_const(); + + auto out_ptr = (uint8_t *)out_0.raw_data(); + + DType dtype = in_0.get_dtype(); + + if (dtype == DType::QUInt8 && out_0.get_dtype() == DType::QUInt8) { + hvx_supersilu_ahf(in_ptr, in_ptr2, out_ptr, a_scale_, b_scale_, + 1.0f / o_scale_, size); + } + + return GraphStatus::Success; +} + +#else + +template +GraphStatus llamasupersiluImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, + const PlainFloatTensor &a_scale, + const PlainFloatTensor &b_scale, + const PlainFloatTensor &o_scale) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
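+   *
+   * Worked reference for the scalar loop below (the same math the HVX path
+   * above approximates): with uint8 inputs a, b and a zero point of 128,
+   *   x = (a - 128) * a_scale,   y = (b - 128) * b_scale
+   *   s = x * sigmoid(x) = x / (1 + exp(-x))
+   *   out = clamp(round(s * y / o_scale), -128, 127) + 128
+   * Note that the HVX path is handed 1.0f / o_scale_ so it can multiply by
+   * the reciprocal instead of dividing.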
+ */ + + out_0.set_dims(in_0); + + float a_scale_ = a_scale(0, 0, 0, 0); + float b_scale_ = b_scale(0, 0, 0, 0); + float o_scale_ = o_scale(0, 0, 0, 0); + + auto in_ptr = (uint8_t *)in_0.raw_data_const(); + auto in_ptr2 = (uint8_t *)in_1.raw_data_const(); + + auto out_ptr = (uint8_t *)out_0.raw_data(); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // mul + for (Idx d = 0; d < d_in; d++) { + + int32_t a_inval = static_cast(*in_ptr++); + float a_inval_fp16 = (a_inval - 128) * a_scale_; + + int32_t b_inval = static_cast(*in_ptr2++); + float b_inval_fp16 = (b_inval - 128) * b_scale_; + + a_inval_fp16 = a_inval_fp16 * (1 / (1 + expf(-a_inval_fp16))); + + float inval = a_inval_fp16 * b_inval_fp16; + + long v = lroundf(inval / o_scale_); + + if (v > 127) + v = 127; + + if (v < -128) + v = -128; + + v += 128; + + *out_ptr++ = static_cast(v); + } + } + } + } + + return GraphStatus::Success; +} + +#endif + +__attribute__((unused)) static float llamasupersiluCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_LLaMASuperSiLU); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/QLayerNorm.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/QLayerNorm.cpp new file mode 100644 index 000000000..7c3480944 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/QLayerNorm.cpp @@ -0,0 +1,350 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_QLayerNorm); + +// op execute function declarations +template +GraphStatus qlayernormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights, const TensorType &bias); + +// forward declaration of sample cost function +static float qlayernormCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((qlayernormImpl), "QLayerNorm") + */ +DEF_PACKAGE_OP((qlayernormImpl), "QLayerNorm") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((qlayernormImpl), + * "QLayerNorm", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((qlayernormImpl), + * "QLayerNorm", qlayernormCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +int32_t hvx_qlayernorm_af(float *restrict input, float *restrict weights, + float *restrict bias, float *restrict output, + uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)weights; + HVX_Vector *iptr3 = (HVX_Vector *)bias; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + HVX_Vector sline3p, sline3c, sline3; + + HVX_Vector zero; + + float __attribute__((aligned(VLEN))) tmp_buf[32]; + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + + zero = Q6_V_vzero(); + + // sline1p = *iptr++; + + // x sum + HVX_Vector xsum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + xsum = Q6_Vqf32_vadd_Vqf32Vqf32(xsum, sline1); + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? 
sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + xsum = Q6_Vqf32_vadd_Vqf32Vqf32(xsum, sline1); + } + + union { + float f; + uint32_t ui; + } mean_value; + mean_value.f = 0.0f; + + for (int32_t i = 64; i >= 4; i >>= 1) { + xsum = Q6_Vqf32_vadd_Vqf32Vqf32(xsum, Q6_V_vlalign_VVR(xsum, zero, i)); + } + + xsum = Q6_Vsf_equals_Vqf32(xsum); + *(HVX_Vector *)tmp_buf = xsum; + + mean_value.f = xsum[31] / size; + + // x-e^2 sum + iptr = (HVX_Vector *)input; + sline1p = *iptr++; + + HVX_Vector x2sum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + + HVX_Vector mean_vsf = Q6_V_vsplat_R(mean_value.ui); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline1 = Q6_Vqf32_vsub_Vqf32Vqf32(sline1, mean_vsf); + x2sum = Q6_Vqf32_vadd_Vqf32Vqf32( + x2sum, Q6_Vqf32_vmpy_Vqf32Vqf32(sline1, sline1)); + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline1 = Q6_Vqf32_vsub_Vqf32Vqf32(sline1, mean_vsf); + x2sum = + Q6_Vqf32_vadd_Vqf32Vqf32(x2sum, Q6_Vqf32_vmpy_Vqf32Vqf32(sline1, sline1)); + } + + float epsilon_ = 1e-5; + union { + float f; + uint32_t ui; + } sum_value; + sum_value.f = 0.0f; + + for (int32_t i = 64; i >= 4; i >>= 1) { + x2sum = Q6_Vqf32_vadd_Vqf32Vqf32(x2sum, Q6_V_vlalign_VVR(x2sum, zero, i)); + } + + x2sum = Q6_Vsf_equals_Vqf32(x2sum); + *(HVX_Vector *)tmp_buf = x2sum; + + sum_value.f = 1.0f / sqrtf(x2sum[31] / size + epsilon_); + + // x * 1/rsqrt(sum) + iptr = (HVX_Vector *)input; + sline1p = *iptr++; + sline2p = *iptr2++; + sline3p = *iptr3++; + + HVX_Vector irsqrt_vsf = Q6_V_vsplat_R(sum_value.ui); + HVX_Vector irsqrt_vqf32 = Q6_Vqf32_vadd_VsfVsf(irsqrt_vsf, Q6_V_vzero()); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr3 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline3c = *iptr3++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)weights); + sline3 = Q6_V_valign_VVR(sline3c, sline3p, (size_t)bias); + + sline1 = Q6_Vqf32_vsub_Vqf32Vqf32(sline1, mean_vsf); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline1, sline2); + middle_value_qf32 = + Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + middle_value_qf32 = Q6_Vqf32_vadd_Vqf32Vqf32(middle_value_qf32, sline3); + + *optr++ = Q6_Vsf_equals_Vqf32(middle_value_qf32); + + sline1p = sline1c; + sline2p = sline2c; + sline3p = sline3c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? 
sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)weights); + + sline3c = is_aligned(iptr3, VLEN) && leftover == 0 ? sline3p : *iptr3++; + sline3 = Q6_V_valign_VVR(sline3c, sline3p, (size_t)weights); + + sline1 = Q6_Vqf32_vsub_VsfVsf(sline1, mean_vsf); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline1, sline2); + middle_value_qf32 = + Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + middle_value_qf32 = Q6_Vqf32_vadd_Vqf32Vqf32(middle_value_qf32, sline3); + + *optr++ = Q6_Vsf_equals_Vqf32(middle_value_qf32); + } + + if (leftover_size > 0) + return -1; + + return 0; +} + +template +GraphStatus qlayernormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights, const TensorType &bias) + +{ + out_0.set_dims(in_0); + + // NHWC + + auto in_ptr = (float *)in_0.raw_data_const(); + auto out_ptr = (float *)out_0.raw_data(); + auto weights_ptr = (float *)weights.raw_data_const(); + auto bias_ptr = (float *)bias.raw_data_const(); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // RMS + hvx_qlayernorm_af(in_ptr, weights_ptr, bias_ptr, out_ptr, d_in); + + in_ptr += d_in; + out_ptr += d_in; + } + } + } + + return GraphStatus::Success; +} + +#else + +template +GraphStatus qlayernormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights, const TensorType &bias) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + return GraphStatus::Success; +} + +#endif + +__attribute__((unused)) static float qlayernormCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_QLayerNorm); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/RMSNorm.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/RMSNorm.cpp new file mode 100644 index 000000000..922cb8ff2 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/RMSNorm.cpp @@ -0,0 +1,939 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_RMSNorm); + +// op execute function declarations +template +GraphStatus rmsnormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights); + +// forward declaration of sample cost function +static float rmsnormCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. 
+ * DEF_PACKAGE_OP((rmsnormImpl), "RMSNorm") + */ +DEF_PACKAGE_OP((rmsnormImpl), "RMSNorm") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((rmsnormImpl), "RMSNorm", + * SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((rmsnormImpl), + * "RMSNorm", rmsnormCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +int32_t hvx_rmsnorm_af(float *restrict input, float *restrict weights, + float *restrict output, uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)weights; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + + // ^2 sum + HVX_Vector sum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = 
Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + } + + float epsilon_ = 1e-6; + union { + float f; + uint32_t ui; + } sum_value; + sum_value.f = 0.0f; + + HVX_Vector zero = Q6_V_vzero(); + + for (int32_t i = 64; i >= 4; i >>= 1) { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vlalign_VVR(sum, zero, i)); + } + + sum = Q6_Vsf_equals_Vqf32(sum); + sum_value.f = 1.0f / sqrtf(*((float *)&sum + 31) / size + epsilon_); + + // x * 1/rsqrt(sum) + iptr = (HVX_Vector *)input; + sline1p = *iptr++; + sline2p = *iptr2++; + + HVX_Vector irsqrt_vsf = Q6_V_vsplat_R(sum_value.ui); + HVX_Vector irsqrt_vqf32 = Q6_Vqf32_vadd_VsfVsf(irsqrt_vsf, Q6_V_vzero()); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline1, sline2); + *optr++ = Q6_Vsf_equals_Vqf32( + Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32)); + + sline1p = sline1c; + sline2p = sline2c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? 
sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline1, sline2); + *optr++ = Q6_Vsf_equals_Vqf32( + Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32)); + } + + if (leftover_size > 0) + return -1; + + return 0; +} + +static HVX_INLINE_ALWAYS uint32_t float_to_bits(float x) { + union { + float f; + uint32_t i; + } fp32 = {.f = x}; + return fp32.i; +} + +static inline int32_t float_to_fp16s(float input) { + union { + int32_t i; + __fp16 f[2]; + } fp32 = {.f = {(__fp16)input, (__fp16)input}}; + return fp32.i; +} + +#define FLOAT_MANTISA 23 +#define FLOAT_EXPONENT_MASK 0xff +#define FLOAT_EXPONENT_BIAS 0x7f +#define FLOAT_MANTISA_MASK 0x007fffff +#define FLOAT_SIGN 31 +#define FLOAT_NEG_1 0xBF800000 +#define ROUND_2_SCALE 22 +#define ROUND_SCALSE ((1 << ROUND_2_SCALE) * 1.0f) + +int32_t hvx_rmsnorm_auint8(float *restrict input, float *restrict weights, + uint8_t *restrict output, uint32_t size, + float scale) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)weights; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + HVX_Vector sline3p, sline3c, sline3; + HVX_Vector sline4p, sline4c, sline4; + HVX_Vector slinewp, slinewc, slinew; + + HVX_Vector sout1, sout2, sout3, sout4; + HVX_Vector low_level_vec, high_level_vec, scale_vec, es_vec, round_scale_vec; + + float low_level = -128.0f; + float high_level = 127.0f; + + float es = 0.5f; + low_level_vec = Q6_V_vsplat_R(float_to_bits(low_level)); + high_level_vec = Q6_V_vsplat_R(float_to_bits(high_level)); + scale_vec = Q6_V_vsplat_R(float_to_bits(scale)); + es_vec = Q6_V_vsplat_R(float_to_bits(es)); + round_scale_vec = Q6_V_vsplat_R(float_to_bits(ROUND_SCALSE)); + + HVX_Vector zero_v_sf = Q6_V_vzero(); + scale_vec = Q6_Vqf32_vadd_VsfVsf(scale_vec, zero_v_sf); + es_vec = Q6_Vqf32_vadd_VsfVsf(es_vec, zero_v_sf); + + HVX_Vector uintconvert = Q6_V_vsplat_R(0x80808080); + + // HVX_Vector expmask = Q6_V_vsplat_R(FLOAT_EXPONENT_MASK); + // HVX_Vector expbias = Q6_V_vsplat_R(FLOAT_EXPONENT_BIAS); + // HVX_Vector manmask = Q6_V_vsplat_R(FLOAT_MANTISA_MASK); + // HVX_Vector exp23 = Q6_V_vsplat_R(23 - 1); + // HVX_Vector exp0 = Q6_V_vsplat_R(0 - 1); + // HVX_Vector negone = Q6_V_vsplat_R(FLOAT_NEG_1); + HVX_Vector zero = Q6_V_vzero(); + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + // int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + + // ^2 sum + HVX_Vector sum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? 
sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + } + + float epsilon_ = 1e-6; + union { + float f; + uint32_t ui; + } sum_value; + sum_value.f = 0.0f; + + for (int32_t i = 64; i >= 4; i >>= 1) { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vlalign_VVR(sum, zero, i)); + } + + sum = Q6_Vsf_equals_Vqf32(sum); + sum_value.f = 1.0f / sqrtf(*((float *)&sum + 31) / size + epsilon_); + + // x * 1/rsqrt(sum) + iptr = (HVX_Vector *)input; + + sline1p = *iptr++; + sline2p = *iptr++; + sline3p = *iptr++; + sline4p = *iptr++; + + slinewp = *iptr2++; + + HVX_Vector irsqrt_vsf = Q6_V_vsplat_R(sum_value.ui); + HVX_Vector irsqrt_vqf32 = Q6_Vqf32_vadd_VsfVsf(irsqrt_vsf, Q6_V_vzero()); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; j += 4) { + + { + sline1c = *iptr++; + slinewc = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline1, slinew); + sline1 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + sout1 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline1, scale_vec); + sout1 = Q6_Vqf32_vadd_Vqf32Vqf32(sout1, es_vec); + sout1 = Q6_Vsf_equals_Vqf32(sout1); + sout1 = Q6_Vsf_vmin_VsfVsf(sout1, high_level_vec); + sout1 = Q6_Vsf_vmax_VsfVsf(sout1, low_level_vec); + sout1 = Q6_Vqf32_vmpy_VsfVsf(sout1, round_scale_vec); + sout1 = Q6_Vsf_equals_Vqf32(sout1); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout1, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout1, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout1, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout1, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout1, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout1, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); tsout1 = Q6_V_vmux_QVV(maneqzero, sout1, tsout1); + + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout1, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout1 = Q6_V_vmux_QVV(expgte23, sout1, tsout1); + // } + + sout1 = Q6_Vw_equals_Vsf(sout1); + sout1 = Q6_Vw_vasr_VwR(sout1, ROUND_2_SCALE); + // sout1 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout1, + // Q6_V_vzero()), 0); + + { + sline2c = *iptr++; + slinewc = *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = 
Q6_Vqf32_vmpy_VsfVsf(sline2, slinew); + sline2 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + sout2 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline2, scale_vec); + sout2 = Q6_Vqf32_vadd_Vqf32Vqf32(sout2, es_vec); + sout2 = Q6_Vsf_equals_Vqf32(sout2); + sout2 = Q6_Vsf_vmin_VsfVsf(sout2, high_level_vec); + sout2 = Q6_Vsf_vmax_VsfVsf(sout2, low_level_vec); + sout2 = Q6_Vqf32_vmpy_VsfVsf(sout2, round_scale_vec); + sout2 = Q6_Vsf_equals_Vqf32(sout2); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout2, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout2, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout2, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout2, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout2, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout2, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); tsout1 = Q6_V_vmux_QVV(maneqzero, sout2, tsout1); + + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout2, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout2 = Q6_V_vmux_QVV(expgte23, sout2, tsout1); + // } + + sout2 = Q6_Vw_equals_Vsf(sout2); + sout2 = Q6_Vw_vasr_VwR(sout2, ROUND_2_SCALE); + // sout2 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout2, + // Q6_V_vzero()), 0); + + { + sline3c = *iptr++; + slinewc = *iptr2++; + sline3 = Q6_V_valign_VVR(sline3c, sline3p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline3, slinew); + sline3 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + sout3 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline3, scale_vec); + sout3 = Q6_Vqf32_vadd_Vqf32Vqf32(sout3, es_vec); + sout3 = Q6_Vsf_equals_Vqf32(sout3); + sout3 = Q6_Vsf_vmin_VsfVsf(sout3, high_level_vec); + sout3 = Q6_Vsf_vmax_VsfVsf(sout3, low_level_vec); + sout3 = Q6_Vqf32_vmpy_VsfVsf(sout3, round_scale_vec); + sout3 = Q6_Vsf_equals_Vqf32(sout3); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout3, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout3, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout3, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout3, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout3, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout3, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 
= Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); tsout1 = Q6_V_vmux_QVV(maneqzero, sout3, tsout1); + + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout3, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout3 = Q6_V_vmux_QVV(expgte23, sout3, tsout1); + // } + + sout3 = Q6_Vw_equals_Vsf(sout3); + sout3 = Q6_Vw_vasr_VwR(sout3, ROUND_2_SCALE); + // sout3 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout3, + // Q6_V_vzero()), 0); + + { + sline4c = *iptr++; + slinewc = *iptr2++; + sline4 = Q6_V_valign_VVR(sline4c, sline4p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline4, slinew); + sline4 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + sout4 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline4, scale_vec); + sout4 = Q6_Vqf32_vadd_Vqf32Vqf32(sout4, es_vec); + sout4 = Q6_Vsf_equals_Vqf32(sout4); + sout4 = Q6_Vsf_vmin_VsfVsf(sout4, high_level_vec); + sout4 = Q6_Vsf_vmax_VsfVsf(sout4, low_level_vec); + sout4 = Q6_Vqf32_vmpy_VsfVsf(sout4, round_scale_vec); + sout4 = Q6_Vsf_equals_Vqf32(sout4); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout4, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout4, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout4, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout4, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout4, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout4, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); tsout1 = Q6_V_vmux_QVV(maneqzero, sout4, tsout1); + + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout4, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout4 = Q6_V_vmux_QVV(expgte23, sout4, tsout1); + // } + + sout4 = Q6_Vw_equals_Vsf(sout4); + sout4 = Q6_Vw_vasr_VwR(sout4, ROUND_2_SCALE); + // sout4 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout4, + // Q6_V_vzero()), 0); + + HVX_Vector reql_h = Q6_Vh_vpack_VwVw_sat(sout2, sout1); + HVX_Vector reqh_h = Q6_Vh_vpack_VwVw_sat(sout4, sout3); + HVX_Vector req_b = Q6_Vb_vpack_VhVh_sat(reqh_h, reql_h); + + *optr++ = Q6_Vb_vadd_VbVb(req_b, uintconvert); + + sline1p = sline1c; + sline2p = sline2c; + sline3p = sline3c; + sline4p = sline4c; + + slinewp = slinewc; + } + } + + return 0; +} + +int32_t hvx_rmsnorm_auint8_opt(float *restrict input, float *restrict weights, + uint8_t *restrict output, uint32_t size, + float scale) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)weights; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + HVX_Vector sline3p, sline3c, sline3; + 
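+  // Illustrative note on the fixed-point requantization used further below
+  // (the non-_opt variant above clamps in float instead). Per element the
+  // goal is, apart from the small es = 0.5 bias,
+  //   q = saturate_u8( round(v * scale) + 128 ),   v = x * w / rms
+  // To keep intermediates inside fp16/int16 range this is split up: v is
+  // scaled by (scale / 64) * 2^rsh * 2^15 in fp16, converted to int16, run
+  // through a rounding fractional multiply (x 64 / 2^15, i.e. / 512), leaving
+  // roughly v * scale * 2^rsh, biased by 128 * 2^rsh (vadj), and finally
+  // Q6_Vub_vasr_VhVhR_rnd_sat shifts right by rsh with rounding while
+  // saturating to unsigned bytes. rsh is derived from the exponent of
+  // scale / 64 so the fp16 product keeps headroom.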
HVX_Vector sline4p, sline4c, sline4; + HVX_Vector slinewp, slinewc, slinew; + + // HVX_Vector sout1, sout2, sout3, sout4; + // HVX_Vector low_level_vec, high_level_vec, scale_vec, es_vec, + // round_scale_vec; + + // float low_level = -128.0f; + // float high_level = 127.0f; + + // float es = 0.5f; + // low_level_vec = Q6_V_vsplat_R(float_to_bits(low_level)); + // high_level_vec = Q6_V_vsplat_R(float_to_bits(high_level)); + // scale_vec = Q6_V_vsplat_R(float_to_bits(scale)); + // es_vec = Q6_V_vsplat_R(float_to_bits(es)); + // round_scale_vec = Q6_V_vsplat_R(float_to_bits(ROUND_SCALSE)); + + // HVX_Vector zero_v_sf = Q6_V_vzero(); + // scale_vec = Q6_Vqf32_vadd_VsfVsf(scale_vec, zero_v_sf); + // es_vec = Q6_Vqf32_vadd_VsfVsf(es_vec, zero_v_sf); + + // HVX_Vector uintconvert = Q6_V_vsplat_R(0x80808080); + + // HVX_Vector expmask = Q6_V_vsplat_R(FLOAT_EXPONENT_MASK); + // HVX_Vector expbias = Q6_V_vsplat_R(FLOAT_EXPONENT_BIAS); + // HVX_Vector manmask = Q6_V_vsplat_R(FLOAT_MANTISA_MASK); + // HVX_Vector exp23 = Q6_V_vsplat_R(23 - 1); + // HVX_Vector exp0 = Q6_V_vsplat_R(0 - 1); + // HVX_Vector negone = Q6_V_vsplat_R(FLOAT_NEG_1); + HVX_Vector zero = Q6_V_vzero(); + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + // int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + + // ^2 sum + HVX_Vector sum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + } + + float epsilon_ = 1e-6; + union { + float f; + uint32_t ui; + } sum_value; + sum_value.f = 0.0f; + + for (int32_t i = 64; i >= 4; i >>= 1) { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vlalign_VVR(sum, zero, i)); + } + + sum = Q6_Vsf_equals_Vqf32(sum); + sum_value.f = 1.0f / sqrtf(*((float *)&sum + 31) / size + epsilon_); + + // x * 1/rsqrt(sum) + iptr = (HVX_Vector *)input; + + sline1p = *iptr++; + sline2p = *iptr++; + sline3p = *iptr++; + sline4p = *iptr++; + + slinewp = *iptr2++; + + HVX_Vector irsqrt_vsf = Q6_V_vsplat_R(sum_value.ui); + HVX_Vector irsqrt_vqf32 = Q6_Vqf32_vadd_VsfVsf(irsqrt_vsf, Q6_V_vzero()); + + float post_scale_flt = scale / 64.0f; + int scexp = flt_getexp(post_scale_flt); + int rsh = min_i32(-scexp, 7); // e.g. 
0.11 -> 0.88, rsh = 3 + float rsh_fac = flt_power2(rsh); + + int adj_bias = roundf_i32(128 * rsh_fac); + adj_bias = Q6_R_combine_RlRl(adj_bias, adj_bias); + + HVX_Vector zero_v_sf = Q6_V_vzero(); + float es = 0.5f; + HVX_Vector es_vec = Q6_V_vsplat_R(float_to_fp16s(es)); + es_vec = Q6_Vqf16_vadd_VhfVhf(es_vec, zero_v_sf); + + HVX_Vector vadj = Q6_V_vsplat_R(adj_bias); + HVX_Vector o_scale_vec = + Q6_V_vsplat_R(float_to_fp16s(post_scale_flt * rsh_fac * (1 << 15))); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; j += 4) { + + { + sline1c = *iptr++; + slinewc = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline1, slinew); + sline1 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + { + sline2c = *iptr++; + slinewc = *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline2, slinew); + sline2 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + HVX_Vector sline_low = + Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(sline2, sline1)); + sline_low = Q6_Vqf16_vadd_Vqf16Vqf16(sline_low, es_vec); + sline_low = Q6_Vqf16_vmpy_VhfVhf(sline_low, o_scale_vec); + sline_low = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_low)); + sline_low = Q6_Vh_vadd_VhVh_sat( + Q6_Vh_vmpy_VhRh_s1_rnd_sat(sline_low, 0x00400040), vadj); + + sline_low = Q6_Vh_vdeal_Vh(sline_low); + + { + sline3c = *iptr++; + slinewc = *iptr2++; + sline3 = Q6_V_valign_VVR(sline3c, sline3p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline3, slinew); + sline3 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + { + sline4c = *iptr++; + slinewc = *iptr2++; + sline4 = Q6_V_valign_VVR(sline4c, sline4p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline4, slinew); + sline4 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + HVX_Vector sline_high = + Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(sline4, sline3)); + sline_high = Q6_Vqf16_vadd_Vqf16Vqf16(sline_high, es_vec); + sline_high = Q6_Vqf16_vmpy_VhfVhf(sline_high, o_scale_vec); + sline_high = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_high)); + sline_high = Q6_Vh_vadd_VhVh_sat( + Q6_Vh_vmpy_VhRh_s1_rnd_sat(sline_high, 0x00400040), vadj); + + sline_high = Q6_Vh_vdeal_Vh(sline_high); + + HVX_Vector sout = Q6_Vub_vasr_VhVhR_rnd_sat(sline_high, sline_low, rsh); + sout = Q6_Vb_vdeal_Vb(sout); + *optr++ = sout; + + sline1p = sline1c; + sline2p = sline2c; + sline3p = sline3c; + sline4p = sline4c; + + slinewp = slinewc; + } + } + + return 0; +} + +template +GraphStatus rmsnormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights) + +{ + out_0.set_dims(in_0); + + // NHWC + + auto in_ptr = (float *)in_0.raw_data_const(); + auto weights_ptr = (float 
*)weights.raw_data_const(); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + DType dtype = out_0.get_dtype(); + + if (dtype == DType::Float32) { + + auto out_ptr = (float *)out_0.raw_data(); + + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // RMS + hvx_rmsnorm_af(in_ptr, weights_ptr, out_ptr, d_in); + + in_ptr += d_in; + out_ptr += d_in; + } + } + } + + } else if (dtype == DType::QUInt8) { + + auto out_ptr = (uint8_t *)out_0.raw_data(); + float scale_ = out_0.get_interface_scale(); + + scale_ = 1.0f / scale_; + + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // RMS + hvx_rmsnorm_auint8(in_ptr, weights_ptr, out_ptr, d_in, scale_); + + in_ptr += d_in; + out_ptr += d_in; + } + } + } + } + + return GraphStatus::Success; +} + +#else + +template +GraphStatus rmsnormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + out_0.set_dims(in_0); + // NHWC + + float epsilon_ = 1e-6; + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // RMS + float sum_squares = 0.0f; + for (Idx d = 0; d < d_in; d++) { + float inval = in_0(b, h, w, d); + sum_squares += inval * inval; + } + + // debuglog("silu execute... sum_squares=(%f)", sum_squares); + + float rms = sqrtf(sum_squares / d_in + epsilon_); + debuglog("rms execute... sum_squares=(%f)", 1.0f / rms); + debuglog("rms execute... 
sum_squares=(%f)", sum_squares); + + for (Idx d = 0; d < d_in; d++) { + float inval = in_0(b, h, w, d); + float weight = weights(0, 0, 0, d); + + out_0(b, h, w, d) = inval * weight / rms; + } + } + } + } + + return GraphStatus::Success; +} + +#endif + +__attribute__((unused)) static float rmsnormCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_RMSNorm); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/SiLU.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/SiLU.cpp new file mode 100644 index 000000000..73f1ee050 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/SiLU.cpp @@ -0,0 +1,1425 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_SiLU); + +// op execute function declarations +template +GraphStatus siluImpl(TensorType &out_0, const TensorType &in_0); + +// forward declaration of sample cost function +static float siluCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((siluImpl), "SiLU") + */ +DEF_PACKAGE_OP((siluImpl), "SiLU") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((siluImpl), "SiLU", + * SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((siluImpl), + * "SiLU", siluCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) 
+ * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +static inline int32_t float_to_fp16s(float input) { + union { + int32_t i; + __fp16 f[2]; + } fp32 = {.f = {(__fp16)input, (__fp16)input}}; + return fp32.i; +} + +static HVX_INLINE_ALWAYS uint32_t float_to_bits(float x) { + union { + float f; + uint32_t i; + } fp32 = {.f = x}; + return fp32.i; +} + +/* Polynomial coefficients */ +static const float c0_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.1329913082916337, + 0.22308514882873062, + 0.347752862580421, + 0.4845759228057826, + 0.5724725619240282, + 0.5532613332075828, + 0.5041402176920755, + 0.4999998945071365, + 0.500005251569411, + 0.494975832882496, + 0.44426898861108216, + 0.42865769845972046, + 0.5186084804556764, + 0.6556781472810073, + 0.7780379623543565, + 0.8670752648575938, +}; +static const float c1_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0595948414501292, + 0.11153317908159224, + 0.19545701719511055, + 0.3058925677063833, + 0.3932668307015573, + 0.3630691859433203, + 0.26302954631996744, + 0.2499155333713503, + 0.24983690256810576, + 0.26551386754654915, + 0.3670764533308477, + 0.39196882072648825, + 0.3030372911476408, + 0.19296191313371913, + 0.11084562978488391, + 0.059559556604464964, +}; +static const float c2_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.010207999856103376, + 0.02144807112969563, + 0.04266485934992188, + 0.07616157468726052, + 0.10882760873715347, + 0.09125379784995667, + 0.013872106909816257, + -0.0008786208359828815, + 0.0011993845621092196, + -0.01645080326288375, + -0.09367947263571219, + -0.10827006684348266, + -0.07520301291634655, + -0.04198514892887826, + -0.021290356584896874, + -0.010200991240527542, +}; +static const float c3_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0007896351019423816, + 0.0018718593077865326, + 0.004259190313167949, + 0.008784166436796144, + 0.014228201960903939, + 0.009727536748893095, + 
-0.01721317464724529, + -0.023762851116001377, + -0.02424226654277249, + -0.01604104065157868, + 0.010376786273973133, + 0.014122038833203628, + 0.008641365746408176, + 0.004176981844803722, + 0.0018557930308154783, + 0.0007890167735032168, +}; +static const float c4_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.3213858349988003e-05, + 6.232838199801025e-05, + 0.0001632037964535633, + 0.0003928983460811959, + 0.0007341577078787206, + 0.0003053082875419616, + -0.003254838747910248, + -0.004021655986643196, + 0.004258314078650583, + 0.0030578644020607566, + -0.00037014803880675387, + -0.0007265964578827031, + -0.0003849331969038772, + -0.00015947916435728337, + -6.171511304866758e-05, + -2.319341439172678e-05, +}; + +/** + * @brief Polynomial approximation of x/(exp(-x)+1.0) function. + * @param[in] input Input array of elements in IEEE 32-bit floating-point + * format. + * @param[out] output Output array of elements in IEEE 32-bit floating-point + * format. + * @param[in] length Number of elements in input/output arrays. + * @return Returns 0 on successful execution. Otherwise -1. + */ +int32_t hvx_silu_af(float *restrict input, float *restrict output, + uint32_t size) { + HVX_Vector *input_v_ptr; + HVX_UVector *output_v_ptr; + HVX_Vector input_min_v_f; + HVX_Vector input_shifted_v_qf32; + HVX_Vector input_scaled_v_qf32; + HVX_Vector scale_v; + HVX_Vector input_v_qf32; + HVX_Vector const16_0_v_sf; + HVX_Vector zero_v_sf; + HVX_Vector mask_idx1_v, mask_idx2_v; + HVX_Vector tmp_v, idx1_v, idx2_v; + HVX_Vector output_v; + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + HVX_DV c0_coeff_dv; + HVX_VectorPair c0_coeff_vp; + HVX_Vector c0_coeff_v; + HVX_DV c1_coeff_dv; + HVX_VectorPair c1_coeff_vp; + HVX_Vector c1_coeff_v; + HVX_DV c2_coeff_dv; + HVX_VectorPair c2_coeff_vp; + HVX_Vector c2_coeff_v; + HVX_DV c3_coeff_dv; + HVX_VectorPair c3_coeff_vp; + HVX_Vector c3_coeff_v; + HVX_DV c4_coeff_dv; + HVX_VectorPair c4_coeff_vp; + HVX_Vector c4_coeff_v; + + HVX_Vector f8, f_8; + + /* Check input arguments. Return error status if some argument has invalid + * value */ + if ((input == 0) || (output == 0) || (size == 0)) { + return -1; + } + + input_v_ptr = (HVX_Vector *)input; + output_v_ptr = (HVX_UVector *)output; + + f8 = Q6_V_vsplat_R(float_to_bits(8.0f)); + f_8 = Q6_V_vsplat_R(float_to_bits(-8.0f)); + + /* + * If input data is not aligned to HVX vector size, compose aligned vectors + * from data loaded in slinep and slinec + */ + slinep = *input_v_ptr++; + + /* + * Splat scale factor in order to be used later for finding indexes of + * coefficients. Scale factor is represented in IEEE 16-bit floating-point + * format and it is calculated using the following formula: scale_factor = + * (16.0 / (b0 - a0)) NOTE: Calculated value is slightly decreased in order to + * avoid out of bound indexes during VLUT lookup. + */ + scale_v = Q6_V_vsplat_R(0x3f7ffffe); + + /* + * Vector of zeroes used as neutral element in sf to qf32 conversions. + * NOTE: Some of conversions (i.e conversion of scale factor and coefficients) + * can be avoided in real-time, but this is not done in order to don't + * sacrify code readibility in expense of insignificant performance + * improvement. 
+ */ + zero_v_sf = Q6_V_vzero(); + + /* Mask for extracting only 4 bits of mantissa */ + mask_idx1_v = Q6_V_vsplat_R(0x0000000F); + mask_idx2_v = Q6_V_vsplat_R(0x00000010); + + /* 16.0 in IEEE 16-bit floating-point representation */ + const16_0_v_sf = Q6_V_vsplat_R(0x41800000); + + /* + * Prepare vector of input_min values, that is used later in shifting input + * range. input_min is low boundary of specified input range. + */ + input_min_v_f = Q6_V_vsplat_R(0xc1000000); + + /* Convert scale factor from sf to q32. Use the same vector for both formats + */ + scale_v = Q6_Vqf32_vadd_VsfVsf(scale_v, zero_v_sf); + + /* Load coefficients */ + c0_coeff_v = *((HVX_Vector *)(c0_coeffs)); + c1_coeff_v = *((HVX_Vector *)(c1_coeffs)); + c2_coeff_v = *((HVX_Vector *)(c2_coeffs)); + c3_coeff_v = *((HVX_Vector *)(c3_coeffs)); + c4_coeff_v = *((HVX_Vector *)(c4_coeffs)); + + /* Convert coefficients from sf to qf32 format. Use the same vector for both + * representations */ + c0_coeff_v = Q6_Vqf32_vadd_VsfVsf(c0_coeff_v, zero_v_sf); + c1_coeff_v = Q6_Vqf32_vadd_VsfVsf(c1_coeff_v, zero_v_sf); + c2_coeff_v = Q6_Vqf32_vadd_VsfVsf(c2_coeff_v, zero_v_sf); + c3_coeff_v = Q6_Vqf32_vadd_VsfVsf(c3_coeff_v, zero_v_sf); + c4_coeff_v = Q6_Vqf32_vadd_VsfVsf(c4_coeff_v, zero_v_sf); + + /* Split 32-bit coefficients to lower and upper part in order to obtain them + * later with VLUT16. */ + c0_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c0_coeff_v); + c1_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c1_coeff_v); + c2_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c2_coeff_v); + c3_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c3_coeff_v); + c4_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c4_coeff_v); + + /* + * Handle number of whole vectors in input data. + * Don't process last vector in order to avoid out-of-boundary load. + */ + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(input_v_ptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + /* Process one vector at a time */ + for (int32_t j = 0; j < block; ++j) { + slinec = *input_v_ptr++; + + /* Compose vector of input data from slinec and slinep */ + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v_qf32 = + Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry information + * about integer index. 
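+ * Illustrative worked example (not part of the original comment), assuming
+ * the [-8.0, 8.0] input range implied by input_min above: x = 0.5 shifts to
+ * 8.5, scales to ~8.5 and lands at ~24.5 = 0x41C40000 after adding 16.0.
+ * 0x41C40000 >> 19 = 0x838, & 0x0F = 8, | 0x10 = 24, so VLUT16 selects
+ * coefficient set 24 (c0 ~= 0.5000, c1 ~= 0.2498, ...), and the polynomial
+ * evaluates to ~0.6225, matching sigmoid(0.5).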
+ */ + input_scaled_v_qf32 = + Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + // x * sigmod + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(input_v_qf32, output_v); + + HVX_Vector out_v = Q6_Vsf_equals_Vqf32(output_v); + + HVX_VectorPred islf8 = Q6_Q_vcmp_gt_VsfVsf(sline, f8); + out_v = Q6_V_vmux_QVV(islf8, sline, out_v); + + HVX_VectorPred islf_8 = Q6_Q_vcmp_gt_VsfVsf(f_8, sline); + out_v = Q6_V_vmux_QVV(islf_8, zero_v_sf, out_v); + + /* Store results to the output buffer and convert from qf32 to sf */ + *((HVX_UVector *)(output_v_ptr++)) = out_v; + + /* Prepare slinep for next iteration */ + slinep = slinec; + } + } + + /* Handle last whole vector from input data */ + if (vectors_in_rounddown > 0) { + slinec = + is_aligned(input_v_ptr, VLEN) && leftover == 0 ? slinep : *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v_qf32 = + Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
+ */ + input_scaled_v_qf32 = + Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + // x * sigmod + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(input_v_qf32, output_v); + + HVX_Vector out_v = Q6_Vsf_equals_Vqf32(output_v); + + HVX_VectorPred islf8 = Q6_Q_vcmp_gt_VsfVsf(sline, f8); + out_v = Q6_V_vmux_QVV(islf8, sline, out_v); + + HVX_VectorPred islf_8 = Q6_Q_vcmp_gt_VsfVsf(f_8, sline); + out_v = Q6_V_vmux_QVV(islf_8, zero_v_sf, out_v); + + /* Convert from qf32 to sf, store output and go to handle leftover */ + *((HVX_UVector *)(output_v_ptr++)) = out_v; + + slinep = slinec; + } + + /* Handle leftover elements */ + if (leftover > 0) { + slinec = + (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep + : *input_v_ptr++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v_qf32 = + Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
+ */ + input_scaled_v_qf32 = + Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + // x * sigmod + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(input_v_qf32, output_v); + + HVX_Vector out_v = Q6_Vsf_equals_Vqf32(output_v); + + HVX_VectorPred islf8 = Q6_Q_vcmp_gt_VsfVsf(sline, f8); + out_v = Q6_V_vmux_QVV(islf8, sline, out_v); + + HVX_VectorPred islf_8 = Q6_Q_vcmp_gt_VsfVsf(f_8, sline); + out_v = Q6_V_vmux_QVV(islf_8, zero_v_sf, out_v); + + /* Store output */ + vstu_variable(output_v_ptr, leftover_size, out_v); + } + + return 0; +} + +static const float fp16_c0_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.13239719960243818, + 0.2216255210749415, + 0.3447664743728659, + 0.48137452032585476, + 0.5716299228719798, + 0.5547323231605259, + 0.5046287748870234, + 0.4999985574626892, + 0.5000036514755082, + 0.49475652448004626, + 0.4441393352532763, + 0.428500379952032, + 0.5173297285470642, + 0.6541461039833616, + 0.7783931007462818, + 0.8678015179911097, +}; +static const float fp16_c1_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.05928005756790343, + 0.11063222460270064, + 0.1932879057003057, + 0.30302440212086995, + 0.3922924462181049, + 0.36546332659415875, + 0.2644148210990377, + 0.24989020912329707, + 0.2498532691910313, + 
0.2661055781198988, + 0.36728015359480604, + 0.39215270010450015, + 0.3041825601732039, + 0.1940762094668647, + 0.11061794856987572, + 0.059174800917353595, +}; +static const float fp16_c2_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.010145494303219278, + 0.02123968384425681, + 0.04207468332514667, + 0.07519946712591977, + 0.10840620196267145, + 0.09270738184406795, + 0.015322371881818012, + -0.0009948273994921822, + 0.0011544907060402412, + -0.017040517565094934, + -0.09379878876657094, + -0.10835043868732394, + -0.07558705272699548, + -0.04228875316413285, + -0.021235740718738055, + -0.010124599879590107, +}; +static const float fp16_c3_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0007841223015974933, + 0.001850453397354219, + 0.004187899308371771, + 0.008640952434084206, + 0.01414741414964877, + 0.010117749275618, + -0.01654848996354919, + -0.02395108399453624, + -0.024199111971064446, + -0.015783556879607072, + 0.010407672131558174, + 0.014137608186323335, + 0.008698510795258909, + 0.004213708431213342, + 0.0018499827774393985, + 0.0007822799742289481, +}; +static const float fp16_c4_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.3031641204975905e-05, + 6.150442488966733e-05, + 0.00015997783736818624, + 0.00038491646239693526, + 0.0007283649599237781, + 0.00034439150914392054, + -0.003142246198646662, + -0.004120389580321761, + 0.004246050162553198, + 0.0030162727520777893, + -0.00037312974308425725, + -0.0007277242855014247, + -0.00038811687679772674, + -0.0001611434776868886, + -6.14837984586862e-05, + -2.297076123375133e-05, +}; + +/** + * @brief Polynomial approximation of 1.0/(exp(-x)+1.0) function. + * @param[in] input Input array of elements in IEEE 16-bit floating-point + * format. + * @param[out] output Output array of elements in IEEE 16-bit floating-point + * format. + * @param[in] length Number of elements in input/output arrays. + * @return Returns 0 on successful execution. Otherwise -1. + */ +int32_t hvx_silu_ahf(__fp16 *restrict input, __fp16 *restrict output, + uint32_t size) { + HVX_Vector *input_v_ptr; + HVX_UVector *output_v_ptr; + HVX_Vector input_min_v_hf; + HVX_Vector input_shifted_v_hf; + HVX_Vector input_scaled_v; + HVX_VectorPair input_vp_qf32; + // HVX_Vector input_v_qf16; + HVX_Vector mask_idx1_v, mask_idx2_v; + HVX_Vector const16_0_v_hf; + HVX_Vector zero_v_hf, one_v_hf; + HVX_Vector tmp_v; + HVX_Vector idx1_v, idx2_v; + HVX_Vector scale_v; + HVX_DV output_dv; + // HVX_Vector output_v; + HVX_Vector slinep, slinec, sline; + HVX_Vector sout; + int32_t block, l2fetch_block; + int32_t leftover = size & 63; + int32_t vectors_in_rounddown = size / 64; + int32_t leftover_size = leftover * sizeof(__fp16); + HVX_DV c0_coeff_dv; + HVX_VectorPair c0_coeff_vp; + HVX_Vector c0_coeff_v; + HVX_DV c1_coeff_dv; + HVX_VectorPair c1_coeff_vp; + HVX_Vector c1_coeff_v; + HVX_DV c2_coeff_dv; + HVX_VectorPair c2_coeff_vp; + HVX_Vector c2_coeff_v; + HVX_DV c3_coeff_dv; + HVX_VectorPair c3_coeff_vp; + HVX_Vector c3_coeff_v; + HVX_DV c4_coeff_dv; + HVX_VectorPair c4_coeff_vp; + HVX_Vector c4_coeff_v; + + /* Check input arguments. 
Return error status if some argument has invalid + * value */ + if ((input == 0) || (output == 0) || (size == 0)) { + return -1; + } + + input_v_ptr = (HVX_Vector *)input; + output_v_ptr = (HVX_UVector *)output; + + /* + * If input data is not aligned to HVX vector size, compose aligned vectors + * from data loaded in slinep and slinec + */ + slinep = *input_v_ptr++; + + /* + * Splat scale factor in order to be used later for finding indexes of + * coefficients. Scale factor is represented in IEEE 16-bit floating-point + * format and it is calculated using the following formula: scale_factor = + * (convert_sf_to_hf) (16.0 / (b0 - a0)) NOTE: Calculated value is slightly + * decreased in order to avoid out of bound indexes during VLUT lookup. + */ + scale_v = Q6_Vh_vsplat_R(0x3bfe); + + /* Vector of ones used as mpy neutral element in conversions from hf vector to + * qf32 vector pair */ + one_v_hf = Q6_Vh_vsplat_R(0x3c00); + + /* + * Vector of zeroes used as neutral element in hf to qf16 conversions. + * NOTE: Some of conversions (i.e conversion of scale factor and coefficients) + * can be avoided in real-time, but this is not done in order to don't + * sacrify code readibility in expense of insignificant performance + * improvement. + */ + zero_v_hf = Q6_V_vzero(); + + /* Mask for extracting only 4 bits of mantissa */ + mask_idx1_v = Q6_Vh_vsplat_R(0x000F); + + mask_idx2_v = Q6_V_vsplat_R(0x00001010); + + /* 16.0 in IEEE 16-bit floating-point representation */ + const16_0_v_hf = Q6_Vh_vsplat_R(0x4c00); + + /* + * Prepare vector of input_min values, that is used later in shifting input + * range. input_min is low boundary of specified input range. + */ + input_min_v_hf = Q6_Vh_vsplat_R(0xc800); + + /* Convert scale factor from hf to q16. Use the same vector for both formats + */ + scale_v = Q6_Vqf16_vadd_VhfVhf(scale_v, zero_v_hf); + + /* Load coefficients */ + c0_coeff_v = *((HVX_Vector *)(fp16_c0_coeffs)); + c1_coeff_v = *((HVX_Vector *)(fp16_c1_coeffs)); + c2_coeff_v = *((HVX_Vector *)(fp16_c2_coeffs)); + c3_coeff_v = *((HVX_Vector *)(fp16_c3_coeffs)); + c4_coeff_v = *((HVX_Vector *)(fp16_c4_coeffs)); + + /* Convert coefficients from hf to qf32 format. Use the same vector for both + * representations */ + c0_coeff_v = Q6_Vqf32_vadd_VsfVsf(c0_coeff_v, zero_v_hf); + c1_coeff_v = Q6_Vqf32_vadd_VsfVsf(c1_coeff_v, zero_v_hf); + c2_coeff_v = Q6_Vqf32_vadd_VsfVsf(c2_coeff_v, zero_v_hf); + c3_coeff_v = Q6_Vqf32_vadd_VsfVsf(c3_coeff_v, zero_v_hf); + c4_coeff_v = Q6_Vqf32_vadd_VsfVsf(c4_coeff_v, zero_v_hf); + + /* Split 32-bit coefficients to lower and upper part in order to obtain them + * later with VLUT16. */ + c0_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c0_coeff_v); + c1_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c1_coeff_v); + c2_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c2_coeff_v); + c3_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c3_coeff_v); + c4_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c4_coeff_v); + + /* + * Handle number of whole vectors in input data. + * Don't process last vector in order to avoid out-of-boundary load. 
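+ * Blocking note (added for clarity; assumes the usual 128-byte HVX vectors,
+ * i.e. VLEN = 128): BLOCK_SIZE = 8 * 1024 / VLEN = 64 vectors, so each outer
+ * iteration processes up to 8 KB while l2fetch() prefetches the block
+ * L2FETCH_AHEAD (= 64) vectors ahead; l2fetch_block drops to <= 0 near the
+ * end, so nothing is prefetched past the input buffer.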
+ */ + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(input_v_ptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + /* Process one vector at a time */ + for (int32_t j = 0; j < block; ++j) { + slinec = *input_v_ptr++; + + /* Compose vector of input data from slinec and slinep */ + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + tmp_v = Q6_Vh_vdeal_Vh(sline); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry information + * about integer index. Use the same input_scaled_v vector for hf and qf16 + * representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, 
Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // input_v_qf16 = Q6_Vqf16_vmpy_VhfVhf(sline, one_v_hf); + + // output_v = Q6_Vqf16_vmpy_Vqf16Vhf(input_v_qf16, + // Q6_Vhf_equals_Wqf32(c4_coeff_vp)); output_v = + // Q6_Vqf16_vadd_Vqf16Vhf(output_v, Q6_Vhf_equals_Wqf32(c3_coeff_vp)); + // output_v = Q6_Vqf16_vmpy_Vqf16Vqf16(output_v, input_v_qf16); + // output_v = Q6_Vqf16_vadd_Vqf16Vhf(output_v, + // Q6_Vhf_equals_Wqf32(c2_coeff_vp)); output_v = + // Q6_Vqf16_vmpy_Vqf16Vqf16(output_v, input_v_qf16); output_v = + // Q6_Vqf16_vadd_Vqf16Vhf(output_v, Q6_Vhf_equals_Wqf32(c1_coeff_vp)); + // output_v = Q6_Vqf16_vmpy_Vqf16Vqf16(output_v, input_v_qf16); + // output_v = Q6_Vqf16_vadd_Vqf16Vhf(output_v, + // Q6_Vhf_equals_Wqf32(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + /* Store results to the output buffer and convert from qf16 to hf */ + *output_v_ptr++ = Q6_Vhf_equals_Wqf32(output_dv.VV); + // output_v = Q6_Vqf16_vmpy_Vqf16Vqf16(output_v, input_v_qf16); + // *output_v_ptr++ = Q6_Vhf_equals_Vqf16(output_v); + + /* Prepare slinep for next iteration */ + slinep = slinec; + } + } + + /* Handle last whole vector from input data */ + if (vectors_in_rounddown > 0) { + slinec = + is_aligned(input_v_ptr, VLEN) && leftover == 0 ? slinep : *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + tmp_v = Q6_Vh_vdeal_Vh(sline); + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
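+ * Hand-worked example for the fp16 path (added for illustration, assuming
+ * the [-8.0, 8.0] range set via input_min above): x = 0.5 shifts/scales to
+ * ~24.5, i.e. half-precision 0x4E20; 0x4E20 >> 6 = 0x138 and & 0x000F = 8,
+ * so segment 8 of 16 is chosen and the lookup reads entry 16 + 8 = 24 of the
+ * fp16_c* tables (c0 ~= 0.5000), giving ~0.6225 == sigmoid(0.5).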
+ * Use the same input_scaled_v vector for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + /* Convert from qf32 to hf, store output and go to handle leftover */ + *output_v_ptr++ = Q6_Vhf_equals_Wqf32(output_dv.VV); + + slinep = slinec; + } + + /* Handle leftover elements */ + if (leftover > 0) { + slinec = + 
(is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep + : *input_v_ptr++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + tmp_v = Q6_Vh_vdeal_Vh(sline); + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. + * Use the same input_scaled_v vector for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + 
Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + /* Convert from qf16 to hf */ + sout = Q6_Vhf_equals_Wqf32(output_dv.VV); + + /* Store output */ + vstu_variable(output_v_ptr, leftover_size, sout); + } + + return 0; +} + +#endif + +template +GraphStatus siluImpl(TensorType &out_0, const TensorType &in_0) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + +#ifdef REFERENCE_OP + debuglog("silu execute... inval=(%d)", in_0.get_dtype()); + debuglog("silu execute... inval=(%d)", out_0.get_dtype()); + + out_0.set_dims(in_0); + // NHWC + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // SiLU + for (Idx d = 0; d < d_in; d++) { + float inval = in_0(b, h, w, d); + float outval = 1 / (1 + expf(-inval)); + + debuglog("silu execute... inval=(%f)", inval); + debuglog("silu execute... outval=(%f)", outval); + + out_0(b, h, w, d) = inval * outval; + } + } + } + } + +#else + + // HVX Method -- FP32 Version + out_0.set_dims(in_0); + + DType dtype = in_0.get_dtype(); + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + size_t size = b_in * h_in * w_in * d_in; + + // Noticable size >= 128 + + // SiLU inval / (1 + expf(-inval)); + // sigmod 1.0/(exp(-x)+1.0) + // SiLU inval * sigmod + + if (dtype == DType::Float16) { + + // NHWC + auto in_ptr = (__fp16 *)in_0.raw_data_const(); + auto out_ptr = (__fp16 *)out_0.raw_data(); + hvx_silu_ahf(in_ptr, out_ptr, size); + + } else { + // NHWC + auto in_ptr = (float *)in_0.raw_data_const(); + auto out_ptr = (float *)out_0.raw_data(); + hvx_silu_af(in_ptr, out_ptr, size); + } + + return GraphStatus::Success; + +#endif + +#ifdef DEBUG + + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // SiLU + for (Idx d = 0; d < d_in; d++) { + float out_value = out_0(b, h, w, d); + debuglog("silu execute... 
outval=(%f)", out_value); + } + } + } + } + +#endif + + return GraphStatus::Success; +} + +__attribute__((unused)) static float siluCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_SiLU); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/SplitInput.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/SplitInput.cpp new file mode 100644 index 000000000..832420ca1 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/SplitInput.cpp @@ -0,0 +1,154 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_SplitInput); + +// op execute function declarations +template +GraphStatus splitinputImpl(TensorType &out_0, TensorType &out_1, + const TensorType &in_0, const TensorType1 &in_1, + const Tensor &num); + +// forward declaration of sample cost function +static float splitinputCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((splitinputImpl), "SplitInput") + */ +DEF_PACKAGE_OP((splitinputImpl), "SplitInput") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((splitinputImpl), "SplitInput", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((splitinputImpl), "SplitInput", splitinputCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) 
+ * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +template +GraphStatus splitinputImpl(TensorType &out_0, TensorType &out_1, + const TensorType &in_0, const TensorType1 &in_1, + const Tensor &num) { + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + + // default is two. + + size_t o_size = in_1(0, 0, 0, 0); + size_t x_size = in_1(0, 0, 0, 1); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + const size_t dims_0[] = {b_in, o_size, w_in, d_in}; + const size_t dims_1[] = {b_in, x_size, w_in, d_in}; + + out_0.set_dims(dims_0); + out_1.set_dims(dims_1); + + DType dtype = in_0.get_dtype(); + uint32_t bitwidth = 4; + + if (dtype == DType::QUInt8 || dtype == DType::QInt8) { + + bitwidth = 1; + + } else if (dtype == DType::Float16) { + + bitwidth = 2; + } else if (dtype == DType::Float32) { + + bitwidth = 4; + } + + const uint8_t *in_ptr = (uint8_t *)in_0.raw_data_const(); + + uint8_t *out_ptr_0 = (uint8_t *)out_0.raw_data(); + uint8_t *out_ptr_1 = (uint8_t *)out_1.raw_data(); + + memcpy(out_ptr_0, in_ptr, b_in * o_size * w_in * d_in * bitwidth); + in_ptr += b_in * o_size * w_in * d_in * bitwidth; + + memcpy(out_ptr_1, in_ptr, b_in * x_size * w_in * d_in * bitwidth * 4); + + return GraphStatus::Success; +} + +__attribute__((unused)) static float splitinputCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_SplitInput); diff --git a/nntrainer/npu/qnn/Model/QnnModel.cpp b/nntrainer/npu/qnn/Model/QnnModel.cpp new file mode 100644 index 000000000..41c368154 --- /dev/null +++ b/nntrainer/npu/qnn/Model/QnnModel.cpp @@ -0,0 +1,668 @@ +//============================================================================== +// +// Copyright (c) 2019-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// +//============================================================================== + +#include +#include +#include +#include + +#include "QnnModel.hpp" +#include "QnnModelPal.hpp" +#include "QnnTypeMacros.hpp" + +#define FREE_MEMORY(ptr1, ptr2, ptr3) \ + do { \ + free(ptr1); \ + free(ptr2); \ + free(ptr3); \ + } while (0) + +namespace qnn_wrapper_api { + +ModelError_t QnnModel::initialize(const Qnn_BackendHandle_t &backendHandle, + const QNN_INTERFACE_VER_TYPE &qnnInterface, + const Qnn_ContextHandle_t &context, + const char *graphName, bool debug, + uint8_t doNodeValidations, + const QnnGraph_Config_t **graphConfigs) { + if (backendHandle == nullptr) { + PRINT_ERROR("QnnModel::initialize() nullptr passed as backend handle."); + return MODEL_CONTEXT_ERROR; + } + if (context == nullptr) { + PRINT_ERROR("QnnModel::initialize() nullptr passed as context handle."); + return MODEL_CONTEXT_ERROR; + } + if (graphName == nullptr) { + PRINT_ERROR("QnnModel::initialize() nullptr passed as graphName."); + return MODEL_GRAPH_ERROR; + } + + if (!m_graphName.empty()) { + // only one graph is allowed per QnnModel + PRINT_ERROR( + "QnnModel::initialize() model for graph %s already initialized.", + graphName); + return MODEL_GRAPH_ERROR; + } + + if (!m_doNodeValidations) { + PRINT_WARNING("Node validation disabled. Backend will not perform op " + "validation prior to adding Node. \n"); + } + + m_qnnInterface = qnnInterface; + m_backendHandle = backendHandle; + m_graphName = graphName; + m_debug = debug; + m_doNodeValidations = doNodeValidations; + + if (m_qnnInterface.graphCreate(context, graphName, graphConfigs, &m_graph) != + QNN_GRAPH_NO_ERROR || + m_graph == nullptr) { + PRINT_ERROR( + "QnnModel::initialize() not able to create graph in given context."); + return MODEL_GRAPH_ERROR; + } + + return MODEL_NO_ERROR; +} + +ModelError_t QnnModel::addTensor(const char *nodeName, Qnn_Tensor_t *tensor, + bool saveTensor) { + ModelError_t err; + if (!tensor) { + PRINT_ERROR("QnnModel::addTensor() NULL tensor pointer provided.\n"); + return MODEL_TENSOR_ERROR; + } + VALIDATE_TENSOR_VERSION((*tensor), err); + + // Verify tensor being added is not a duplicate + std::string mapEntry = std::string(QNN_TENSOR_GET_NAME(tensor)); + if (m_modelTensorsMap.find(mapEntry) != m_modelTensorsMap.end()) { + PRINT_ERROR("QnnModel::addTensor() creating tensor %s for node %s. 
Tensor " + "already exists.\n", + mapEntry.c_str(), nodeName); + + return MODEL_TENSOR_ERROR; + } + + const std::map dataTypeToSize = { + {QNN_DATATYPE_INT_8, 1}, {QNN_DATATYPE_INT_16, 2}, + {QNN_DATATYPE_INT_32, 4}, {QNN_DATATYPE_INT_64, 8}, + {QNN_DATATYPE_UINT_8, 1}, {QNN_DATATYPE_UINT_16, 2}, + {QNN_DATATYPE_UINT_32, 4}, {QNN_DATATYPE_UINT_64, 8}, + {QNN_DATATYPE_FLOAT_16, 2}, {QNN_DATATYPE_FLOAT_32, 4}, + {QNN_DATATYPE_BOOL_8, 1}, {QNN_DATATYPE_SFIXED_POINT_8, 1}, + {QNN_DATATYPE_SFIXED_POINT_16, 2}, {QNN_DATATYPE_SFIXED_POINT_32, 4}, + {QNN_DATATYPE_UFIXED_POINT_8, 1}, {QNN_DATATYPE_UFIXED_POINT_16, 2}, + {QNN_DATATYPE_UFIXED_POINT_32, 4}, + }; + + if (dataTypeToSize.find(QNN_TENSOR_GET_DATA_TYPE(tensor)) == + dataTypeToSize.end()) { + PRINT_ERROR("QnnModel::addTensor() invalid QNN data type provided, %u, for " + "tensor %s on node %s\n", + QNN_TENSOR_GET_DATA_TYPE(tensor), QNN_TENSOR_GET_NAME(tensor), + nodeName); + return MODEL_TENSOR_ERROR; + } + + // sanity check tensor data if addTensor used for static tensor + if (QNN_TENSOR_GET_TYPE(tensor) == QNN_TENSOR_TYPE_STATIC) { + if (QNN_TENSOR_GET_MEM_TYPE(tensor) != QNN_TENSORMEMTYPE_RAW) { + PRINT_ERROR("QnnModel::addTensor(): Expected raw memType in provided " + "static tensor %s for node %s", + mapEntry.c_str(), nodeName); + return MODEL_TENSOR_ERROR; + } + // verify size expressed by the dims matches the raw tensor size + uint32_t qnnTensorSize = std::accumulate( + QNN_TENSOR_GET_DIMENSIONS(tensor), + QNN_TENSOR_GET_DIMENSIONS(tensor) + QNN_TENSOR_GET_RANK(tensor), + (uint32_t)dataTypeToSize.find(QNN_TENSOR_GET_DATA_TYPE(tensor))->second, + std::multiplies()); + if (qnnTensorSize != QNN_TENSOR_GET_CLIENT_BUF(tensor).dataSize) { + PRINT_ERROR("QnnModel::addTensor(): Adding STATIC tensor, length " + "mismatch between clientBuf" + "size and tensor Dims(dim * rank * sizeof(datatype) for, " + "nodeName: %s, tensorName: %s." 
+                  "Got tensorSize: %d, tensor.clientBuf.dataSize: %d.\n",
+                  nodeName, QNN_TENSOR_GET_NAME(tensor), qnnTensorSize,
+                  QNN_TENSOR_GET_CLIENT_BUF(tensor).dataSize);
+      return MODEL_TENSOR_ERROR;
+    }
+  }
+
+  if (m_debug && QNN_TENSOR_GET_TYPE(tensor) == QNN_TENSOR_TYPE_NATIVE) {
+    // for debug, make all tensors accessible by client
+    QNN_TENSOR_SET_TYPE(tensor, QNN_TENSOR_TYPE_APP_READ);
+  }
+
+  if (m_qnnInterface.tensorCreateGraphTensor(m_graph, tensor) !=
+      QNN_TENSOR_NO_ERROR) {
+    PRINT_ERROR(
+      "QnnModel::addTensor() Creating tensor for node: %s, tensorName: %s.\n",
+      nodeName, QNN_TENSOR_GET_NAME(tensor));
+    return MODEL_TENSOR_ERROR;
+  }
+
+  if (saveTensor) {
+    Qnn_Tensor_t tensorCopy;
+    VALIDATE(deepCopyQnnTensors(*tensor, tensorCopy), err);
+
+    // save network input/outputs tensors to use for setting the Qnn graph's
+    // input and output tensors for populating GraphInfo_t for caller
+    if (QNN_TENSOR_GET_TYPE(tensor) == QNN_TENSOR_TYPE_APP_WRITE) {
+      m_modelInputTensors.push_back(tensorCopy);
+    } else if (QNN_TENSOR_GET_TYPE(tensor) == QNN_TENSOR_TYPE_APP_READ) {
+      m_modelOutputTensors.push_back(tensorCopy);
+    }
+
+    // save created tensors for later lookup to populate graph node construction
+    m_modelTensorsMap[mapEntry] = tensorCopy;
+  }
+
+  return MODEL_NO_ERROR;
+}
+
+ModelError_t QnnModel::addTensor(const char *nodeName, Qnn_Tensor_t tensor,
+                                 bool saveTensor) {
+  return addTensor(nodeName, &tensor, saveTensor);
+}
+
+ModelError_t QnnModel::getQnnTensor(const char *&nodeName,
+                                    const char *&tensorName,
+                                    Qnn_Tensor_t &tensor) {
+  std::string mapEntry = std::string(tensorName);
+  if (m_modelTensorsMap.find(tensorName) == m_modelTensorsMap.end()) {
+    PRINT_ERROR("QnnModel::getQnnTensor() tensor %s not found on node %s\n",
+                mapEntry.c_str(), nodeName);
+    return MODEL_TENSOR_ERROR;
+  }
+  tensor = m_modelTensorsMap[mapEntry];
+
+  return MODEL_NO_ERROR;
+}
+
+// overload for string tensorName
+ModelError_t QnnModel::getQnnTensor(std::string nodeName,
+                                    std::string tensorName,
+                                    Qnn_Tensor_t &tensor) {
+  if (m_modelTensorsMap.find(tensorName) == m_modelTensorsMap.end()) {
+    PRINT_ERROR("QnnModel::getQnnTensor() tensor %s not found on node %s\n",
+                tensorName.c_str(), nodeName.c_str());
+    return MODEL_TENSOR_ERROR;
+  }
+  tensor = m_modelTensorsMap[tensorName];
+
+  return MODEL_NO_ERROR;
+}
+
+ModelError_t QnnModel::addNode(Qnn_OpConfigVersion_t version, const char *name,
+                               const char *packageName, const char *type,
+                               Qnn_Param_t *params, uint32_t numOfParams,
+                               const char **inputNames, uint32_t numOfInputs,
+                               Qnn_Tensor_t *outputTensors,
+                               uint32_t numOfOutputs) {
+  ModelError_t nodeError;
+  Qnn_OpConfig_t opDefinition = QNN_OPCONFIG_INIT;
+  opDefinition.version = version;
+  VALIDATE_OP_CONFIG_VERSION((opDefinition), nodeError);
+
+  // populate Qnn param for node
+  Qnn_Param_t *nodeParams =
+    (Qnn_Param_t *)malloc(numOfParams * sizeof(Qnn_Param_t));
+
+  // populate input tensors for node
+  Qnn_Tensor_t *inputs =
+    (Qnn_Tensor_t *)malloc(numOfInputs * sizeof(Qnn_Tensor_t));
+
+  // populate output tensors of node
+  Qnn_Tensor_t *outputs =
+    (Qnn_Tensor_t *)malloc(numOfOutputs * sizeof(Qnn_Tensor_t));
+
+  if (nodeParams == nullptr || inputs == nullptr || outputs == nullptr) {
+    PRINT_ERROR("QnnModel::addNode() failed to allocate memory for creating "
+                "QNN OpConfig for node %s.\n",
+                name);
+    FREE_MEMORY(nodeParams, inputs, outputs);
+    return MODEL_MEMORY_ALLOCATE_ERROR;
+  }
+  uint32_t nodeParamsCounter = 0;
+  for (size_t i = 0; i < numOfParams; i++) {
+    switch (params[i].paramType) {
+    case QNN_PARAMTYPE_TENSOR: {
+      Qnn_Tensor_t &tensor = params[i].tensorParam;
+      // Note: set saveTensor to false as no need to save tensor beyond this
+      // function call for params
+      nodeError = addTensor(name, &tensor, false);
+      if (nodeError != MODEL_NO_ERROR) {
+        PRINT_ERROR("QnnModel::addNode() addTensor() failed for tensor param "
+                    "%s on node %s.\n",
+                    QNN_TENSOR_GET_NAME(tensor), name);
+        FREE_MEMORY(nodeParams, inputs, outputs);
+        return nodeError;
+      }
+      nodeParams[nodeParamsCounter].paramType = QNN_PARAMTYPE_TENSOR;
+      nodeParams[nodeParamsCounter].name = params[i].name;
+      nodeParams[nodeParamsCounter++].tensorParam = tensor;
+      break;
+    }
+    case QNN_PARAMTYPE_SCALAR: {
+      nodeParams[nodeParamsCounter].paramType = QNN_PARAMTYPE_SCALAR;
+      nodeParams[nodeParamsCounter].name = params[i].name;
+      nodeParams[nodeParamsCounter++].scalarParam = params[i].scalarParam;
+      break;
+    }
+    default: {
+      PRINT_ERROR("QnnModel::addNode() unknown param type passed for param %s "
+                  "on node %s.\n",
+                  params[i].name, name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return MODEL_PARAMS_ERROR;
+    }
+    }
+  }
+
+  size_t inputsCounter = 0;
+  for (size_t j = 0; j < numOfInputs; j++) {
+    nodeError = getQnnTensor(name, inputNames[j], inputs[inputsCounter++]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() getQnnTensor() failed for tensor %s on node %s.\n",
+        inputNames[j], name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+  }
+
+  size_t outputsCounter = 0;
+  m_modelOutputTensorMap[name] = {};
+  for (size_t k = 0; k < numOfOutputs; k++) {
+    // create node output tensors first
+    nodeError = addTensor(name, outputTensors[k]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() addTensor() failed for tensor %s on node %s\n",
+        QNN_TENSOR_GET_NAME(outputTensors[k]), name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+    const char *outTensorName = QNN_TENSOR_GET_NAME(outputTensors[k]);
+    m_modelOutputTensorMap[name].push_back(outTensorName);
+    nodeError = getQnnTensor(name, outTensorName, outputs[outputsCounter++]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() getQnnTensor() failed for tensor %s on node %s.\n",
+        outTensorName, name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+  }
+
+  // define and add node to graph
+  QNN_OP_CFG_SET_NAME(opDefinition, name);
+  QNN_OP_CFG_SET_PACKAGE_NAME(opDefinition, packageName);
+  QNN_OP_CFG_SET_TYPE_NAME(opDefinition, type);
+  QNN_OP_CFG_SET_PARAMS(opDefinition, numOfParams, nodeParams);
+  QNN_OP_CFG_SET_INPUTS(opDefinition, numOfInputs, inputs);
+  QNN_OP_CFG_SET_OUTPUTS(opDefinition, numOfOutputs, outputs);
+
+  if (m_doNodeValidations) {
+    auto validationStatus =
+      m_qnnInterface.backendValidateOpConfig(m_backendHandle, opDefinition);
+    if (validationStatus == QNN_BACKEND_ERROR_NOT_SUPPORTED) {
+      PRINT_DEBUG("QnnModel::addNode() validation API not supported.\n");
+    } else if (validationStatus != QNN_SUCCESS) {
+      PRINT_ERROR("QnnModel::addNode() validating node %s failed.\n", name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return MODEL_GRAPH_ERROR;
+    }
+  }
+
+  if (m_qnnInterface.graphAddNode(m_graph, opDefinition) !=
+      QNN_GRAPH_NO_ERROR) {
+    PRINT_ERROR("QnnModel::addNode() adding node %s failed.\n", name);
+    FREE_MEMORY(nodeParams, inputs, outputs);
+    return MODEL_GRAPH_ERROR;
+  }
+
+  FREE_MEMORY(nodeParams, inputs, outputs);
+  return MODEL_NO_ERROR;
+}
+
+// overload for vector of inputNames
+ModelError_t
+QnnModel::addNode(Qnn_OpConfigVersion_t version, const char *name,
+                  const char *packageName, const char *type,
+                  Qnn_Param_t *params, uint32_t numOfParams,
+                  std::vector<std::string> inputNames,
+                  uint32_t numOfInputs,
+                  Qnn_Tensor_t *outputTensors,
+                  uint32_t numOfOutputs) {
+  ModelError_t nodeError;
+  Qnn_OpConfig_t opDefinition = QNN_OPCONFIG_INIT;
+  opDefinition.version = version;
+  VALIDATE_OP_CONFIG_VERSION((opDefinition), nodeError);
+
+  // populate Qnn param for node
+  Qnn_Param_t *nodeParams =
+    (Qnn_Param_t *)malloc(numOfParams * sizeof(Qnn_Param_t));
+
+  // populate input tensors for node
+  Qnn_Tensor_t *inputs =
+    (Qnn_Tensor_t *)malloc(numOfInputs * sizeof(Qnn_Tensor_t));
+
+  // populate output tensors of node
+  Qnn_Tensor_t *outputs =
+    (Qnn_Tensor_t *)malloc(numOfOutputs * sizeof(Qnn_Tensor_t));
+
+  if (nodeParams == nullptr || inputs == nullptr || outputs == nullptr) {
+    PRINT_ERROR("QnnModel::addNode() failed to allocate memory for creating "
+                "QNN OpConfig for node %s.\n",
+                name);
+    FREE_MEMORY(nodeParams, inputs, outputs);
+    return MODEL_MEMORY_ALLOCATE_ERROR;
+  }
+  uint32_t nodeParamsCounter = 0;
+  for (size_t i = 0; i < numOfParams; i++) {
+    switch (params[i].paramType) {
+    case QNN_PARAMTYPE_TENSOR: {
+      Qnn_Tensor_t &tensor = params[i].tensorParam;
+      // Note: set saveTensor to false as no need to save tensor beyond this
+      // function call for params
+      nodeError = addTensor(name, &tensor, false);
+      if (nodeError != MODEL_NO_ERROR) {
+        PRINT_ERROR("QnnModel::addNode() addTensor() failed for tensor param "
+                    "%s on node %s.\n",
+                    QNN_TENSOR_GET_NAME(tensor), name);
+        FREE_MEMORY(nodeParams, inputs, outputs);
+        return nodeError;
+      }
+      nodeParams[nodeParamsCounter].paramType = QNN_PARAMTYPE_TENSOR;
+      nodeParams[nodeParamsCounter].name = params[i].name;
+      nodeParams[nodeParamsCounter++].tensorParam = tensor;
+      break;
+    }
+    case QNN_PARAMTYPE_SCALAR: {
+      nodeParams[nodeParamsCounter].paramType = QNN_PARAMTYPE_SCALAR;
+      nodeParams[nodeParamsCounter].name = params[i].name;
+      nodeParams[nodeParamsCounter++].scalarParam = params[i].scalarParam;
+      break;
+    }
+    default: {
+      PRINT_ERROR("QnnModel::addNode() unknown param type passed for param %s "
+                  "on node %s.\n",
+                  params[i].name, name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return MODEL_PARAMS_ERROR;
+    }
+    }
+  }
+
+  size_t inputsCounter = 0;
+  for (size_t j = 0; j < numOfInputs; j++) {
+    nodeError = getQnnTensor(name, inputNames[j], inputs[inputsCounter++]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() getQnnTensor() failed for tensor %s on node %s.\n",
+        inputNames[j].c_str(), name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+  }
+
+  size_t outputsCounter = 0;
+  m_modelOutputTensorMap[name] = {};
+  for (size_t k = 0; k < numOfOutputs; k++) {
+    // create node output tensors first
+    nodeError = addTensor(name, outputTensors[k]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() addTensor() failed for tensor %s on node %s\n",
+        QNN_TENSOR_GET_NAME(outputTensors[k]), name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+    const char *outTensorName = QNN_TENSOR_GET_NAME(outputTensors[k]);
+    m_modelOutputTensorMap[name].push_back(outTensorName);
+    nodeError = getQnnTensor(name, outTensorName, outputs[outputsCounter++]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() getQnnTensor() failed for tensor %s on node %s.\n",
+        outTensorName, name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+  }
+
+  // define and add node to graph
+  QNN_OP_CFG_SET_NAME(opDefinition, name);
+  QNN_OP_CFG_SET_PACKAGE_NAME(opDefinition, packageName);
+  QNN_OP_CFG_SET_TYPE_NAME(opDefinition, type);
+  QNN_OP_CFG_SET_PARAMS(opDefinition, numOfParams, nodeParams);
+  QNN_OP_CFG_SET_INPUTS(opDefinition, numOfInputs, inputs);
+  QNN_OP_CFG_SET_OUTPUTS(opDefinition, numOfOutputs, outputs);
+
+  if (m_doNodeValidations) {
+    auto validationStatus =
+      m_qnnInterface.backendValidateOpConfig(m_backendHandle, opDefinition);
+    if (validationStatus == QNN_BACKEND_ERROR_NOT_SUPPORTED) {
+      PRINT_DEBUG("QnnModel::addNode() validation API not supported.\n");
+    } else if (validationStatus != QNN_SUCCESS) {
+      PRINT_ERROR("QnnModel::addNode() validating node %s failed.\n", name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return MODEL_GRAPH_ERROR;
+    }
+  }
+
+  if (m_qnnInterface.graphAddNode(m_graph, opDefinition) !=
+      QNN_GRAPH_NO_ERROR) {
+    PRINT_ERROR("QnnModel::addNode() adding node %s failed.\n", name);
+    FREE_MEMORY(nodeParams, inputs, outputs);
+    return MODEL_GRAPH_ERROR;
+  }
+
+  FREE_MEMORY(nodeParams, inputs, outputs);
+  return MODEL_NO_ERROR;
+}
+
+ModelError_t QnnModel::freeCachedTensors() {
+  ModelError_t err = MODEL_NO_ERROR;
+
+  // cleanup cached tensors
+  for (std::map<std::string, Qnn_Tensor_t>::iterator tensorIt =
+         m_modelTensorsMap.begin();
+       tensorIt != m_modelTensorsMap.end();) {
+    Qnn_Tensor_t &tensor = tensorIt->second;
+    if (QNN_TENSOR_GET_TYPE(tensor) != QNN_TENSOR_TYPE_APP_WRITE &&
+        QNN_TENSOR_GET_TYPE(tensor) != QNN_TENSOR_TYPE_APP_READ) {
+      VALIDATE(freeQnnTensor(tensor), err);
+      tensorIt = m_modelTensorsMap.erase(tensorIt);
+    } else {
+      tensorIt++;
+    }
+  }
+
+  return err;
+}
+
+ModelError_t QnnModel::finalize(Qnn_ProfileHandle_t profile,
+                                Qnn_SignalHandle_t signal) {
+  ModelError_t err;
+
+  // finalize the graph
+  if (m_qnnInterface.graphFinalize(m_graph, profile, signal) !=
+      QNN_GRAPH_NO_ERROR) {
+    PRINT_ERROR("QnnModel::finalize() finalizing graph failed.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  VALIDATE(freeCachedTensors(), err);
+
+  return err;
+}
+
+ModelError_t getGraphInfoFromModels(QnnModel *models, uint32_t numModels,
+                                    GraphInfoPtr_t **graphsInfo) {
+  ModelError_t err = MODEL_NO_ERROR;
+  if (models == nullptr || graphsInfo == nullptr || numModels <= 0) {
+    PRINT_ERROR("getGraphInfoFromModels() models and graphsInfo uninitialized "
+                "or number of models is "
+                "<= 0.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  *graphsInfo = (GraphInfo_t **)malloc(numModels * sizeof(GraphInfo_t *));
+  if (*graphsInfo == nullptr) {
+    PRINT_ERROR(
+      "getGraphInfoFromModels() graphsInfo malloc returned nullptr.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  GraphInfo_t *graphArr =
+    (GraphInfo_t *)malloc(numModels * sizeof(GraphInfo_t));
+  if (graphArr == nullptr) {
+    PRINT_ERROR("getGraphInfoFromModels() graphArr malloc returned nullptr.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  for (uint32_t i = 0; i < numModels; i++) {
+    QnnModel &model = models[i];
+    graphArr[i].graph = model.getQnnGraph();
+    graphArr[i].graphName =
+      strnDup(model.getQnnGraphName().c_str(), model.getQnnGraphName().size());
+    if (graphArr[i].graphName == nullptr) {
+      PRINT_ERROR("getGraphInfoFromModels() failed to construct graphName. "
+                  "Received nullptr.\n");
+      return MODEL_GRAPH_ERROR;
+    }
+
+    // allocate and add graph input/output TensorsWrapper. Note: no need to make
+    // deep copies of the tensor's pointer members as they are already allocated
+    // on heap in the addTensor function call.
+    std::vector<Qnn_Tensor_t> graphInputTensors = model.getGraphInputTensors();
+    size_t numInputTensors = graphInputTensors.size();
+    size_t inputTensorsSize = numInputTensors * sizeof(Qnn_Tensor_t);
+    graphArr[i].inputTensors = (Qnn_Tensor_t *)malloc(inputTensorsSize);
+    memscpy(graphArr[i].inputTensors, inputTensorsSize,
+            graphInputTensors.data(), inputTensorsSize);
+    graphArr[i].numInputTensors = (uint32_t)numInputTensors;
+    // allocate and add graph outputTensors
+    std::vector<Qnn_Tensor_t> graphOutputTensors =
+      model.getGraphOutputTensors();
+    size_t numOutputTensors = graphOutputTensors.size();
+    size_t outputTensorsSize = numOutputTensors * sizeof(Qnn_Tensor_t);
+    graphArr[i].outputTensors = (Qnn_Tensor_t *)malloc(outputTensorsSize);
+    memscpy(graphArr[i].outputTensors, outputTensorsSize,
+            graphOutputTensors.data(), outputTensorsSize);
+    graphArr[i].numOutputTensors = (uint32_t)numOutputTensors;
+
+    // have return object point to the populated graph struct
+    (*graphsInfo)[i] = graphArr + i;
+
+    // graph composition is complete by this stage, free any cached tensors
+    // still remaining
+    VALIDATE(model.freeCachedTensors(), err);
+  }
+
+  return err;
+}
+
+ModelError_t getSingleGraphInfoFromModel(QnnModel &model,
+                                         GraphInfoPtr_t *graphInfoPtr) {
+  ModelError_t err = MODEL_NO_ERROR;
+
+  *graphInfoPtr = (GraphInfo_t *)malloc(sizeof(GraphInfo_t));
+  auto graphInfo = *graphInfoPtr;
+  if (graphInfo == nullptr) {
+    PRINT_ERROR(
+      "getSingleGraphInfoFromModel() graphInfo malloc returned nullptr.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  graphInfo->graph = model.getQnnGraph();
+  graphInfo->graphName =
+    strnDup(model.getQnnGraphName().c_str(), model.getQnnGraphName().size());
+  if (graphInfo->graphName == nullptr) {
+    PRINT_ERROR("getSingleGraphInfoFromModel() failed to construct graphName. "
+                "Received nullptr.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  // allocate and add graph input/output TensorsWrapper. Note: no need to make
+  // deep copies of the tensor's pointer members as they are already allocated
+  // on heap in the addTensor function call.
+  std::vector<Qnn_Tensor_t> graphInputTensors = model.getGraphInputTensors();
+  size_t numInputTensors = graphInputTensors.size();
+  size_t inputTensorsSize = numInputTensors * sizeof(Qnn_Tensor_t);
+  graphInfo->inputTensors = (Qnn_Tensor_t *)malloc(inputTensorsSize);
+  memscpy(graphInfo->inputTensors, inputTensorsSize, graphInputTensors.data(),
+          inputTensorsSize);
+  graphInfo->numInputTensors = (uint32_t)numInputTensors;
+  // allocate and add graph outputTensors
+  std::vector<Qnn_Tensor_t> graphOutputTensors = model.getGraphOutputTensors();
+  size_t numOutputTensors = graphOutputTensors.size();
+  size_t outputTensorsSize = numOutputTensors * sizeof(Qnn_Tensor_t);
+  graphInfo->outputTensors = (Qnn_Tensor_t *)malloc(outputTensorsSize);
+  memscpy(graphInfo->outputTensors, outputTensorsSize,
+          graphOutputTensors.data(), outputTensorsSize);
+  graphInfo->numOutputTensors = (uint32_t)numOutputTensors;
+
+  // graph composition is complete by this stage, free any cached tensors
+  // still remaining
+  VALIDATE(model.freeCachedTensors(), err);
+  return err;
+}
+
+ModelError_t freeGraphsInfo(GraphInfoPtr_t **graphsInfo, uint32_t numGraphs) {
+  if (graphsInfo == nullptr || *graphsInfo == nullptr) {
+    PRINT_ERROR("freeGraphsInfo() invalid graphsInfo.");
+    return MODEL_TENSOR_ERROR;
+  }
+  for (uint32_t i = 0; i < numGraphs; i++) {
+    PRINT_INFO("Freeing graph in freeGraphsInfo");
+    free((*graphsInfo)[i]->graphName);
+    freeQnnTensors((*graphsInfo)[i]->inputTensors,
+                   (*graphsInfo)[i]->numInputTensors);
+    freeQnnTensors((*graphsInfo)[i]->outputTensors,
+                   (*graphsInfo)[i]->numOutputTensors);
+  }
+
+  free(**graphsInfo);
+  free(*graphsInfo);
+  *graphsInfo = nullptr;
+
+  return MODEL_NO_ERROR;
+}
+
+ModelError_t QnnModel::freeTensors() {
+
+  for (std::map<std::string, Qnn_Tensor_t>::iterator tensorIt =
+         m_modelTensorsMap.begin();
+       tensorIt != m_modelTensorsMap.end();) {
+    Qnn_Tensor_t &tensor = tensorIt->second;
+
+    tensorIt = m_modelTensorsMap.erase(tensorIt);
+  }
+
+  return MODEL_NO_ERROR;
+}
+
+ModelError_t QnnModel::clearGraph() {
+
+  m_modelInputTensors.resize(0);
+  m_modelOutputTensors.resize(0);
+
+  m_modelOutputTensorMap.clear();
+  m_graphName.clear();
+
+  return MODEL_NO_ERROR;
+}
+
+} // namespace qnn_wrapper_api
diff --git a/nntrainer/npu/qnn/Model/QnnModel.hpp b/nntrainer/npu/qnn/Model/QnnModel.hpp
new file mode 100644
index 000000000..288dc9075
--- /dev/null
+++ b/nntrainer/npu/qnn/Model/QnnModel.hpp
@@ -0,0 +1,280 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "QnnInterface.h"
+#include "QnnLog.h"
+#include "QnnModelPal.hpp"
+#include "WrapperUtils/QnnWrapperUtils.hpp"
+
+namespace qnn_wrapper_api {
+
+class QnnModel {
+public:
+  ~QnnModel() = default;
+
+  /**
+   * @brief Creates a Qnn Graph within given context.
+   *
+   * @param[in] backendHandle A handle to the QNN backend which will be
+   *            used to query the API symbols
+   *
+   * @param[in] qnnInterface the QNN backend interface to use
+   *
+   * @param[in] context A handle to the context where the model's graph would
+   *            be created.
+   *
+   * @param[in] graphName The name to use for creating a graph in the context
+   *            provided.
+ * + * @param[in] debug If flag is true, sets all tensors created in model to be + * QNN_TENSOR_TYPE_APP_READ, essentially overwriting what + * is set in Qnn_TensorType. + * + * @param[in] doNodeValidations If flag is set, all nodes added with addNode + * call will be validated by Backend + * + * @param[in] graphConfigs Array of graph configurations to use for creating + * the QNN Graph. Default: nullptr + * + */ + ModelError_t initialize(const Qnn_BackendHandle_t &backendHandle, + const QNN_INTERFACE_VER_TYPE &qnnInterface, + const Qnn_ContextHandle_t &context, + const char *graphName, bool debug, + uint8_t doNodeValidations = 1, + const QnnGraph_Config_t **graphConfigs = nullptr); + + /** + * @brief A wrapper function to create a tensor inside class's context graph. + * + * @param[in] nodeName Lookup name for node/layer + * + * @param[in] tensor A pointer to a struct containing information on the + * tensor + * + * @param[in] saveTensor Flag to indicate if tensor should be saved in object + * for later retrieval with class getter functions. + * + * @return Error code + * + */ + ModelError_t addTensor(const char *nodeName, Qnn_Tensor_t *tensor, + bool saveTensor = true); + + /** + * @brief A wrapper function to create a tensor inside class's context graph. + * + * @param[in] nodeName Lookup name for node/layer + * + * @param[in] tensor A struct containing information on the tensor + * + * @param[in] saveTensor Flag to indicate if tensor should be saved in object + * for later retrieval with class getter functions. + * + * @return Error code + * + */ + ModelError_t addTensor(const char *nodeName, Qnn_Tensor_t tensor, + bool saveTensor = true); + + /** + * @brief function to be used to query tensors created within this QnnModel + * instance + * + * @param[in] nodeName Lookup name for node/layer + * + * @param[in] tensorName Lookup name for tensor + * + * @param[out] tensor The corresponding Qnn_Tensor_t object for given tensor + * name. + * + * @return Error code + * + */ + ModelError_t getQnnTensor(const char *&nodeName, const char *&tensorName, + Qnn_Tensor_t &tensor); + ModelError_t getQnnTensor(std::string nodeName, std::string tensorName, + Qnn_Tensor_t &tensor); + + /** + * @brief A wrapper function to create a node in class's graph. + * + * @param[in] version The QNN version for Op_Config_t structure to use (e.g. + * QNN_OPCONFIG_VERSION_1) + * + * @param[in] name The node name to use (e.g. my_graph_conv_1) + * + * @param[in] packageName The node package name (e.g. qti.aisw) + * + * @param[in] type The QNN_OP_QNN_OP_H node type (e.g. QNN_OP_ARGMAX) + * + * @param[in] params A struct object containing all the params for the node to + * be added. For tensorParam case. The tensor will be created within the + * function and the data will be retrieved from the binary blob to set the + * tensor data. + * + * @param[in] numOfParams The number of elements in above params object + * + * @param[in] inputNames List of tensor names for inputs to node. Note: the + * corresponding qnn tensor objects must be created within this instance prior + * to being listed as input to a node + * + * @param[in] numOfInputs The number of elements in above inputNames object + * + * @param[in] outputTensors List of Qnn_Tensor_t objects for outputs from + * node. Note1: the corresponding qnn tensor objects will be created in + * function and must not already exist. 
+   * Note2: the output names must be unique per graph
+   *
+   * @param[in] numOfOutputs The number of elements in above outputs object
+   *
+   * @return Error code
+   *
+   */
+  ModelError_t addNode(Qnn_OpConfigVersion_t version, const char *name,
+                       const char *packageName, const char *type,
+                       Qnn_Param_t *params, uint32_t numOfParams,
+                       const char **inputNames, uint32_t numOfInputs,
+                       Qnn_Tensor_t *outputTensors, uint32_t numOfOutputs);
+  // overload for vector of inputNames
+  ModelError_t addNode(Qnn_OpConfigVersion_t version, const char *name,
+                       const char *packageName, const char *type,
+                       Qnn_Param_t *params, uint32_t numOfParams,
+                       std::vector<std::string> inputNames,
+                       uint32_t numOfInputs, Qnn_Tensor_t *outputTensors,
+                       uint32_t numOfOutputs);
+
+  /**
+   * @brief A wrapper function to return model's graph
+   *
+   * @return The Qnn graph object
+   *
+   */
+  Qnn_GraphHandle_t getQnnGraph() { return m_graph; }
+
+  /**
+   * @brief A wrapper function to return model's graphName
+   *
+   * @return The Qnn graph object's name
+   *
+   */
+  std::string getQnnGraphName() { return m_graphName; }
+
+  /**
+   * @brief A wrapper function to return model's graph input tensors
+   *
+   * @return vector of Qnn_Tensor_t objects
+   *
+   */
+  std::vector<Qnn_Tensor_t> getGraphInputTensors() {
+    return m_modelInputTensors;
+  }
+
+  /**
+   * @brief A wrapper function to return model's graph output tensors
+   *
+   * @return vector of Qnn_Tensor_t objects
+   *
+   */
+  std::vector<Qnn_Tensor_t> getGraphOutputTensors() {
+    return m_modelOutputTensors;
+  }
+
+  /**
+   * @brief A wrapper function to return graph's output tensors->op mapping
+   *
+   * @return map of std::string, std::vector<std::string>
+   *
+   */
+  std::map<std::string, std::vector<std::string>> getOutputTensorMap() {
+    return m_modelOutputTensorMap;
+  }
+
+  /**
+   * @brief A wrapper function to finalize model's graph which includes calling
+   * backend finalize on graph.
+   *
+   * @return Error code
+   *
+   */
+  ModelError_t finalize(Qnn_ProfileHandle_t profile = nullptr,
+                        Qnn_SignalHandle_t signal = nullptr);
+
+  /**
+   * @brief Removes saved Qnn_Tensor_t objects and frees memory
+   *        Note: Cleanup doesn't apply to input/output tensors, as they are
+   *        needed for the execute call after this class finishes graph
+   *        construction. The user of this API is expected to free those.
+   *
+   * @return Error code
+   */
+  ModelError_t freeCachedTensors();
+
+  ModelError_t freeTensors();
+
+  ModelError_t clearGraph();
+
+private:
+  Qnn_GraphHandle_t m_graph = nullptr;
+  std::string m_graphName;
+  bool m_debug =
+    false; // flag to indicate if requested graph is to be run in debug mode
+           // (i.e. all intermediate tensors will be accessible to client)
+  // flag to indicate whether all addNode calls need to be validated
+  bool m_doNodeValidations = true;
+
+  std::vector<Qnn_Tensor_t> m_modelInputTensors;
+  std::vector<Qnn_Tensor_t> m_modelOutputTensors;
+  // keeps track of graph tensors to enable creating Qnn nodes from tensor names
+  std::map<std::string, Qnn_Tensor_t> m_modelTensorsMap;
+  std::map<std::string, std::vector<std::string>> m_modelOutputTensorMap;
+
+  // Qnn Backend Interface Api
+  QNN_INTERFACE_VER_TYPE m_qnnInterface;
+  Qnn_BackendHandle_t m_backendHandle;
+
+}; // QNN_MODEL_CLASS
+
+/**
+ * @brief A helper function to convert QnnModel objects to Graph struct for
+ * qnn_model c interface
+ * @param[in] models List of QnnModel objects
+ * @param[in] numModels The number of elements in above models object
+ *
+ * @param[out] graphsInfo The corresponding array of Graph object for each of
+ * the above model objects (note: this function will malloc memory needed to
+ * store the struct objects. The following free shall be invoked when the
+ * objects are no longer needed).
+ *
+ * @return Error code
+ *
+ */
+ModelError_t getGraphInfoFromModels(QnnModel *models, uint32_t numModels,
+                                    GraphInfoPtr_t **graphsInfo);
+ModelError_t getSingleGraphInfoFromModel(QnnModel &model,
+                                         GraphInfoPtr_t *graphInfoPtr);
+
+/**
+ * @brief A helper function to free memory malloced for communicating the Graph
+ * for a model(s)
+ * @param[in] graphsInfo Pointer pointing to location of graph objects
+ * @param[in] numGraphs The number of graph objects the above pointer is
+ * pointing to
+ *
+ * @return Error code
+ *
+ */
+ModelError_t freeGraphsInfo(GraphInfoPtr_t **graphsInfo, uint32_t numGraphs);
+} // namespace qnn_wrapper_api
diff --git a/nntrainer/npu/qnn/Model/QnnModelPal.cpp b/nntrainer/npu/qnn/Model/QnnModelPal.cpp
new file mode 100644
index 000000000..7858b29de
--- /dev/null
+++ b/nntrainer/npu/qnn/Model/QnnModelPal.cpp
@@ -0,0 +1,29 @@
+//==============================================================================
+//
+// Copyright (c) 2021-2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "QnnModelPal.hpp"
+
+namespace qnn_wrapper_api {
+void *dlSym(void *handle, const char *symbol) {
+  if (handle == DL_DEFAULT) {
+    return ::dlsym(RTLD_DEFAULT, symbol);
+  }
+
+  return ::dlsym(handle, symbol);
+}
+
+char *dlError(void) { return ::dlerror(); }
+
+char *strnDup(const char *source, size_t maxlen) {
+  return ::strndup(source, maxlen);
+}
+} // namespace qnn_wrapper_api
diff --git a/nntrainer/npu/qnn/Model/QnnModelPal.hpp b/nntrainer/npu/qnn/Model/QnnModelPal.hpp
new file mode 100644
index 000000000..e72273f43
--- /dev/null
+++ b/nntrainer/npu/qnn/Model/QnnModelPal.hpp
@@ -0,0 +1,54 @@
+//==============================================================================
+//
+// Copyright (c) 2021-2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+namespace qnn_wrapper_api {
+
+// specify this address to distinguish from NULL pointer
+#define DL_DEFAULT (void *)(0x4)
+
+//---------------------------------------------------------------------------
+/// @brief
+///   obtain address of a symbol in a shared object or executable
+/// @handle
+///   a handle of a dynamically loaded shared object returned by dlopen
+/// @symbol
+///   a null-terminated symbol name
+/// @return
+///   On success, return the address associated with symbol
+///   On error, NULL
+//---------------------------------------------------------------------------
+void *dlSym(void *handle, const char *symbol);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   obtain error diagnostic for functions in the dl-family APIs.
+/// @return
+///   returns a human-readable, null-terminated string describing the most
+///   recent error that occurred from a call to one of the functions in the
+///   dl-family APIs.
+///
+//---------------------------------------------------------------------------
+char *dlError(void);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Returns a pointer to a null-terminated byte string, which contains copies
+///   of at most maxlen bytes from the string pointed to by source. If the null
+///   terminator is not encountered in the first maxlen bytes, it is added to
+///   the duplicated string.
+/// @source
+///   Null-terminated source string.
+/// @maxlen
+///   Max number of bytes to copy from source
+/// @return
+///   A pointer to the newly allocated string, or a null pointer if an error
+///   occurred.
+///
+//---------------------------------------------------------------------------
+char *strnDup(const char *source, size_t maxlen);
+} // namespace qnn_wrapper_api
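
Reviewer note: a minimal usage sketch, not part of the patch, of how converter-generated model code is expected to drive the QnnModel wrapper added above. The graph, tensor, node and op names ("graph_relu", "in_0", "out_0", "relu_0", "Relu", "qti.aisw") are hypothetical placeholders, Qnn_Tensor_t field population is elided, QNN_TENSOR_INIT and QNN_OPCONFIG_VERSION_1 are assumed to come from the QNN SDK headers, and error handling is reduced to early returns.

#include "QnnModel.hpp"

using namespace qnn_wrapper_api;

// Compose a one-node graph in the given context and hand its GraphInfo_t
// back to the caller for execution.
ModelError_t composeExampleGraph(Qnn_BackendHandle_t backend,
                                 const QNN_INTERFACE_VER_TYPE &iface,
                                 Qnn_ContextHandle_t context,
                                 GraphInfoPtr_t *graphInfo) {
  QnnModel model;

  // One graph per QnnModel instance; node validation stays enabled.
  ModelError_t err = model.initialize(backend, iface, context, "graph_relu",
                                      /*debug=*/false,
                                      /*doNodeValidations=*/1);
  if (err != MODEL_NO_ERROR) return err;

  // Graph input: an APP_WRITE tensor named "in_0". Populating the remaining
  // Qnn_Tensor_t fields (data type, rank, dimensions, ...) is omitted here.
  Qnn_Tensor_t in0 = QNN_TENSOR_INIT;
  // ... set name "in_0", QNN_TENSOR_TYPE_APP_WRITE, data type, rank, dims ...
  err = model.addTensor("relu_0", &in0); // saved for lookup by name below
  if (err != MODEL_NO_ERROR) return err;

  // Node output: an APP_READ tensor that addNode() creates and registers.
  Qnn_Tensor_t out0 = QNN_TENSOR_INIT;
  // ... set name "out_0", QNN_TENSOR_TYPE_APP_READ, data type, rank, dims ...

  // Relu takes no parameters, so an empty parameter list is passed.
  const char *inputs[] = {"in_0"};
  err = model.addNode(QNN_OPCONFIG_VERSION_1, "relu_0", "qti.aisw", "Relu",
                      /*params=*/nullptr, /*numOfParams=*/0, inputs,
                      /*numOfInputs=*/1, &out0, /*numOfOutputs=*/1);
  if (err != MODEL_NO_ERROR) return err;

  // Finalize on the backend, then export the composed graph for the caller.
  err = model.finalize();
  if (err != MODEL_NO_ERROR) return err;

  return getSingleGraphInfoFromModel(model, graphInfo);
}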