diff --git a/nntrainer/npu/qnn/LLaMAPackage/Makefile b/nntrainer/npu/qnn/LLaMAPackage/Makefile new file mode 100644 index 000000000..2e86f996f --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/Makefile @@ -0,0 +1,360 @@ + +#============================================================================= +# Copyright (c) 2023 Qualcomm Technologies, Inc. +# All Rights Reserved. +# Confidential and Proprietary - Qualcomm Technologies, Inc. +#============================================================================= + +# users should provide locations for QNN_INCLUDE and HEXAGON_SDK_ROOT +# export HEXAGON_SDK_ROOT = /path/to/hexagon-sdk + +# check all setup prerequisites if the command goal is not clean +ifneq ($(MAKECMDGOALS),clean) +ifndef QNN_INCLUDE +$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") +QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN +endif +ifeq ($(wildcard $(QNN_INCLUDE)),) +$(error "ERROR: QNN_INCLUDE path is not set. QNN include paths must be set to obtain BE headers necessary to compile the package") +endif +ifndef QNN_TARGET_LIB +$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid") +QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android +endif +ifeq ($(wildcard $(QNN_TARGET_LIB)),) +ifeq ($(MAKECMDGOALS),htp_aarch64) +$(error "ERROR: QNN_TARGET_LIB is needed to compile package for aarch64") +else ifeq ($(MAKECMDGOALS),all) +$(info "WARNING:QNN_TARGET_LIB may need to be defined to compile packages") +endif +endif + +ifndef HEXAGON_SDK_ROOT +$(error "ERROR: HEXAGON_SDK_ROOT is not set. Hexagon-SDK path must be set to the latest hexagon-sdk-x.y.z") +endif + +ifeq ($(wildcard $(HEXAGON_SDK_ROOT)),) +$(error "ERROR: HEXAGON_SDK_ROOT is not set correctly. 
Please set HEXAGON_SDK_ROOT to latest hexagon-sdk-X.Y.Z path") +endif + +HEXAGON_SDK_BASE := $(dir $(HEXAGON_SDK_ROOT)) + +ifndef HEXAGON_TOOLS_ROOT +HEXAGON_TOOLS_ROOT = $(DEFAULT_HEXAGON_TOOLS_ROOT) +endif + +$(info "HEXAGON_TOOLS_ROOT is [${HEXAGON_TOOLS_ROOT}]" ) + +ifndef V +V = v75 +endif +$(info "V is [${V}]" ) +ifndef BUILD +BUILD = $(DEFAULT_BUILD) +endif + +$(info "BUILD is [${BUILD}]" ) + +QHL_DIR = $(HEXAGON_SDK_ROOT)/libs/qhl +QHL_HVX_DIR = $(HEXAGON_SDK_ROOT)/libs/qhl_hvx +COMPLETE_TOOLS_VERSION = $(shell basename $(HEXAGON_TOOLS_ROOT)) +TEMP_VAR = $(subst ., ,$(COMPLETE_TOOLS_VERSION)) +TOOLS_VERSION = $(word 1,$(TEMP_VAR))$(word 2,$(TEMP_VAR)) +BUILD_DIR = hexagon_$(BUILD)_toolv$(TOOLS_VERSION)_$(V) +PREBUILT_DIR = hexagon_toolv$(TOOLS_VERSION)_v65 + +$(info "TOOLS_VERSION is [${TOOLS_VERSION}]" ) + + +QHL_DIR_BIN = $(QHL_DIR)/$(BUILD_DIR) +QHL_HVX_DIR_BIN = $(QHL_HVX_DIR)/$(BUILD_DIR) + +QHL_INC_DIRS := $(QHL_DIR)/inc/qhmath $(QHL_DIR)/inc/qhcomplex $(QHL_DIR)/inc/qhdsp $(QHL_DIR)/inc/qhblas +# QHL_LIBS = $(QHL_DIR_BIN)/libqhdsp.a $(QHL_DIR_BIN)/libqhcomplex.a $(QHL_DIR_BIN)/libqhmath.a $(QHL_DIR_BIN)/libqhblas.a + +QHL_HVX_INC_DIRS := $(QHL_HVX_DIR)/inc/internal $(QHL_HVX_DIR)/inc/qhdsp_hvx $(QHL_HVX_DIR)/inc/qhblas_hvx +# QHL_HVX_LIBS = $(QHL_HVX_DIR_BIN)/libqhdsp_hvx.a $(QHL_HVX_DIR_BIN)/libqhblas_hvx.a $(QHL_DIR_BIN)/libqhmath.a $(QHL_DIR_BIN)/libqhcomplex.a + +WORKER_POOL_INC := $(HEXAGON_SDK_ROOT)/libs/worker_pool/inc/ $(HEXAGON_SDK_ROOT)/incs/stddef/ $(HEXAGON_SDK_ROOT)/incs/ +WORKER_POOL_LIB := $(HEXAGON_SDK_ROOT)/libs/worker_pool/prebuilt/$(PREBUILT_DIR)/libworker_pool.a + +$(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]") +# Users should note that the tools version may change between hexagon sdk versions +# Following combination of SDK and Tool version is supported +HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_BASE)/HexagonSDK/ +HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_BASE)/HexagonSDK/ +HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_BASE)/HexagonSDK/ +HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_BASE)/HexagonSDK/ +#Updated to point to latest sdk to match with libQnnHtp.so +HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_BASE)/HexagonSDK/ +HEXAGON_TOOLS_VERSION_V68 := 8.4.09 +HEXAGON_TOOLS_VERSION_V69 := 8.5.03 +HEXAGON_TOOLS_VERSION_V73 := 8.7.06 +HEXAGON_TOOLS_VERSION_V75 := 8.7.06 +#Updated to point to latest sdk to match with libQnnHtp.so +HEXAGON_TOOLS_VERSION_X86 := 8.7.06 + +ifndef ANDROID_NDK_ROOT +ifeq ($(MAKECMDGOALS),htp_aarch64) +$(error "ERROR: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") +else ifeq ($(MAKECMDGOALS),all) +$(info "WARNING: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") +endif +endif + +ifndef PACKAGE_NAME +export +PACKAGE_NAME := $(notdir $(shell pwd)) +$(info "INFO: No package name defined. 
Using current directory name: $(PACKAGE_NAME) as the package name") +endif + +WORK := build +SRC_DIR := src +OP_SRC_DIR := src/ops +OP_INCLUDE_DIR := ./include +OP_INCLUDES = #$(wildcard $(OP_INCLUDE_DIR)/*.h) user defined if any op specific headers are needed, add -I to common flags +LIBRARY_NAME := libQnn$(PACKAGE_NAME).so +SUPPORTED_TARGETS = x86_64-linux-clang hexagon-v68 hexagon-v69 hexagon-v73 hexagon-v75 aarch64-android + +INCLUDES = $(addprefix -I,$(QHL_INC_DIRS)) $(addprefix -I,$(QHL_HVX_INC_DIRS)) $(addprefix -I,$(WORKER_POOL_INC)) -I$(HEXAGON_SDK_BASE)/HexagonSDK/libs/qhl_hvx/inc/qhmath_hvx/ + + +COMMON_CXX_FLAGS = -std=c++17 -I$(QNN_INCLUDE) -fPIC -Wall -Wreorder -Wno-missing-braces -fno-builtin -Wno-unused-function +COMMON_CXX_FLAGS += -Werror -Wno-format -Wno-unused-command-line-argument -fvisibility=default -stdlib=libc++ +COMMON_CXX_FLAGS += -DQNN_API="__attribute__((visibility(\"default\")))" -D__QAIC_HEADER_EXPORT="__attribute__((visibility(\"default\")))" + +X86_LIBNATIVE_RELEASE_DIR := $(HEXAGON_SDK_ROOT_X86)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_X86)/Tools + +# Ensure hexagon sdk tool version can be retrieved +ifeq ($(wildcard $(X86_LIBNATIVE_RELEASE_DIR)/.),) +$(error "Cannot retrieve hexagon tools from: $(X86_LIBNATIVE_RELEASE_DIR). \ + \ + Please check that hexagon tools version is correct. Expected: $(HEXAGON_TOOLS_VERSION_X86)") +endif + +#Check tools for hexagon_v75 are present. +ifeq ($(MAKECMDGOALS),htp_v75) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V75)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V75 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V75)") +endif +endif + +#Check tools for hexagon_v68 are present. +ifeq ($(MAKECMDGOALS),htp_v68) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V68)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V68 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V68)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v69) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V69)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V69 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V69)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v73) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V73)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V73 is set incorrectly. 
Cannot retrieve $(HEXAGON_SDK_ROOT_V73)") +endif +endif + +endif +OP_SOURCES = $(wildcard $(OP_SRC_DIR)/*.cpp) +OTHER_SOURCES = $(wildcard $(SRC_DIR)/*.cpp) +HFILES = $(wildcard $(QNN_INCLUDE)/*.h) +HFILES += $(wildcard $(QNN_INCLUDE)/HTP/*.h) +HFILES += $(wildcard $(QNN_INCLUDE)/HTP/core/*.h) +OP_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OP_SOURCES))) +OTHER_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OTHER_SOURCES))) + +#======= Assembly ======== +OP_SOURCES_ASM_X86 += $(wildcard $(OP_SRC_DIR)/x86_asm/*.S) +OP_OBJS_ASM_X86 += $(subst /x86_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_X86)))) +OP_SOURCES_ASM_V68 += $(wildcard $(OP_SRC_DIR)/v68_asm/*.S) +OP_OBJS_ASM_V68 += $(subst /v68_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V68)))) +OP_SOURCES_ASM_V69 += $(wildcard $(OP_SRC_DIR)/v69_asm/*.S) +OP_OBJS_ASM_V69 += $(subst /v69_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V69)))) +OP_SOURCES_ASM_V73 += $(wildcard $(OP_SRC_DIR)/v73_asm/*.S) +OP_OBJS_ASM_V73 += $(subst /v73_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V73)))) +OP_SOURCES_ASM_V75 += $(wildcard $(OP_SRC_DIR)/v75_asm/*.S) +OP_OBJS_ASM_V75 += $(subst /v75_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V75)))) +OP_SOURCES_ASM_ANDROID += $(wildcard $(OP_SRC_DIR)/android_asm/*.S) +OP_OBJS_ASM_ANDROID += $(subst /android_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_ANDROID)))) + +$(info "ASSEMBLIES : $(OP_SOURCES_ASM_ANDROID), $(OP_SOURCES_ASM_V75), $(OP_SOURCES_ASM_V68)") + +all: htp_v68 htp_x86 htp_aarch64 + +#============================================================================================================ +# Setup compiler, compiler instructions and linker for x86 +X86_CXX ?= clang++ +X86_LDFLAGS:= -Wl,--whole-archive -L$(X86_LIBNATIVE_RELEASE_DIR)/libnative/lib -lnative -Wl,--no-whole-archive -lpthread +X86_C_FLAGS := -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX +X86_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(X86_C_FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof +linux_objs = +#============================================================================================================ +# Setup compiler, compiler instructions and linker for hexagon +HEXAGON_CXX_FLAGS := $(COMMON_CXX_FLAGS) -mhvx -mhvx-length=128B -mhmx -DUSE_OS_QURT -O2 -Wno-reorder -DPREPARE_DISABLED +HEXAGON_CXX_FLAGS += -I$(HEXAGON_SDK_ROOT_V75)/incs -I$(HEXAGON_SDK_ROOT_V75)/incs/stddef + +HEXAGON_CXX_FLAGS_V68 := $(HEXAGON_CXX_FLAGS) -mv68 -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/qurt -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/posix +HEXAGON_CXX_FLAGS_V69 := $(HEXAGON_CXX_FLAGS) -mv69 -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/qurt -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/posix +HEXAGON_CXX_FLAGS_V73 := $(HEXAGON_CXX_FLAGS) -mv73 -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/qurt -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/posix +HEXAGON_CXX_FLAGS_V75 := $(HEXAGON_CXX_FLAGS) -mv75 -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/qurt -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/posix + +HEXAGON_CXX_V68 := $(HEXAGON_SDK_ROOT_V68)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V68)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V69 := $(HEXAGON_SDK_ROOT_V69)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V69)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V73 := 
$(HEXAGON_SDK_ROOT_V73)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V73)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V75 := $(HEXAGON_SDK_ROOT_V75)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V75)/Tools/bin/hexagon-clang++ + +HEX_LDFLAGS = +hexagon_objs = +#============================================================================================================ +# Setup compiler, compiler instructions and linker for aarch64 +AARCH64_C__FLAGS = -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -DANDROID +AARCH64_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(AARCH64_C__FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -Wno-unused-variable -Wno-unused-parameter -Wno-missing-braces -Wno-sign-compare -Wno-unused-private-field -Wno-unused-variable -Wno-ignored-qualifiers -Wno-missing-field-initializers +ARM_CLANG_OPTS =--target=aarch64-none-linux-android21 --sysroot=$(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/sysroot -stdlib=libc++ -static-libstdc++ +AARCH64_CXX = $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin/clang++ $(ARM_CLANG_OPTS) +AARCH64_LDFLAGS = -L$(QNN_TARGET_LIB) -lQnnHtp -lQnnHtpPrepare +aarch64_objs = +#============================================================================================================ +# Setup targets and goals + +htp_x86: X86_BUILD + +htp_v68: HEXAGON_BUILD_V68 + +htp_v69: HEXAGON_BUILD_V69 + +htp_v73: HEXAGON_BUILD_V73 + +htp_v75: HEXAGON_BUILD_V75 + +htp_aarch64: AARCH64_BUILD + +AARCH64_BUILD: $(WORK)/aarch64-android/$(LIBRARY_NAME) + +HEXAGON_BUILD_V68: $(WORK)/hexagon-v68/$(LIBRARY_NAME) + +HEXAGON_BUILD_V69: $(WORK)/hexagon-v69/$(LIBRARY_NAME) + +HEXAGON_BUILD_V73: $(WORK)/hexagon-v73/$(LIBRARY_NAME) + +HEXAGON_BUILD_V75: $(WORK)/hexagon-v75/$(LIBRARY_NAME) + +X86_BUILD: $(WORK)/x86_64-linux-clang/$(LIBRARY_NAME) + + +define build_objs = +ifneq ($(filter $(2),$(SUPPORTED_TARGETS)),) +$(2)_objs += $(foreach x,$(1),$(WORK)/$(2)/$(x)) +else +$$(error "Unknown target option provided: $(2): Supported targets are: $(SUPPORTED_TARGETS)") +endif +endef + +$(eval $(call build_objs,$(OTHER_OBJS),x86_64-linux-clang)) +$(eval $(call build_objs,$(OP_OBJS),x86_64-linux-clang)) +$(eval $(call build_objs,$(OP_OBJS_ASM_X86),x86_64-linux-clang)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v68)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v68)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V68),hexagon-v68)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v69)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v69)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V69),hexagon-v69)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v73)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v73)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V73),hexagon-v73)) +$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v75)) +$(eval $(call build_objs,$(OP_OBJS),hexagon-v75)) +$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v75)) +$(eval $(call build_objs,$(OTHER_OBJS),aarch64-android)) +$(eval $(call build_objs,$(OP_OBJS),aarch64-android)) +$(eval $(call build_objs,$(OP_OBJS_ASM_ANDROID),aarch64-android)) + +# x86 +$(WORK)/x86_64-linux-clang $(WORK)/hexagon-v68 $(WORK)/hexagon-v69 $(WORK)/hexagon-v73 $(WORK)/hexagon-v75 $(WORK)/aarch64-android: + @mkdir -p $@/ops + +$(WORK)/x86_64-linux-clang/%.o: $(SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) 
-DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -DREFERENCE_OP -c $< -o $@ + +$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/x86_asm/%.S | $(WORK)/x86_64-linux-clang + $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/x86_64-linux-clang/$(LIBRARY_NAME): $(x86_64-linux-clang_objs) | $(HFILES) + $(X86_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(X86_LDFLAGS) + +# v68 +$(WORK)/hexagon-v68/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/v68_asm/%.S | $(WORK)/hexagon-v68 + $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v68/$(LIBRARY_NAME): $(hexagon-v68_objs) | $(HFILES) + $(HEXAGON_CXX_V68) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +# v69 +$(WORK)/hexagon-v69/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/v69_asm/%.S | $(WORK)/hexagon-v69 + $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v69/$(LIBRARY_NAME): $(hexagon-v69_objs) | $(HFILES) + $(HEXAGON_CXX_V69) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) + +# v73 +$(WORK)/hexagon-v73/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD $(INCLUDES) -DHVX_OP -c $< -o $@ + +$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/v73_asm/%.S | $(WORK)/hexagon-v73 + $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v73/$(LIBRARY_NAME): $(hexagon-v73_objs) | $(HFILES) + $(HEXAGON_CXX_V73) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) $(WORKER_POOL_LIB) + +#v75 +$(WORK)/hexagon-v75/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD $(INCLUDES) -DHVX_OP -c $< -o $@ + +$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/v75_asm/%.S | $(WORK)/hexagon-v75 + $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/hexagon-v75/$(LIBRARY_NAME): $(hexagon-v75_objs) | $(HFILES) + $(HEXAGON_CXX_V75) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS) $(WORKER_POOL_LIB) + +# aarch64 +$(WORK)/aarch64-android/%.o: $(SRC_DIR)/%.cpp | $(WORK)/aarch64-android + $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/aarch64-android + $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -DREFERENCE_OP -c $< -o $@ + +$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/android_asm/%.S | $(WORK)/aarch64-android + 
$(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@ + +$(WORK)/aarch64-android/$(LIBRARY_NAME): $(aarch64-android_objs) | $(HFILES) + $(AARCH64_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(AARCH64_LDFLAGS) + +clean: + -rm -rf $(WORK) + +.PHONY: all clean + diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/LLaMAPackageInterface.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/LLaMAPackageInterface.cpp new file mode 100644 index 000000000..0b264e151 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/LLaMAPackageInterface.cpp @@ -0,0 +1,400 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/QnnHtpCommon.h" +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "HTP/core/unique_types.h" +#include "QnnOpPackage.h" +#include "QnnSdkBuildId.h" + +DEFINE_UNIQ_TY() +BEGIN_PKG_OPS_OPTS_LIST() + +/** Note that the order of declarations given here defines the order in which + * ops and graph optimizations are registered to the HTP Core. Append the latest + * OpName at the bottom + */ +DECLARE_PKG_OPS_OPTS_LIST(PKG_IRoPE) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMALinear) +DECLARE_PKG_OPS_OPTS_LIST(PKG_SplitInput) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAReLU) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMASuperSiLU) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAQuantize) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAMul) +DECLARE_PKG_OPS_OPTS_LIST(PKG_KVCache) +DECLARE_PKG_OPS_OPTS_LIST(PKG_Attention) +DECLARE_PKG_OPS_OPTS_LIST(PKG_QLayerNorm) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAAdd) +DECLARE_PKG_OPS_OPTS_LIST(PKG_CausalMask) +DECLARE_PKG_OPS_OPTS_LIST(PKG_HeadMatmul) +DECLARE_PKG_OPS_OPTS_LIST(PKG_RoPE) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMADequantize) +DECLARE_PKG_OPS_OPTS_LIST(PKG_WNop) +DECLARE_PKG_OPS_OPTS_LIST(PKG_MergeOutput) +DECLARE_PKG_OPS_OPTS_LIST(PKG_RMSNorm) +DECLARE_PKG_OPS_OPTS_LIST(PKG_SiLU) + +END_PKG_OPS_OPTS_LIST() + +// op package info +static constexpr auto sg_packageName = + THIS_PKG_NAME_STR; // package name passed in as compile flag + +static std::array sg_opNames{ + {"IRoPE", "LLaMALinear", "SplitInput", "LLaMAReLU", "LLaMASuperSiLU", + "LLaMAQuantize", "LLaMAMul", "KVCache", "Attention", "QLayerNorm", + "LLaMAAdd", "CausalMask", "HeadMatmul", "RoPE", "LLaMADequantize", "WNop", + "MergeOutput", "RMSNorm", "SiLU"}}; + +static Qnn_ApiVersion_t sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT; +static QnnOpPackage_Info_t sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; + +// global data +static QnnOpPackage_GlobalInfrastructure_t sg_globalInfra = + nullptr; // global infrastructure not in use for now +static bool sg_packageInitialized = false; + +/* + * user provided logging call back function + * currently only supported on linux x86-64 and nonrpc versions + * typedef void (*QnnLog_Callback_t)(const char* fmt, + * QnnLog_Level_t level, + * uint64_t timestamp, + * va_list args); + * usage: if(sg_logInitialized && level <= sg_maxLogLevel) + * sg_logCallback(fmt, level, timestamp, args); + * + * for cross rpc versions, skel side user provided logging call back function + * can be defined as part of op packages. 
maximal log level sg_maxLogLevel + * can be set by Qnn_ErrorHandle_t LLaMAPackageLogSetLevel(QnnLog_Level_t + * maxLogLevel) + */ +/* + * for alternative logging method provided by HTP core, please refer to log.h + */ +static QnnLog_Callback_t sg_logCallback = + nullptr; // user provided call back function pointer for logging +static QnnLog_Level_t sg_maxLogLevel = + (QnnLog_Level_t)0; // maximal log level used in user provided logging +static bool sg_logInitialized = + false; // tracks whether user provided logging method has been initialized + +/* + * op initialization + * needs to be global in the package + * one initialization per package before any op definitions + * syntax: INIT_PACKAGE_OP_DEF() + */ +INIT_PACKAGE_OP_DEF() + +/* + * optimization initialization + * needs to be global in the package + * one initialization per package before any optimization definitions + * syntax: INIT_PACKAGE_OPTIMIZATION_DEF() + */ +INIT_PACKAGE_OPTIMIZATION_DEF() + +/* + * op parameter order initialization + * needs to be global in the package + * one initialization per package before any op parameter order definitions + * syntax: INIT_PACKAGE_PARAM_ORDER_DEF() + */ +INIT_PACKAGE_PARAM_ORDER_DEF() + +/* + * axis parameter name list + * optional + * needs to be global in the package + * one list per package + * for listing axis parameter names passed into Qnn_AddNode API + * HTP backend auto-adjusts values in axis parameters based on HTP backfilling + * note: HTP backend backfills tensor dimensions to 4 dimensions + * syntax: LIST_PACKAGE_AXIS_PARAMS(...) + * e.g. LIST_PACKAGE_AXIS_PARAMS("Axis", "AXIS", "axis") + */ +// LIST_PACKAGE_AXIS_PARAMS() + +/* + * per-channel quantized op name list + * optional + * needs to be global in the package + * one list per package + * for listing op names which support per-channel quantization + * per-axis quantization info of an op is embeded in axisScaleOffsetEncoding + * inside Qnn_Tensor_t types + * HTP backend only supports per-channel scale ops + * i.e. along last dimension, offset is always zero + * if an op name is marked as having per-channel scale support, and in + * QNN_AddNode, at least one input, parameter, or output has + * QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET type: + * then: + * HTP backend will pass to op implementation function the following: + * output(s), input(s), parameter(s), + * outputPerChannelScale(s), inputPerChannelScale(s), + * paramPerChannelScale(s) + * + * optimization rules can be used to remove extra perChannelScale tensors + * + * syntax: LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) + * e.g. LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(sg_op1Name, sg_op2Name) + */ + +// LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + +/* + * Declare and define the special intialize function for HTP Backend to load + */ +INIT_PKG_CORE_INIT_FUNC() + +/* op package API's */ + +Qnn_ErrorHandle_t +LLaMAPackageInit(QnnOpPackage_GlobalInfrastructure_t infrastructure) { + if (sg_packageInitialized) + return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; + + /* + * op parameter order registration + * registers all defined op parameter orders in the package + * syntax: REGISTER_PACKAGE_PARAM_ORDERS() + */ + REGISTER_PACKAGE_PARAM_ORDERS() + + /* + * op axis parameter name registration + * registers all axis parameter names in the package + * used with LIST_PACKAGE_AXIS_PARAMS(...) 
+ * syntax: REGISTER_PACKAGE_AXIS_PARAMS() + */ + REGISTER_PACKAGE_AXIS_PARAMS() + + /* + * per-channel scale op name registration + * registers all per-channel scale op names in the package + * used with LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) + * syntax: REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + */ + REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + + sg_globalInfra = infrastructure; + sg_packageInitialized = true; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t LLaMAPackageGetInfo(const QnnOpPackage_Info_t **info) { + if (!sg_packageInitialized) + return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + if (!info) + return QNN_OP_PACKAGE_ERROR_INVALID_INFO; + + sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; + sg_packageInfo.packageName = sg_packageName; + sg_packageInfo.operationNames = sg_opNames.data(); + sg_packageInfo.numOperations = sg_opNames.size(); + sg_packageInfo.sdkBuildId = QNN_SDK_BUILD_ID; + sg_packageInfo.sdkApiVersion = &sg_sdkApiVersion; + + *info = &sg_packageInfo; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t LLaMAPackageLogInitialize(QnnLog_Callback_t callback, + QnnLog_Level_t maxLogLevel) { + if (sg_logInitialized) + return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; + if (!callback) + return QNN_LOG_ERROR_INVALID_ARGUMENT; + if (maxLogLevel < QNN_LOG_LEVEL_ERROR) + return QNN_LOG_ERROR_INVALID_ARGUMENT; + sg_logCallback = callback; + sg_maxLogLevel = maxLogLevel; + sg_logInitialized = true; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t LLaMAPackageLogSetLevel(QnnLog_Level_t maxLogLevel) { + if (maxLogLevel < QNN_LOG_LEVEL_ERROR) + return QNN_LOG_ERROR_INVALID_ARGUMENT; + sg_maxLogLevel = maxLogLevel; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t LLaMAPackageLogTerminate() { + if (!sg_logInitialized) + return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + sg_logCallback = nullptr; + sg_maxLogLevel = (QnnLog_Level_t)0; + sg_logInitialized = false; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t LLaMAPackageValidateOpConfig(Qnn_OpConfig_t opConfig) { + if (std::string(sg_packageName) != opConfig.v1.packageName) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* auto-generated validation code below + * Check if op config type matches any registered ops + * If a match is found, check number of inputs, outputs and params + */ + if (std::string(opConfig.v1.typeName) == "IRoPE") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 4 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMALinear") { + if (opConfig.v1.numOfParams != 4 || opConfig.v1.numOfInputs != 3 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "SplitInput") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 2) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMAReLU") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMASuperSiLU") { + if (opConfig.v1.numOfParams != 3 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMAQuantize") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 1 
|| + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMAMul") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "KVCache") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "Attention") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 5 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "QLayerNorm") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 3 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMAAdd") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "CausalMask") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "HeadMatmul") { + if (opConfig.v1.numOfParams != 2 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "RoPE") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 4 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "LLaMADequantize") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 1 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "WNop") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 2) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "MergeOutput") { + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 4 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "RMSNorm") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else if (std::string(opConfig.v1.typeName) == "SiLU") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || + opConfig.v1.numOfOutputs != 1) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* + * additional validation code here + * */ + + return QNN_SUCCESS; +} + +/* The following three functions in this comment are not called by HTP backend + *for now, no auto-generated implementations are created. Users should see + *example for full function signatures. 
(version 1.3.0) Qnn_ErrorHandle_t + *LLaMAPackageCreateKernels (QnnOpPackage_GraphInfrastructure_t + * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_Kernel_t** + *kernels, uint32_t* numKernels) (version 1.3.0) Qnn_ErrorHandle_t + *LLaMAPackageFreeKernels (QnnOpPackage_Kernel_t* kernels) + * + * (version 1.4.0) Qnn_ErrorHandle_t LLaMAPackageCreateOpImpl + *(QnnOpPackage_GraphInfrastructure_t graphInfrastructure, QnnOpPackage_Node_t + *node, QnnOpPackage_OpImpl_t* opImpl) (version 1.4.0) Qnn_ErrorHandle_t + *LLaMAPackageFreeOpImpl (QnnOpPackage_OpImpl_t opImpl) + */ + +Qnn_ErrorHandle_t LLaMAPackageTerminate() { + if (!sg_packageInitialized) + return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + + sg_globalInfra = nullptr; + sg_packageInitialized = false; + return QNN_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif + +/* latest version */ +Qnn_ErrorHandle_t +LLaMAPackageInterfaceProvider(QnnOpPackage_Interface_t *interface) { + if (!interface) + return QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT; + interface->interfaceVersion = {1, 4, 0}; + interface->v1_4.init = LLaMAPackageInit; + interface->v1_4.terminate = LLaMAPackageTerminate; + interface->v1_4.getInfo = LLaMAPackageGetInfo; + interface->v1_4.validateOpConfig = LLaMAPackageValidateOpConfig; + interface->v1_4.createOpImpl = nullptr; + interface->v1_4.freeOpImpl = nullptr; + interface->v1_4.logInitialize = LLaMAPackageLogInitialize; + interface->v1_4.logSetLevel = LLaMAPackageLogSetLevel; + interface->v1_4.logTerminate = LLaMAPackageLogTerminate; + return QNN_SUCCESS; +} + +#ifdef __cplusplus +} +#endif diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/CausalMask.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/CausalMask.cpp new file mode 100644 index 000000000..3b0d84f5d --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/CausalMask.cpp @@ -0,0 +1,146 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +#define MASK_INFINITY 1e15 + +BEGIN_PKG_OP_DEFINITION(PKG_CausalMask); + +// op execute function declarations +template +GraphStatus causalmaskImpl(TensorType &out_0, const TensorType &in_0); + +// forward declaration of sample cost function +static float causalmaskCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((causalmaskImpl), "CausalMask") + */ +DEF_PACKAGE_OP((causalmaskImpl), "CausalMask") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((causalmaskImpl), + * "CausalMask", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((causalmaskImpl), + * "CausalMask", causalmaskCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +template +GraphStatus causalmaskImpl(TensorType &out_0, const TensorType &in_0) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
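+   *
+   * In the reference implementation below, out_0 takes in_0's dims. When
+   * w_in > 1, every element with d > w has MASK_INFINITY (1e15) subtracted,
+   * pushing the strictly upper-triangular part of each (w, d) slice to a
+   * large negative value; when w_in == 1 the input is copied through
+   * unchanged (note the memcpy assumes 4-byte float32 elements).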
+ */ + out_0.set_dims(in_0); + + int old_dim = 0; + + // NHSD + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + // S > 1 => mask + if (w_in > 1) { + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // CausalMask + for (Idx d = 0; d < d_in; d++) { + + float in_value = in_0(b, h, w, d); + + if (d > w + old_dim) + out_0(b, h, w, d) = in_value - MASK_INFINITY; + else + out_0(b, h, w, d) = in_value; + } + } + } + } + } else { + auto in_ptr = in_0.raw_data_const(); + auto out_ptr = out_0.raw_data(); + memcpy(out_ptr, in_ptr, b_in * h_in * w_in * d_in * 4); + } + + return GraphStatus::Success; +} + +__attribute__((unused)) static float causalmaskCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_CausalMask); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/HeadMatmul.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/HeadMatmul.cpp new file mode 100644 index 000000000..eeb83c00f --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/HeadMatmul.cpp @@ -0,0 +1,164 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_HeadMatmul); + +static Qnn_Scalar_t sg_opDefaultTranspose_In0Scalar = { + .dataType = Qnn_DataType_t::QNN_DATATYPE_BOOL_8, .bool8Value = false}; +static Qnn_Param_t sg_opDefaultTranspose_In0 = { + .paramType = QNN_PARAMTYPE_SCALAR, + .scalarParam = sg_opDefaultTranspose_In0Scalar}; +static Qnn_Scalar_t sg_opDefaultTranspose_In1Scalar = { + .dataType = Qnn_DataType_t::QNN_DATATYPE_BOOL_8, .bool8Value = false}; +static Qnn_Param_t sg_opDefaultTranspose_In1 = { + .paramType = QNN_PARAMTYPE_SCALAR, + .scalarParam = sg_opDefaultTranspose_In1Scalar}; + +// op execute function declarations +template +GraphStatus headmatmulImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, + const QuantUint16Tensor &transpose_in0, + const QuantUint16Tensor &transpose_in1); + +// forward declaration of sample cost function +static float headmatmulCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((headmatmulImpl), "HeadMatmul") + */ +DEF_PACKAGE_OP((headmatmulImpl), "HeadMatmul") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((headmatmulImpl), + * "HeadMatmul", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((headmatmulImpl), + * "HeadMatmul", headmatmulCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ +DEF_PACKAGE_PARAM_ORDER("HeadMatmul", "transpose_in0", false, + &sg_opDefaultTranspose_In0, "transpose_in1", false, + &sg_opDefaultTranspose_In1) + +/* execute functions for ops */ + +template +GraphStatus headmatmulImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, + const QuantUint16Tensor &transpose_in0, + const QuantUint16Tensor &transpose_in1) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + + auto transpose_in0_ = transpose_in0(0, 0, 0, 0); + auto transpose_in1_ = transpose_in1(0, 0, 0, 0); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + auto [b_in2, h_in2, w_in2, d_in2] = in_1.dims(); + + if (transpose_in0_ && transpose_in1_) { + + // Q KT head matmul + const size_t dims[] = {b_in, w_in, h_in, h_in}; + out_0.set_dims(dims); + debuglog("HeadMatmul execute... dims=(%zdx%zdx%zdx%zd)", out_0.dim(0), + out_0.dim(1), out_0.dim(2), out_0.dim(3)); + + } else if (transpose_in0_) { + + } else if (transpose_in1_) { + + // QKT V head matmul + const size_t dims[] = {b_in, w_in, h_in, d_in2}; + out_0.set_dims(dims); + debuglog("HeadMatmul execute... dims=(%zdx%zdx%zdx%zd)", out_0.dim(0), + out_0.dim(1), out_0.dim(2), out_0.dim(3)); + + // Todo out matrix needs transpose, we directly calculate the final + // dimensions. 
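+    // Note: only the two transpose combinations handled above (both flags
+    // set, and transpose_in1 only) configure out_0's dims; the
+    // transpose_in0-only and no-transpose branches below are currently empty.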
+ + } else { + } + + return GraphStatus::Success; +} + +__attribute__((unused)) static float headmatmulCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_HeadMatmul); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/KVCache.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/KVCache.cpp new file mode 100644 index 000000000..6407a0e39 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/KVCache.cpp @@ -0,0 +1,300 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_KVCache); + +// op execute function declarations +template +GraphStatus kvcacheImpl(TensorType &out_0, const TensorType &in_0, + const TensorType1 &seq_pos, const Tensor &hidden_dim); + +// forward declaration of sample cost function +static float kvcacheCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((kvcacheImpl), "KVCache") + */ +DEF_PACKAGE_OP((kvcacheImpl), "KVCache") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((kvcacheImpl), "KVCache", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((kvcacheImpl), "KVCache", kvcacheCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) 
+ * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ +DEF_PACKAGE_PARAM_ORDER("KVCache", "hidden_dim", true, nullptr) + +/* execute functions for ops */ + +// #ifndef REFERENCE_OP + +// #include "qhmath_hvx.h" +// #include "hvx_internal.h" +// #include +// #include + +// #define BLOCK_SIZE (8*1024/VLEN) /* vector chunks */ +// #define L2FETCH_AHEAD (BLOCK_SIZE) +// #define ONE 0x3F800000 +// #define M_ONE 0xAF800000 + +// int32_t hvx_memcpy_af(float *restrict input, float *restrict output, uint32_t +// size) +// { +// HVX_Vector *input_v_ptr; +// HVX_UVector *output_v_ptr; +// HVX_Vector slinep; +// HVX_Vector slinec; +// HVX_Vector sline; +// int32_t block, l2fetch_block; +// int32_t leftover = size & 31; +// int32_t vectors_in_rounddown = size / 32; +// int32_t leftover_size = leftover * sizeof(float); + +// /* Check input arguments. Return error status if some argument has +// invalid value */ if ((input == 0) || (output == 0) || (size == 0)) +// { +// return -1; +// } + +// input_v_ptr = (HVX_Vector *) input; +// output_v_ptr = (HVX_UVector *) output; + +// /* +// * If input data is not aligned to HVX vector size, compose aligned +// vectors +// * from data loaded in slinep and slinec +// */ +// slinep = *input_v_ptr++; + +// /* +// * Handle number of whole vectors in input data. +// * Don't process last vector in order to avoid out-of-boundary load. +// */ +// for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) +// { +// block = Q6_R_min_RR(i, BLOCK_SIZE); +// l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + +// if (l2fetch_block > 0) +// { +// l2fetch(input_v_ptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, +// 0); +// } + +// /* Process one vector at a time */ +// for (int32_t j = 0; j < block; ++j) +// { +// slinec = *input_v_ptr++; + +// /* Compose vector of input data from slinec and slinep */ +// sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + +// /* Store results to the output buffer and convert from qf32 to sf +// */ +// *((HVX_UVector *)(output_v_ptr++)) = sline; + +// /* Prepare slinep for next iteration */ +// slinep = slinec; +// } +// } + +// /* Handle last whole vector from input data */ +// if (vectors_in_rounddown > 0) +// { +// slinec = is_aligned(input_v_ptr, VLEN) && leftover == 0 ? 
slinep : +// *input_v_ptr++; sline = Q6_V_valign_VVR(slinec, slinep, (size_t) +// input); + +// /* Convert from qf32 to sf, store output and go to handle leftover */ +// *((HVX_UVector *)(output_v_ptr++)) = sline; + +// slinep = slinec; +// } + +// /* Handle leftover elements */ +// if (leftover > 0) +// { +// slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) +// ? slinep +// : *input_v_ptr++); + +// sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + +// /* Store output */ +// vstu_variable(output_v_ptr, leftover_size, sline); +// } + +// return 0; +// } + +// template +// GraphStatus kvcacheImpl(TensorType& out_0, +// const TensorType& in_0, +// const TensorType1 &seq_pos, +// const Tensor& hidden_dim) + +// { +// /* +// * add code here +// * */ +// /* +// * To have good performance and stability, it is required to avoid heap +// memory +// * allocation in this function. The heap memory allocation includes but not +// * limited to calling malloc, operator new, constructing STL container +// objects +// * like std::vector with default allocator, and adding items like calling +// * std::vector::push_back to STL container objects with default allocator. +// * +// * Please check in SDK documentation for more information. +// */ + +// out_0.set_dims(in_0); +// auto [b_in, h_in, w_in, d_in] = in_0.dims(); + +// uint32_t seq_pos_ = seq_pos(0,0,0,0); +// // uint32_t hidden_dim_ = hidden_dim(0,0,0,0); + +// // // const size_t dims[] = {b_in, h_in, seq_pos_+1, hidden_dim_}; +// // // out_0.set_dims(dims); + +// // NSHD + +// auto in_ptr = (float*)in_0.raw_data_const(); +// auto out_ptr = (float*)out_0.raw_data(); + +// out_ptr += seq_pos_ * h_in * w_in * d_in; + +// hvx_memcpy_af(out_ptr, in_ptr, h_in * w_in * d_in); + +// return GraphStatus::Success; +// } + +// #else + +template +GraphStatus kvcacheImpl(TensorType &out_0, const TensorType &in_0, + const TensorType1 &seq_pos, const Tensor &hidden_dim) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
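+   *
+   * The implementation below grows the cache along dimension 1: out_0 is
+   * sized to {b, h + seq_pos, w, d} and the incoming h_in rows are copied
+   * starting at element offset seq_pos * w * d, with the element size chosen
+   * from the tensor dtype (QUInt8 -> 1 byte, Float16 -> 2, Float32 -> 4).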
+ */ + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + uint32_t seq_pos_ = seq_pos(0, 0, 0, 0); + const size_t dims[] = {b_in, h_in + seq_pos_, w_in, d_in}; + + out_0.set_dims(dims); + + // uint32_t hidden_dim_ = hidden_dim(0,0,0,0); + + // // const size_t dims[] = {b_in, h_in, seq_pos_+1, hidden_dim_}; + // // out_0.set_dims(dims); + + // NSHD + + DType dtype = in_0.get_dtype(); + + const uint8_t *in_ptr = (uint8_t *)in_0.raw_data_const(); + uint8_t *out_ptr = (uint8_t *)out_0.raw_data(); + + if (dtype == DType::QUInt8) { + + out_ptr += seq_pos_ * w_in * d_in; + memcpy(out_ptr, in_ptr, h_in * w_in * d_in * sizeof(uint8_t)); + + } else if (dtype == DType::Float16) { + + out_ptr += seq_pos_ * w_in * d_in * sizeof(float) / 2; + memcpy(out_ptr, in_ptr, h_in * w_in * d_in * sizeof(float) / 2); + } else if (dtype == DType::Float32) { + + out_ptr += seq_pos_ * w_in * d_in * sizeof(float); + memcpy(out_ptr, in_ptr, h_in * w_in * d_in * sizeof(float)); + } + + return GraphStatus::Success; +} + +// #endif + +__attribute__((unused)) static float kvcacheCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_KVCache); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAAdd.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAAdd.cpp new file mode 100644 index 000000000..ee8491416 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAAdd.cpp @@ -0,0 +1,254 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_LLaMAAdd); + +// op execute function declarations +template +GraphStatus llamaaddImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1); + +// forward declaration of sample cost function +static float llamaaddCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((llamaaddImpl), "LLaMAAdd") + */ +DEF_PACKAGE_OP((llamaaddImpl), "LLaMAAdd") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((llamaaddImpl), + * "LLaMAAdd", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((llamaaddImpl), + * "LLaMAAdd", llamaaddCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +int32_t hvx_add_af(float *restrict input, float *restrict input2, + float *restrict output, uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)input2; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + + // HVX_Vector v128 = Q6_Vb_vsplat_R(0x80808080u); + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + sline2p = *iptr2++; + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + // Our add consider uint8->int8 bugs from QNN. + // sline2 = Q6_Vb_vsub_VbVb(sline2, v128); + *optr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sline1, sline2)); + + sline1p = sline1c; + sline2p = sline2c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? 
sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + // sline2 = Q6_Vb_vsub_VbVb(sline2, v128); + *optr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sline1, sline2)); + } + + // Handle leftover elements. + if (leftover_size > 0) { + sline1c = (is_in_one_chunk(iptr, leftover_size, VLEN) ? sline1p : *iptr++); + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = + (is_in_one_chunk(iptr2, leftover_size, VLEN) ? sline2p : *iptr2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + // sline2 = Q6_Vb_vsub_VbVb(sline2, v128); + vstu_variable(optr, leftover_size, + Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sline1, sline2))); + } + + return 0; +} + +template +GraphStatus llamaaddImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + + out_0.set_dims(in_0); + + auto in_ptr = (float *)in_0.raw_data_const(); + auto in2_ptr = (float *)in_1.raw_data_const(); + auto out_ptr = (float *)out_0.raw_data(); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + size_t size = b_in * h_in * w_in * d_in; + + hvx_add_af(in_ptr, in2_ptr, out_ptr, size); + + return GraphStatus::Success; +} + +#else + +template +GraphStatus llamaaddImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
+ */ + out_0.set_dims(in_0); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // mul + for (Idx d = 0; d < d_in; d++) { + float inval = in_0(b, h, w, d); + float inval2 = in_1(b, h, w, d); + float outval = inval + inval2; + + out_0(b, h, w, d) = outval; + } + } + } + } + + return GraphStatus::Success; +} + +#endif + +__attribute__((unused)) static float llamaaddCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_LLaMAAdd); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMALinear.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMALinear.cpp new file mode 100644 index 000000000..ba9ba5d95 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMALinear.cpp @@ -0,0 +1,209 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_LLaMALinear); + +// op execute function declarations +template +GraphStatus llamalinearImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, const TensorType &in_2, + const PlainFloatTensor &in_scale, + const PlainFloatTensor &weight_scale, + const PlainFloatTensor &bias_scale, + const PlainFloatTensor &output_scale); + +// forward declaration of sample cost function +static float llamalinearCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((llamalinearImpl), "LLaMALinear") + */ +DEF_PACKAGE_OP((llamalinearImpl), "LLaMALinear") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((llamalinearImpl), + * "LLaMALinear", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((llamalinearImpl), + * "LLaMALinear", llamalinearCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ +DEF_PACKAGE_PARAM_ORDER("LLaMALinear", "in_scale", true, nullptr, + "weight_scale", true, nullptr, "bias_scale", true, + nullptr, "output_scale", true, nullptr) + +/* execute functions for ops */ + +float Round(float num) { + float floor_num = floor(num); + float ceil_num = ceil(num); + + if (num - floor_num < ceil_num - num) { + return floor_num; + } else { + return ceil_num; + } +} + +template +GraphStatus llamalinearImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, const TensorType &in_2, + const PlainFloatTensor &in_scale, + const PlainFloatTensor &weight_scale, + const PlainFloatTensor &bias_scale, + const PlainFloatTensor &output_scale) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
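+   *
+   * As a minimal illustrative sketch of the pattern this file follows (the
+   * scratch size below is hypothetical, not part of the generated template):
+   * prefer fixed-size stack storage and the tensors' own raw buffers over
+   * heap-backed containers, e.g.
+   *
+   *   float scratch[128];                               // stack, no allocation
+   *   auto *src = (const uint8_t *)in_0.raw_data_const();
+   *   auto *dst = (int8_t *)out_0.raw_data();           // write in place
+   *
+   * rather than constructing std::vector<float> scratch(n); in the hot path.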
+   */
+  // Assume the input tensor is 4-dimensional, in NHWC format
+  int batch_size = in_0.dims()[0];
+  int height = in_0.dims()[1];
+  int width = in_0.dims()[2];
+  int in_features = in_0.dims()[3];  // number of input channels
+  int out_features = in_1.dims()[3]; // number of output features (i.e. output channels)
+
+  // Check that the input tensor shapes match
+  if (in_1.dims()[0] != 1 || in_1.dims()[1] != 1 ||
+      in_1.dims()[2] != in_features || in_2.dims()[3] != out_features) {
+    return GraphStatus::ErrorFatal;
+  }
+
+  // Fetch the quantization scales
+  float w_scale = weight_scale(0, 0, 0, 0);
+  float i_scale = in_scale(0, 0, 0, 0);
+  float b_scale = bias_scale(0, 0, 0, 0);
+  float o_scale = output_scale(0, 0, 0, 0);
+
+  // Initialize the output tensor
+
+  size_t dims[] = {static_cast<size_t>(batch_size), static_cast<size_t>(height),
+                   static_cast<size_t>(width),
+                   static_cast<size_t>(out_features)};
+  out_0.set_dims(dims);
+
+  // only support float bias now.
+  auto in0_ptr = (uint8_t *)in_0.raw_data_const();
+  auto in1_ptr = (uint8_t *)in_1.raw_data_const();
+  auto in2_ptr = (uint8_t *)in_2.raw_data_const();
+  auto out_ptr = (int8_t *)out_0.raw_data();
+
+  // Perform the quantized Linear multiplication
+  for (int b = 0; b < batch_size; ++b) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        for (int n = 0; n < out_features; ++n) {
+          float acc = 0;
+          for (int k = 0; k < in_features; ++k) {
+            int in_index = b * height * width * in_features +
+                           h * width * in_features + w * in_features + k;
+            int weight_index = k * out_features + n;
+            acc +=
+              ((static_cast<float>(in0_ptr[in_index]) - 128) * i_scale) *
+              ((static_cast<float>(in1_ptr[weight_index]) - 128) * w_scale);
+          }
+          // Add the dequantized bias
+          float result = acc;
+          result += (static_cast<float>(in2_ptr[n]) - 128) * b_scale;
+          // Clamp the result to the quantized 8-bit range
+          int out_index = b * height * width * out_features +
+                          h * width * out_features + w * out_features + n;
+
+          result = Round(result / o_scale);
+
+          long v = lroundf(result);
+
+          if (v > 127)
+            v = 127;
+
+          if (v < -128)
+            v = -128;
+
+          if (out_0.get_dtype() == DType::QUInt8)
+            v += 128;
+
+          out_ptr[out_index] = static_cast<int8_t>(v);
+        }
+      }
+    }
+  }
+
+  return GraphStatus::Success;
+}
+
+__attribute__((unused)) static float llamalinearCostFunc(const Op *op) {
+  /*
+   * add code here
+   * */
+
+  float cost = 0.0; // add cost computation here
+  return cost;
+}
+
+/* At the bottom of the op file, call END_PKG_OP_DEFINITION(<name>),
+   where <name> is as BEGIN_PKG_OP_DEFINITION
+*/
+END_PKG_OP_DEFINITION(PKG_LLaMALinear);
diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAMul.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAMul.cpp
new file mode 100644
index 000000000..3deef796c
--- /dev/null
+++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAMul.cpp
@@ -0,0 +1,344 @@
+//==============================================================================
+// Auto Generated Code for LLaMAPackage
+//==============================================================================
+
+#include "HTP/core/constraints.h"
+#include "HTP/core/op_package_feature_support.h"
+#include "HTP/core/op_register_ext.h"
+#include "HTP/core/optimize.h"
+#include "HTP/core/simple_reg.h"
+#include "QnnOpPackage.h"
+
+BEGIN_PKG_OP_DEFINITION(PKG_LLaMAMul);
+
+// op execute function declarations
+template <typename TensorType>
+GraphStatus llamamulImpl(TensorType &out_0, const TensorType &in_0,
+                         const TensorType &in_1);
+
+// forward declaration of sample cost function
+static float llamamulCostFunc(const Op *op);
+
+/*
+ * method 1 for defining op, using default cost value (i.e. GLACIAL) and default
+ * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g.
+ * DEF_PACKAGE_OP((llamamulImpl), "LLaMAMul") + */ +DEF_PACKAGE_OP((llamamulImpl), "LLaMAMul") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((llamamulImpl), + * "LLaMAMul", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((llamamulImpl), + * "LLaMAMul", llamamulCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +int32_t hvx_mul_af(float *restrict input, float *restrict input2, + float *restrict output, uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)input2; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + sline2p = *iptr2++; + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if 
(l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + *optr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline1, sline2)); + + sline1p = sline1c; + sline2p = sline2c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + *optr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline1, sline2)); + } + + // Handle leftover elements. + if (leftover_size > 0) { + sline1c = (is_in_one_chunk(iptr, leftover_size, VLEN) ? sline1p : *iptr++); + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = + (is_in_one_chunk(iptr2, leftover_size, VLEN) ? sline2p : *iptr2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + vstu_variable(optr, leftover_size, + Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline1, sline2))); + } + + return 0; +} + +int32_t hvx_mul_ahf(__fp16 *restrict input, __fp16 *restrict input2, + __fp16 *restrict output, uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)input2; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 64; + int32_t leftover_size = leftover * sizeof(__fp16); + + sline1p = *iptr++; + sline2p = *iptr2++; + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + *optr++ = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(sline1, sline2)); + + sline1p = sline1c; + sline2p = sline2c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + *optr++ = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(sline1, sline2)); + } + + // Handle leftover elements. + if (leftover_size > 0) { + sline1c = (is_in_one_chunk(iptr, leftover_size, VLEN) ? sline1p : *iptr++); + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = + (is_in_one_chunk(iptr2, leftover_size, VLEN) ? 
sline2p : *iptr2++); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + vstu_variable(optr, leftover_size, + Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(sline1, sline2))); + } + + return 0; +} + +template +GraphStatus llamamulImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + out_0.set_dims(in_0); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + size_t size = b_in * h_in * w_in * d_in; + + DType dtype = in_0.get_dtype(); + + if (dtype == DType::Float16) { + auto in_ptr = (__fp16 *)in_0.raw_data_const(); + auto in2_ptr = (__fp16 *)in_1.raw_data_const(); + auto out_ptr = (__fp16 *)out_0.raw_data(); + + hvx_mul_ahf(in_ptr, in2_ptr, out_ptr, size); + + } else { + auto in_ptr = (float *)in_0.raw_data_const(); + auto in2_ptr = (float *)in_1.raw_data_const(); + auto out_ptr = (float *)out_0.raw_data(); + + hvx_mul_af(in_ptr, in2_ptr, out_ptr, size); + } + + return GraphStatus::Success; +} + +#else + +template +GraphStatus llamamulImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
+ */ + out_0.set_dims(in_0); + + DType dtype = in_0.get_dtype(); + + auto out_ptr = (__fp16 *)out_0.raw_data(); + auto in_ptr = (__fp16 *)in_0.raw_data_const(); + auto in_ptr2 = (__fp16 *)in_1.raw_data_const(); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // mul + for (Idx d = 0; d < d_in; d++) { + + if (dtype == DType::Float16) { + + __fp16 inval = *in_ptr++; + __fp16 inval2 = *in_ptr2++; + __fp16 outval = inval * inval2; + + *out_ptr++ = outval; + } + + if (dtype == DType::Float32) { + float inval = in_0(b, h, w, d); + float inval2 = in_1(b, h, w, d); + float outval = inval * inval2; + + out_0(b, h, w, d) = outval; + } + } + } + } + } + + return GraphStatus::Success; +} + +#endif + +__attribute__((unused)) static float llamamulCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_LLaMAMul); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAReLU.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAReLU.cpp new file mode 100644 index 000000000..9e7d48465 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMAReLU.cpp @@ -0,0 +1,297 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_LLaMAReLU); + +// op execute function declarations +template +GraphStatus llamareluImpl(TensorType &out_0, const TensorType &in_0); + +// forward declaration of sample cost function +static float llamareluCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((llamareluImpl), "LLaMAReLU") + */ +DEF_PACKAGE_OP((llamareluImpl), "LLaMAReLU") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((llamareluImpl), + * "LLaMAReLU", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((llamareluImpl), + * "LLaMAReLU", llamareluCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +// #ifndef REFERENCE_OP + +// #include "qhmath_hvx.h" +// #include "hvx_internal.h" +// #include +// #include + +// #define BLOCK_SIZE (8*1024/VLEN) /* vector chunks */ +// #define L2FETCH_AHEAD (BLOCK_SIZE) +// #define ONE 0x3F800000 +// #define M_ONE 0xAF800000 + +// int32_t hvx_relu_au8(uint8_t *restrict input, uint8_t *restrict output, +// uint32_t size) +// { +// HVX_Vector *input_v_ptr; +// HVX_UVector *output_v_ptr; +// HVX_Vector slinep; +// HVX_Vector slinec; +// HVX_Vector sline; +// int32_t block, l2fetch_block; +// int32_t leftover = size & 128; +// int32_t vectors_in_rounddown = size / 128; +// int32_t leftover_size = leftover * sizeof(uint8_t); + +// /* Check input arguments. Return error status if some argument has +// invalid value */ if ((input == 0) || (output == 0) || (size == 0)) +// { +// return -1; +// } + +// input_v_ptr = (HVX_Vector *) input; +// output_v_ptr = (HVX_UVector *) output; + +// HVX_Vector vO = Q6_Vb_vsplat_R(0x80808080u); + +// /* +// * If input data is not aligned to HVX vector size, compose aligned +// vectors +// * from data loaded in slinep and slinec +// */ +// slinep = *input_v_ptr++; + +// /* +// * Handle number of whole vectors in input data. +// * Don't process last vector in order to avoid out-of-boundary load. 
+// */ +// for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) +// { +// block = Q6_R_min_RR(i, BLOCK_SIZE); +// l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + +// if (l2fetch_block > 0) +// { +// l2fetch(input_v_ptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, +// 0); +// } + +// /* Process one vector at a time */ +// for (int32_t j = 0; j < block; ++j) +// { +// slinec = *input_v_ptr++; + +// /* Compose vector of input data from slinec and slinep */ +// sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + +// /* Store results to the output buffer and convert from qf32 to sf +// */ +// *((HVX_UVector *)(output_v_ptr++)) = Q6_Vub_vmax_VubVub(vO, +// sline); + +// /* Prepare slinep for next iteration */ +// slinep = slinec; +// } +// } + +// /* Handle last whole vector from input data */ +// if (vectors_in_rounddown > 0) +// { +// slinec = is_aligned(input_v_ptr, VLEN) && leftover == 0 ? slinep : +// *input_v_ptr++; sline = Q6_V_valign_VVR(slinec, slinep, (size_t) +// input); + +// /* Convert from qf32 to sf, store output and go to handle leftover */ +// *((HVX_UVector *)(output_v_ptr++)) = Q6_Vub_vmax_VubVub(vO, sline); + +// slinep = slinec; +// } + +// /* Handle leftover elements */ +// if (leftover > 0) +// { +// slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) +// ? slinep +// : *input_v_ptr++); + +// sline = Q6_V_valign_VVR(slinec, slinep, (size_t) input); + +// /* Store output */ +// vstu_variable(output_v_ptr, leftover_size, Q6_Vub_vmax_VubVub(vO, +// sline)); +// } + +// return 0; +// } + +// template +// GraphStatus llamareluImpl(TensorType& out_0, +// const TensorType& in_0) + +// { +// /* +// * add code here +// * */ +// /* +// * To have good performance and stability, it is required to avoid heap +// memory +// * allocation in this function. The heap memory allocation includes but not +// * limited to calling malloc, operator new, constructing STL container +// objects +// * like std::vector with default allocator, and adding items like calling +// * std::vector::push_back to STL container objects with default allocator. +// * +// * Please check in SDK documentation for more information. 
+// */ + +// out_0.set_dims(in_0); + +// const auto [bIn, hIn, wIn, dIn] = in_0.dims(); + +// auto in_ptr = (uint8_t*)in_0.raw_data_const(); +// auto out_ptr = (uint8_t*)out_0.raw_data(); + +// hvx_relu_au8(out_ptr, in_ptr, bIn * hIn * wIn * dIn * sizeof (uint8_t)); + +// return GraphStatus::Success; +// } +// #else +template +GraphStatus llamareluImpl(TensorType &out_0, const TensorType &in_0) + +{ + out_0.set_dims(in_0); + // NHWC + + if (in_0.get_dtype() == DType::QUInt8) { + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // SiLU + for (Idx d = 0; d < d_in; d++) { + uint8_t inval = in_0(b, h, w, d); + if (inval < 0) + inval = 0; + + out_0(b, h, w, d) = inval; + } + } + } + } + } else if (in_0.get_dtype() == DType::Float16) { + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + auto out_ptr = (__fp16 *)out_0.raw_data(); + auto in_ptr = (__fp16 *)in_0.raw_data_const(); + + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + + for (Idx d = 0; d < d_in; d++) { + __fp16 inval = *in_ptr++; + if (inval < 0) + inval = 0; + + *out_ptr++ = inval; + } + } + } + } + } else if (in_0.get_dtype() == DType::Float32) { + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + for (Idx d = 0; d < d_in; d++) { + float inval = in_0(b, h, w, d); + if (inval < 0) + inval = 0; + + out_0(b, h, w, d) = inval; + } + } + } + } + } + + return GraphStatus::Success; +} + +// #endif + +__attribute__((unused)) static float llamareluCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_LLaMAReLU); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMASuperSiLU.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMASuperSiLU.cpp new file mode 100644 index 000000000..7e27c5061 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/LLaMASuperSiLU.cpp @@ -0,0 +1,1368 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_LLaMASuperSiLU); + +// op execute function declarations +template +GraphStatus llamasupersiluImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, + const PlainFloatTensor &a_scale, + const PlainFloatTensor &b_scale, + const PlainFloatTensor &o_scale); + +// forward declaration of sample cost function +static float llamasupersiluCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((llamasupersiluImpl), "LLaMASuperSiLU") + */ +DEF_PACKAGE_OP((llamasupersiluImpl), "LLaMASuperSiLU") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) 
can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((llamasupersiluImpl), + * "LLaMASuperSiLU", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. + * DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((llamasupersiluImpl), + * "LLaMASuperSiLU", llamasupersiluCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ +DEF_PACKAGE_PARAM_ORDER("LLaMASuperSiLU", "a_scale", true, nullptr, "b_scale", + true, nullptr, "o_scale", true, nullptr) + +/* execute functions for ops */ + +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +#define FP16_MANTISA 10 +#define FP16_EXPONENT_MASK 0x1f +#define FP16_EXPONENT_BIAS 0xf +#define FP16_MANTISA_MASK 0x000003ff +#define FP16_SIGN 15 +#define FP16_NEG_1 0xbc00 +#define ROUND_2_SCALE 22 +#define ROUND_SCALSE ((1 << ROUND_2_SCALE) * 1.0f) + +static inline int32_t float_to_fp16s(float input) { + union { + int32_t i; + __fp16 f[2]; + } fp32 = {.f = {(__fp16)input, (__fp16)input}}; + return fp32.i; +} + +static HVX_INLINE_ALWAYS uint32_t float_to_bits(float x) { + union { + float f; + uint32_t i; + } fp32 = {.f = x}; + return fp32.i; +} + +static const float fp16_c0_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.13239719960243818, + 0.2216255210749415, + 0.3447664743728659, + 0.48137452032585476, + 0.5716299228719798, + 0.5547323231605259, + 0.5046287748870234, + 
0.4999985574626892, + 0.5000036514755082, + 0.49475652448004626, + 0.4441393352532763, + 0.428500379952032, + 0.5173297285470642, + 0.6541461039833616, + 0.7783931007462818, + 0.8678015179911097, +}; +static const float fp16_c1_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.05928005756790343, + 0.11063222460270064, + 0.1932879057003057, + 0.30302440212086995, + 0.3922924462181049, + 0.36546332659415875, + 0.2644148210990377, + 0.24989020912329707, + 0.2498532691910313, + 0.2661055781198988, + 0.36728015359480604, + 0.39215270010450015, + 0.3041825601732039, + 0.1940762094668647, + 0.11061794856987572, + 0.059174800917353595, +}; +static const float fp16_c2_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.010145494303219278, + 0.02123968384425681, + 0.04207468332514667, + 0.07519946712591977, + 0.10840620196267145, + 0.09270738184406795, + 0.015322371881818012, + -0.0009948273994921822, + 0.0011544907060402412, + -0.017040517565094934, + -0.09379878876657094, + -0.10835043868732394, + -0.07558705272699548, + -0.04228875316413285, + -0.021235740718738055, + -0.010124599879590107, +}; +static const float fp16_c3_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0007841223015974933, + 0.001850453397354219, + 0.004187899308371771, + 0.008640952434084206, + 0.01414741414964877, + 0.010117749275618, + -0.01654848996354919, + -0.02395108399453624, + -0.024199111971064446, + -0.015783556879607072, + 0.010407672131558174, + 0.014137608186323335, + 0.008698510795258909, + 0.004213708431213342, + 0.0018499827774393985, + 0.0007822799742289481, +}; +static const float fp16_c4_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.3031641204975905e-05, + 6.150442488966733e-05, + 0.00015997783736818624, + 0.00038491646239693526, + 0.0007283649599237781, + 0.00034439150914392054, + -0.003142246198646662, + -0.004120389580321761, + 0.004246050162553198, + 0.0030162727520777893, + -0.00037312974308425725, + -0.0007277242855014247, + -0.00038811687679772674, + -0.0001611434776868886, + -6.14837984586862e-05, + -2.297076123375133e-05, +}; + +int32_t hvx_supersilu_ahf(uint8_t *restrict input, uint8_t *restrict input2, + uint8_t *restrict output, float a_scale, + float b_scale, float o_scale, uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)input2; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + + int32_t block, l2fetch_block; + int32_t leftover = size & 128; + int32_t vectors_in_rounddown = size / 128; + // int32_t leftover_size = leftover * sizeof(__fp16); + + sline1p = *iptr++; + sline2p = *iptr2++; + + // dequantize + uint32_t convert = 0x00800080; + HVX_Vector convert_vector = Q6_V_vsplat_R(convert); + + HVX_Vector a_scale_vec = Q6_V_vsplat_R(float_to_fp16s(a_scale)); + HVX_Vector b_scale_vec = Q6_V_vsplat_R(float_to_fp16s(b_scale)); + HVX_Vector zero_v_sf = Q6_V_vzero(); + + // silu + HVX_Vector input_min_v_hf; + HVX_Vector input_shifted_v_hf; + HVX_Vector 
input_scaled_v; + HVX_VectorPair input_vp_qf32; + // HVX_Vector input_v_qf16; + HVX_Vector mask_idx1_v, mask_idx2_v; + HVX_Vector const16_0_v_hf; + HVX_Vector zero_v_hf, one_v_hf; + HVX_Vector tmp_v; + HVX_Vector idx1_v, idx2_v; + HVX_Vector scale_v; + HVX_DV output_dv; + HVX_DV c0_coeff_dv; + HVX_VectorPair c0_coeff_vp; + HVX_Vector c0_coeff_v; + HVX_DV c1_coeff_dv; + HVX_VectorPair c1_coeff_vp; + HVX_Vector c1_coeff_v; + HVX_DV c2_coeff_dv; + HVX_VectorPair c2_coeff_vp; + HVX_Vector c2_coeff_v; + HVX_DV c3_coeff_dv; + HVX_VectorPair c3_coeff_vp; + HVX_Vector c3_coeff_v; + HVX_DV c4_coeff_dv; + HVX_VectorPair c4_coeff_vp; + HVX_Vector c4_coeff_v; + + scale_v = Q6_Vh_vsplat_R(0x3bfe); + + /* Vector of ones used as mpy neutral element in conversions from hf vector to + * qf32 vector pair */ + one_v_hf = Q6_Vh_vsplat_R(0x3c00); + + /* + * Vector of zeroes used as neutral element in hf to qf16 conversions. + * NOTE: Some of conversions (i.e conversion of scale factor and coefficients) + * can be avoided in real-time, but this is not done in order to don't + * sacrify code readibility in expense of insignificant performance + * improvement. + */ + zero_v_hf = Q6_V_vzero(); + + /* Mask for extracting only 4 bits of mantissa */ + mask_idx1_v = Q6_Vh_vsplat_R(0x000F); + + mask_idx2_v = Q6_V_vsplat_R(0x00001010); + + /* 16.0 in IEEE 16-bit floating-point representation */ + const16_0_v_hf = Q6_Vh_vsplat_R(0x4c00); + + /* + * Prepare vector of input_min values, that is used later in shifting input + * range. input_min is low boundary of specified input range. + */ + input_min_v_hf = Q6_Vh_vsplat_R(0xc800); + + /* Convert scale factor from hf to q16. Use the same vector for both formats + */ + scale_v = Q6_Vqf16_vadd_VhfVhf(scale_v, zero_v_hf); + + /* Load coefficients */ + c0_coeff_v = *((HVX_Vector *)(fp16_c0_coeffs)); + c1_coeff_v = *((HVX_Vector *)(fp16_c1_coeffs)); + c2_coeff_v = *((HVX_Vector *)(fp16_c2_coeffs)); + c3_coeff_v = *((HVX_Vector *)(fp16_c3_coeffs)); + c4_coeff_v = *((HVX_Vector *)(fp16_c4_coeffs)); + + /* Convert coefficients from hf to qf32 format. Use the same vector for both + * representations */ + c0_coeff_v = Q6_Vqf32_vadd_VsfVsf(c0_coeff_v, zero_v_hf); + c1_coeff_v = Q6_Vqf32_vadd_VsfVsf(c1_coeff_v, zero_v_hf); + c2_coeff_v = Q6_Vqf32_vadd_VsfVsf(c2_coeff_v, zero_v_hf); + c3_coeff_v = Q6_Vqf32_vadd_VsfVsf(c3_coeff_v, zero_v_hf); + c4_coeff_v = Q6_Vqf32_vadd_VsfVsf(c4_coeff_v, zero_v_hf); + + /* Split 32-bit coefficients to lower and upper part in order to obtain them + * later with VLUT16. */ + c0_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c0_coeff_v); + c1_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c1_coeff_v); + c2_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c2_coeff_v); + c3_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c3_coeff_v); + c4_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c4_coeff_v); + + // quantize + HVX_Vector low_level_vec, high_level_vec, o_scale_vec, es_vec, + round_scale_vec; + HVX_Vector uintconvert = Q6_V_vsplat_R(0x80808080); + HVX_Vector vmb = Q6_V_vsplat_R(0x40004000); + + float post_scale_flt = a_scale * b_scale * o_scale; + int scexp = flt_getexp(post_scale_flt); + int rsh = min_i32(-scexp, 7); // e.g. 
0.11 -> 0.88, rsh = 3 + float rsh_fac = flt_power2(rsh); + + int adj_bias = roundf_i32(128 * rsh_fac); + adj_bias = Q6_R_combine_RlRl(adj_bias, adj_bias); + + HVX_Vector vadj = Q6_V_vsplat_R(adj_bias); + + float es = 0.5; + low_level_vec = Q6_V_vsplat_R(float_to_fp16s(-128.0f)); + high_level_vec = Q6_V_vsplat_R(float_to_fp16s(127.0f)); + o_scale_vec = + Q6_V_vsplat_R(float_to_fp16s(post_scale_flt * rsh_fac * (1 << 15))); + // one_vec = Q6_V_vsplat_R(float_to_fp16s(1.0f)); + // o_scale_vec = Q6_Vqf16_vadd_VhfVhf(o_scale_vec, zero_v_hf); + es_vec = Q6_V_vsplat_R(float_to_fp16s(es)); + round_scale_vec = Q6_V_vsplat_R(float_to_bits(ROUND_SCALSE)); + + es_vec = Q6_Vqf16_vadd_VhfVhf(es_vec, zero_v_sf); + round_scale_vec = Q6_Vqf32_vadd_VsfVsf(round_scale_vec, zero_v_sf); + + HVX_Vector expmask = Q6_Vh_vsplat_R(FP16_EXPONENT_MASK); + HVX_Vector expbias = Q6_Vh_vsplat_R(FP16_EXPONENT_BIAS); + HVX_Vector manmask = Q6_Vh_vsplat_R(FP16_MANTISA_MASK); + HVX_Vector exp23 = Q6_Vh_vsplat_R(23 - 1); + HVX_Vector exp0 = Q6_Vh_vsplat_R(0 - 1); + HVX_Vector negone = Q6_Vh_vsplat_R(FP16_NEG_1); + HVX_Vector zero = Q6_V_vzero(); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + HVX_Vector sline1_high; + HVX_Vector sline1_low; + // HVX_Vector sline2_high; + // HVX_Vector sline2_low; + + { + // dequantize sline1 qf16 + HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline1, zero_v_sf); + + temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), convert_vector); + HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + sline1_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), a_scale_vec); + sline1_low = Q6_Vhf_equals_Vqf16(sline1_low); + sline1_high = + Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), a_scale_vec); + sline1_high = Q6_Vhf_equals_Vqf16(sline1_high); + } + + // { + // // dequantize sline2 qf16 + // HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline2, zero_v_sf); + + // temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + // HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), convert_vector); + // HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + // sline2_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), + // b_scale_vec); sline2_low = Q6_Vhf_equals_Vqf16(sline2_low); + // sline2_high = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), + // b_scale_vec); sline2_high = Q6_Vhf_equals_Vqf16(sline2_high); + // } + + { + // silu sline1_low + tmp_v = Q6_Vh_vdeal_Vh(sline1_low); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. 
Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry + * information about integer index. Use the same input_scaled_v vector + * for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's + * method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_low, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + // output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), + // output_dv.V.lo); output_dv.V.hi = + // Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_low = Q6_Vhf_equals_Wqf32(output_dv.VV); + 
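+
+        // The trailing x * sigmoid(x) multiply stays commented out in this
+        // path, presumably because the "uint8 mul" block below forms the
+        // integer product (a - 128) * (b - 128), which already carries both
+        // x and the gate operand; only sigmoid(x) is carried forward here.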
} + + { + // silu sline1_high + tmp_v = Q6_Vh_vdeal_Vh(sline1_high); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry + * information about integer index. Use the same input_scaled_v vector + * for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's + * method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_high, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + 
output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + // output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), + // output_dv.V.lo); output_dv.V.hi = + // Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_high = Q6_Vhf_equals_Wqf32(output_dv.VV); + } + + HVX_Vector sline_high; + HVX_Vector sline_low; + + // { + // // mul + // sline_high = Q6_Vqf16_vmpy_VhfVhf(sline1_high, sline2_high); + // sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, sline2_low); + + // sline_high = Q6_Vhf_equals_Vqf16(sline_high); + // sline_low = Q6_Vhf_equals_Vqf16(sline_low); + // } + + HVX_VectorPair mul_output; + { + // uint8 mul + // (a-128)*(b-128) = a*b - 128 (a+b) + 128*128 + HVX_VectorPair prod1 = + Q6_Wuh_vmpyacc_WuhVubVub(Q6_W_vcombine_VV(vmb, vmb), sline1, sline2); + HVX_VectorPair prod2 = + Q6_Wh_vmpa_WubRub(Q6_W_vcombine_VV(sline2, sline1), 0x80808080); + mul_output = Q6_Wh_vsub_WhWh(prod1, prod2); + + mul_output = + Q6_W_vshuff_VVR(Q6_V_hi_W(mul_output), Q6_V_lo_W(mul_output), -2); + + // sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, + // Q6_Vhf_equals_Vh(Q6_V_lo_W(mul_output))); sline_high = + // Q6_Vqf16_vmpy_VhfVhf(sline1_high, + // Q6_Vhf_equals_Vh(Q6_V_hi_W(mul_output))); + } + + { + // scaling quantize + sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, o_scale_vec); + sline_low = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_low)); + sline_low = Q6_Vh_vadd_VhVh_sat( + Q6_Vh_vmpy_VhVh_s1_rnd_sat(Q6_V_lo_W(mul_output), sline_low), vadj); + + sline_high = Q6_Vqf16_vmpy_VhfVhf(sline1_high, o_scale_vec); + sline_high = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_high)); + sline_high = Q6_Vh_vadd_VhVh_sat( + Q6_Vh_vmpy_VhVh_s1_rnd_sat(sline_high, Q6_V_hi_W(mul_output)), vadj); + + HVX_Vector sout = Q6_Vub_vasr_VhVhR_rnd_sat(sline_high, sline_low, rsh); + sout = Q6_Vb_vdeal_Vb(sout); + *optr++ = sout; + } + + // { + // // quantize + // HVX_Vector sout1 = Q6_Vqf16_vmpy_Vqf16Vhf(sline_low, o_scale_vec); + // sout1 = Q6_Vqf16_vadd_Vqf16Vqf16(sout1, es_vec); + // sout1 = Q6_Vhf_equals_Vqf16(sout1); + // sout1 = Q6_Vhf_vmin_VhfVhf(sout1, high_level_vec); + // sout1 = Q6_Vhf_vmax_VhfVhf(sout1, low_level_vec); + // HVX_VectorPair sout1_pair = Q6_Wqf32_vmpy_VhfVhf(sout1, one_vec); + // HVX_Vector sout1_low = Q6_Vsf_equals_Vqf32( + // Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(sout1_pair), round_scale_vec)); + // HVX_Vector sout1_high = + // Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(sout1_pair), + // round_scale_vec)); + + // sout1_pair = Q6_W_vshuff_VVR(sout1_high, sout1_low, -4); + // sout1_low = Q6_V_lo_W(sout1_pair); + // sout1_high = Q6_V_hi_W(sout1_pair); + + // // { + // // HVX_Vector exp = Q6_Vh_vasr_VhR(sout1, FP16_MANTISA); + // // exp = Q6_V_vand_VV(exp, expmask); + // // exp = Q6_Vh_vsub_VhVh(exp, expbias); + + // // HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + // // HVX_Vector manzero = Q6_V_vand_VV(sout1, man); + + // // HVX_Vector sign = Q6_Vh_vasr_VhR(sout1, FP16_SIGN); + // // HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + // // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + // // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + // // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + // // HVX_Vector 
exppos_signneg = Q6_Vh_vadd_VhVh(sout1, man); + // // man = Q6_V_vnot_V(man); + // // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout1, man); + // // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // // HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout1, 1); + // // HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // // // exp >= 0 + // // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); + // // tsout1 = Q6_V_vmux_QVV(maneqzero, sout1, tsout1); + + // // // exp < 0 (-1, 1) + // // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout1, negone); + // // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // // sout1 = Q6_V_vmux_QVV(expgte23, sout1, tsout1); + // // } + + // sout1_low = Q6_Vw_equals_Vsf(sout1_low); + // sout1_low = Q6_Vw_vasr_VwR(sout1_low, ROUND_2_SCALE); + // sout1_high = Q6_Vw_equals_Vsf(sout1_high); + // sout1_high = Q6_Vw_vasr_VwR(sout1_high, ROUND_2_SCALE); + + // HVX_Vector sout2 = Q6_Vqf16_vmpy_Vqf16Vhf(sline_high, o_scale_vec); + // sout2 = Q6_Vqf16_vadd_Vqf16Vqf16(sout2, es_vec); + // sout2 = Q6_Vhf_equals_Vqf16(sout2); + // sout2 = Q6_Vhf_vmin_VhfVhf(sout2, high_level_vec); + // sout2 = Q6_Vhf_vmax_VhfVhf(sout2, low_level_vec); + // HVX_VectorPair sout2_pair = Q6_Wqf32_vmpy_VhfVhf(sout2, one_vec); + // HVX_Vector sout2_low = Q6_Vsf_equals_Vqf32( + // Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(sout2_pair), round_scale_vec)); + // HVX_Vector sout2_high = + // Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(sout2_pair), + // round_scale_vec)); + + // sout2_pair = Q6_W_vshuff_VVR(sout2_high, sout2_low, -4); + // sout2_low = Q6_V_lo_W(sout2_pair); + // sout2_high = Q6_V_hi_W(sout2_pair); + + // // { + // // HVX_Vector exp = Q6_Vh_vasr_VhR(sout2, FP16_MANTISA); + // // exp = Q6_V_vand_VV(exp, expmask); + // // exp = Q6_Vh_vsub_VhVh(exp, expbias); + + // // HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + // // HVX_Vector manzero = Q6_V_vand_VV(sout2, man); + + // // HVX_Vector sign = Q6_Vh_vasr_VhR(sout2, FP16_SIGN); + // // HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + // // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + // // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + // // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + // // HVX_Vector exppos_signneg = Q6_Vh_vadd_VhVh(sout2, man); + // // man = Q6_V_vnot_V(man); + // // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout2, man); + // // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // // HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout2, 1); + // // HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // // // exp >= 0 + // // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); + // // tsout1 = Q6_V_vmux_QVV(maneqzero, sout2, tsout1); + + // // // exp < 0 (-1, 1) + // // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout2, negone); + // // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // // sout2 = Q6_V_vmux_QVV(expgte23, sout2, tsout1); + // // } + + // sout2_low = Q6_Vw_equals_Vsf(sout2_low); + // sout2_low = Q6_Vw_vasr_VwR(sout2_low, ROUND_2_SCALE); + // sout2_high = Q6_Vw_equals_Vsf(sout2_high); + // sout2_high = Q6_Vw_vasr_VwR(sout2_high, ROUND_2_SCALE); + + // HVX_Vector reql_h = Q6_Vh_vpack_VwVw_sat(sout1_high, sout1_low); + // HVX_Vector reqh_h = Q6_Vh_vpack_VwVw_sat(sout2_high, sout2_low); + // HVX_Vector req_b = Q6_Vb_vpack_VhVh_sat(reqh_h, reql_h); + + // *optr++ = Q6_Vb_vadd_VbVb(req_b, 
uintconvert); + // } + + sline1p = sline1c; + sline2p = sline2c; + } + } + + if (vectors_in_rounddown > 0) { + + o_scale_vec = Q6_V_vsplat_R(float_to_fp16s(o_scale)); + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + HVX_Vector sline1_high; + HVX_Vector sline1_low; + HVX_Vector sline2_high; + HVX_Vector sline2_low; + + { + // dequantize sline1 qf16 + HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline1, zero_v_sf); + + temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), convert_vector); + HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + sline1_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), a_scale_vec); + sline1_low = Q6_Vhf_equals_Vqf16(sline1_low); + sline1_high = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), a_scale_vec); + sline1_high = Q6_Vhf_equals_Vqf16(sline1_high); + } + + { + // dequantize sline2 qf16 + HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline2, zero_v_sf); + + temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), convert_vector); + HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + sline2_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), b_scale_vec); + sline2_low = Q6_Vhf_equals_Vqf16(sline2_low); + sline2_high = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), b_scale_vec); + sline2_high = Q6_Vhf_equals_Vqf16(sline2_high); + } + + { + // silu sline1_low + tmp_v = Q6_Vh_vdeal_Vh(sline1_low); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry information + * about integer index. 
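+    * (Worked example, for illustration: a scaled value of 5.75 maps to 21.75
+    * after adding 16.0; in half precision 21.75 = 1.359375 * 2^4, and the top
+    * 4 mantissa bits of 0.359375 = 0b0101110... are 0b0101 = 5 = floor(5.75),
+    * i.e. exactly the segment index. The vlsr by 6 below drops the remaining
+    * 6 mantissa bits so these 4 bits land in the LSBs before masking.)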
Use the same input_scaled_v vector for hf and qf16 + * representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_low, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_low = Q6_Vhf_equals_Wqf32(output_dv.VV); + } + + { + // silu sline1_high + tmp_v = Q6_Vh_vdeal_Vh(sline1_high); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + 
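+      // (Illustrative scalar sketch of what this silu block computes, under
+      // the assumption that the c0..c4 tables hold per-segment coefficients of
+      // a 16-segment polynomial fit of sigmoid over [input_min, input_max]:
+      //   t   = (x - input_min) * scale;               // scale ~ 16 / (max - min)
+      //   i   = (int)t;                                // segment index
+      //   sig = c0[i] + x*(c1[i] + x*(c2[i] + x*(c3[i] + x*c4[i])));  // Horner
+      //   y   = x * sig;                               // SiLU = x * sigmoid(x)
+      // The HVX code below performs the same steps with VLUT16 gathers and
+      // qf32 arithmetic.)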
input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry information + * about integer index. Use the same input_scaled_v vector for hf and qf16 + * representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_high, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, 
Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_high = Q6_Vhf_equals_Wqf32(output_dv.VV); + } + + HVX_Vector sline_high; + HVX_Vector sline_low; + + { + // mul + sline_high = Q6_Vqf16_vmpy_VhfVhf(sline1_high, sline2_high); + sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, sline2_low); + + sline_high = Q6_Vhf_equals_Vqf16(sline_high); + sline_low = Q6_Vhf_equals_Vqf16(sline_low); + } + + { + // quantize + HVX_Vector sout1 = Q6_Vqf16_vmpy_VhfVhf(sline_low, o_scale_vec); + sout1 = Q6_Vqf16_vadd_Vqf16Vqf16(sout1, es_vec); + sout1 = Q6_Vhf_equals_Vqf16(sout1); + sout1 = Q6_Vhf_vmin_VhfVhf(sout1, high_level_vec); + sout1 = Q6_Vhf_vmax_VhfVhf(sout1, low_level_vec); + + { + HVX_Vector exp = Q6_Vh_vasr_VhR(sout1, FP16_MANTISA); + exp = Q6_V_vand_VV(exp, expmask); + exp = Q6_Vh_vsub_VhVh(exp, expbias); + + HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + HVX_Vector manzero = Q6_V_vand_VV(sout1, man); + + HVX_Vector sign = Q6_Vh_vasr_VhR(sout1, FP16_SIGN); + HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + HVX_Vector exppos_signneg = Q6_Vh_vadd_VhVh(sout1, man); + man = Q6_V_vnot_V(man); + HVX_Vector exppos_signpos = Q6_V_vand_VV(sout1, man); + exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout1, 1); + HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // exp >= 0 + HVX_Vector tsout1 = + Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + tsout1 = Q6_V_vmux_QVV(maneqzero, sout1, tsout1); + + // exp < 0 (-1, 1) + HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout1, negone); + tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + sout1 = Q6_V_vmux_QVV(expgte23, sout1, tsout1); + } + + sout1 = Q6_Vh_equals_Vhf(sout1); + + HVX_Vector sout2 = Q6_Vqf16_vmpy_VhfVhf(sline_high, o_scale_vec); + sout2 = Q6_Vqf16_vadd_Vqf16Vqf16(sout2, es_vec); + sout2 = Q6_Vhf_equals_Vqf16(sout2); + sout2 = Q6_Vhf_vmin_VhfVhf(sout2, high_level_vec); + sout2 = Q6_Vhf_vmax_VhfVhf(sout2, low_level_vec); + + { + HVX_Vector exp = Q6_Vh_vasr_VhR(sout2, FP16_MANTISA); + exp = Q6_V_vand_VV(exp, expmask); + exp = Q6_Vh_vsub_VhVh(exp, expbias); + + HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + HVX_Vector manzero = Q6_V_vand_VV(sout2, man); + + HVX_Vector sign = Q6_Vh_vasr_VhR(sout2, FP16_SIGN); + HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + HVX_Vector exppos_signneg = Q6_Vh_vadd_VhVh(sout2, man); + man = Q6_V_vnot_V(man); + HVX_Vector exppos_signpos = Q6_V_vand_VV(sout2, man); + exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout2, 1); + HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // exp >= 0 + HVX_Vector tsout1 = + Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + tsout1 = Q6_V_vmux_QVV(maneqzero, sout2, tsout1); + + // exp < 0 (-1, 1) + HVX_Vector tsout2 = 
Q6_V_vmux_QVV(iszero, sout2, negone); + tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + sout2 = Q6_V_vmux_QVV(expgte23, sout2, tsout1); + } + + sout2 = Q6_Vh_equals_Vhf(sout2); + + HVX_Vector reql_h = Q6_Vb_vpack_VhVh_sat(sout2, sout1); + *optr++ = Q6_Vb_vadd_VbVb(reql_h, uintconvert); + } + } + + // // Handle leftover elements. + // if (leftover_size > 0) { + // sline1c = (is_in_one_chunk(iptr, leftover_size, VLEN) + // ? sline1p + // : *iptr++); + // sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + // sline2c = (is_in_one_chunk(iptr2, leftover_size, VLEN) + // ? sline2p + // : *iptr2++); + // sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + // vstu_variable(optr, leftover_size, + // Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(sline1, sline2))); + // } + + return 0; +} + +template +GraphStatus llamasupersiluImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, + const PlainFloatTensor &a_scale, + const PlainFloatTensor &b_scale, + const PlainFloatTensor &o_scale) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + out_0.set_dims(in_0); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + size_t size = b_in * h_in * w_in * d_in; + + float a_scale_ = a_scale(0, 0, 0, 0); + float b_scale_ = b_scale(0, 0, 0, 0); + float o_scale_ = o_scale(0, 0, 0, 0); + + auto in_ptr = (uint8_t *)in_0.raw_data_const(); + auto in_ptr2 = (uint8_t *)in_1.raw_data_const(); + + auto out_ptr = (uint8_t *)out_0.raw_data(); + + DType dtype = in_0.get_dtype(); + + if (dtype == DType::QUInt8 && out_0.get_dtype() == DType::QUInt8) { + hvx_supersilu_ahf(in_ptr, in_ptr2, out_ptr, a_scale_, b_scale_, + 1.0f / o_scale_, size); + } + + return GraphStatus::Success; +} + +#else + +template +GraphStatus llamasupersiluImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &in_1, + const PlainFloatTensor &a_scale, + const PlainFloatTensor &b_scale, + const PlainFloatTensor &o_scale) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
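+   *
+   * Worked reference for the scalar loop below (the same math the HVX path
+   * above approximates): with uint8 inputs a, b and a zero point of 128,
+   *   x = (a - 128) * a_scale,   y = (b - 128) * b_scale
+   *   s = x * sigmoid(x) = x / (1 + exp(-x))
+   *   out = clamp(round(s * y / o_scale), -128, 127) + 128
+   * Note that the HVX path is handed 1.0f / o_scale_ so it can multiply by
+   * the reciprocal instead of dividing.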
+ */ + + out_0.set_dims(in_0); + + float a_scale_ = a_scale(0, 0, 0, 0); + float b_scale_ = b_scale(0, 0, 0, 0); + float o_scale_ = o_scale(0, 0, 0, 0); + + auto in_ptr = (uint8_t *)in_0.raw_data_const(); + auto in_ptr2 = (uint8_t *)in_1.raw_data_const(); + + auto out_ptr = (uint8_t *)out_0.raw_data(); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // mul + for (Idx d = 0; d < d_in; d++) { + + int32_t a_inval = static_cast(*in_ptr++); + float a_inval_fp16 = (a_inval - 128) * a_scale_; + + int32_t b_inval = static_cast(*in_ptr2++); + float b_inval_fp16 = (b_inval - 128) * b_scale_; + + a_inval_fp16 = a_inval_fp16 * (1 / (1 + expf(-a_inval_fp16))); + + float inval = a_inval_fp16 * b_inval_fp16; + + long v = lroundf(inval / o_scale_); + + if (v > 127) + v = 127; + + if (v < -128) + v = -128; + + v += 128; + + *out_ptr++ = static_cast(v); + } + } + } + } + + return GraphStatus::Success; +} + +#endif + +__attribute__((unused)) static float llamasupersiluCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_LLaMASuperSiLU); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/QLayerNorm.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/QLayerNorm.cpp new file mode 100644 index 000000000..7c3480944 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/QLayerNorm.cpp @@ -0,0 +1,350 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_QLayerNorm); + +// op execute function declarations +template +GraphStatus qlayernormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights, const TensorType &bias); + +// forward declaration of sample cost function +static float qlayernormCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((qlayernormImpl), "QLayerNorm") + */ +DEF_PACKAGE_OP((qlayernormImpl), "QLayerNorm") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((qlayernormImpl), + * "QLayerNorm", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. 
DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((qlayernormImpl), + * "QLayerNorm", qlayernormCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +int32_t hvx_qlayernorm_af(float *restrict input, float *restrict weights, + float *restrict bias, float *restrict output, + uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)weights; + HVX_Vector *iptr3 = (HVX_Vector *)bias; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + HVX_Vector sline3p, sline3c, sline3; + + HVX_Vector zero; + + float __attribute__((aligned(VLEN))) tmp_buf[32]; + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + + zero = Q6_V_vzero(); + + // sline1p = *iptr++; + + // x sum + HVX_Vector xsum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + xsum = Q6_Vqf32_vadd_Vqf32Vqf32(xsum, sline1); + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? 
sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + xsum = Q6_Vqf32_vadd_Vqf32Vqf32(xsum, sline1); + } + + union { + float f; + uint32_t ui; + } mean_value; + mean_value.f = 0.0f; + + for (int32_t i = 64; i >= 4; i >>= 1) { + xsum = Q6_Vqf32_vadd_Vqf32Vqf32(xsum, Q6_V_vlalign_VVR(xsum, zero, i)); + } + + xsum = Q6_Vsf_equals_Vqf32(xsum); + *(HVX_Vector *)tmp_buf = xsum; + + mean_value.f = xsum[31] / size; + + // x-e^2 sum + iptr = (HVX_Vector *)input; + sline1p = *iptr++; + + HVX_Vector x2sum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + + HVX_Vector mean_vsf = Q6_V_vsplat_R(mean_value.ui); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline1 = Q6_Vqf32_vsub_Vqf32Vqf32(sline1, mean_vsf); + x2sum = Q6_Vqf32_vadd_Vqf32Vqf32( + x2sum, Q6_Vqf32_vmpy_Vqf32Vqf32(sline1, sline1)); + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline1 = Q6_Vqf32_vsub_Vqf32Vqf32(sline1, mean_vsf); + x2sum = + Q6_Vqf32_vadd_Vqf32Vqf32(x2sum, Q6_Vqf32_vmpy_Vqf32Vqf32(sline1, sline1)); + } + + float epsilon_ = 1e-5; + union { + float f; + uint32_t ui; + } sum_value; + sum_value.f = 0.0f; + + for (int32_t i = 64; i >= 4; i >>= 1) { + x2sum = Q6_Vqf32_vadd_Vqf32Vqf32(x2sum, Q6_V_vlalign_VVR(x2sum, zero, i)); + } + + x2sum = Q6_Vsf_equals_Vqf32(x2sum); + *(HVX_Vector *)tmp_buf = x2sum; + + sum_value.f = 1.0f / sqrtf(x2sum[31] / size + epsilon_); + + // x * 1/rsqrt(sum) + iptr = (HVX_Vector *)input; + sline1p = *iptr++; + sline2p = *iptr2++; + sline3p = *iptr3++; + + HVX_Vector irsqrt_vsf = Q6_V_vsplat_R(sum_value.ui); + HVX_Vector irsqrt_vqf32 = Q6_Vqf32_vadd_VsfVsf(irsqrt_vsf, Q6_V_vzero()); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr3 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline3c = *iptr3++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)weights); + sline3 = Q6_V_valign_VVR(sline3c, sline3p, (size_t)bias); + + sline1 = Q6_Vqf32_vsub_Vqf32Vqf32(sline1, mean_vsf); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline1, sline2); + middle_value_qf32 = + Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + middle_value_qf32 = Q6_Vqf32_vadd_Vqf32Vqf32(middle_value_qf32, sline3); + + *optr++ = Q6_Vsf_equals_Vqf32(middle_value_qf32); + + sline1p = sline1c; + sline2p = sline2c; + sline3p = sline3c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? 
sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)weights); + + sline3c = is_aligned(iptr3, VLEN) && leftover == 0 ? sline3p : *iptr3++; + sline3 = Q6_V_valign_VVR(sline3c, sline3p, (size_t)weights); + + sline1 = Q6_Vqf32_vsub_VsfVsf(sline1, mean_vsf); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline1, sline2); + middle_value_qf32 = + Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + middle_value_qf32 = Q6_Vqf32_vadd_Vqf32Vqf32(middle_value_qf32, sline3); + + *optr++ = Q6_Vsf_equals_Vqf32(middle_value_qf32); + } + + if (leftover_size > 0) + return -1; + + return 0; +} + +template +GraphStatus qlayernormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights, const TensorType &bias) + +{ + out_0.set_dims(in_0); + + // NHWC + + auto in_ptr = (float *)in_0.raw_data_const(); + auto out_ptr = (float *)out_0.raw_data(); + auto weights_ptr = (float *)weights.raw_data_const(); + auto bias_ptr = (float *)bias.raw_data_const(); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // RMS + hvx_qlayernorm_af(in_ptr, weights_ptr, bias_ptr, out_ptr, d_in); + + in_ptr += d_in; + out_ptr += d_in; + } + } + } + + return GraphStatus::Success; +} + +#else + +template +GraphStatus qlayernormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights, const TensorType &bias) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + return GraphStatus::Success; +} + +#endif + +__attribute__((unused)) static float qlayernormCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_QLayerNorm); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/RMSNorm.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/RMSNorm.cpp new file mode 100644 index 000000000..922cb8ff2 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/RMSNorm.cpp @@ -0,0 +1,939 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_RMSNorm); + +// op execute function declarations +template +GraphStatus rmsnormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights); + +// forward declaration of sample cost function +static float rmsnormCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. 
+ * DEF_PACKAGE_OP((rmsnormImpl), "RMSNorm") + */ +DEF_PACKAGE_OP((rmsnormImpl), "RMSNorm") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((rmsnormImpl), "RMSNorm", + * SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((rmsnormImpl), + * "RMSNorm", rmsnormCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) + * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +int32_t hvx_rmsnorm_af(float *restrict input, float *restrict weights, + float *restrict output, uint32_t size) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)weights; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + + // ^2 sum + HVX_Vector sum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = 
Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + } + + float epsilon_ = 1e-6; + union { + float f; + uint32_t ui; + } sum_value; + sum_value.f = 0.0f; + + HVX_Vector zero = Q6_V_vzero(); + + for (int32_t i = 64; i >= 4; i >>= 1) { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vlalign_VVR(sum, zero, i)); + } + + sum = Q6_Vsf_equals_Vqf32(sum); + sum_value.f = 1.0f / sqrtf(*((float *)&sum + 31) / size + epsilon_); + + // x * 1/rsqrt(sum) + iptr = (HVX_Vector *)input; + sline1p = *iptr++; + sline2p = *iptr2++; + + HVX_Vector irsqrt_vsf = Q6_V_vsplat_R(sum_value.ui); + HVX_Vector irsqrt_vqf32 = Q6_Vqf32_vadd_VsfVsf(irsqrt_vsf, Q6_V_vzero()); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline2c = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline1, sline2); + *optr++ = Q6_Vsf_equals_Vqf32( + Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32)); + + sline1p = sline1c; + sline2p = sline2c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? 
sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline1, sline2); + *optr++ = Q6_Vsf_equals_Vqf32( + Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32)); + } + + if (leftover_size > 0) + return -1; + + return 0; +} + +static HVX_INLINE_ALWAYS uint32_t float_to_bits(float x) { + union { + float f; + uint32_t i; + } fp32 = {.f = x}; + return fp32.i; +} + +static inline int32_t float_to_fp16s(float input) { + union { + int32_t i; + __fp16 f[2]; + } fp32 = {.f = {(__fp16)input, (__fp16)input}}; + return fp32.i; +} + +#define FLOAT_MANTISA 23 +#define FLOAT_EXPONENT_MASK 0xff +#define FLOAT_EXPONENT_BIAS 0x7f +#define FLOAT_MANTISA_MASK 0x007fffff +#define FLOAT_SIGN 31 +#define FLOAT_NEG_1 0xBF800000 +#define ROUND_2_SCALE 22 +#define ROUND_SCALSE ((1 << ROUND_2_SCALE) * 1.0f) + +int32_t hvx_rmsnorm_auint8(float *restrict input, float *restrict weights, + uint8_t *restrict output, uint32_t size, + float scale) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)weights; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + HVX_Vector sline3p, sline3c, sline3; + HVX_Vector sline4p, sline4c, sline4; + HVX_Vector slinewp, slinewc, slinew; + + HVX_Vector sout1, sout2, sout3, sout4; + HVX_Vector low_level_vec, high_level_vec, scale_vec, es_vec, round_scale_vec; + + float low_level = -128.0f; + float high_level = 127.0f; + + float es = 0.5f; + low_level_vec = Q6_V_vsplat_R(float_to_bits(low_level)); + high_level_vec = Q6_V_vsplat_R(float_to_bits(high_level)); + scale_vec = Q6_V_vsplat_R(float_to_bits(scale)); + es_vec = Q6_V_vsplat_R(float_to_bits(es)); + round_scale_vec = Q6_V_vsplat_R(float_to_bits(ROUND_SCALSE)); + + HVX_Vector zero_v_sf = Q6_V_vzero(); + scale_vec = Q6_Vqf32_vadd_VsfVsf(scale_vec, zero_v_sf); + es_vec = Q6_Vqf32_vadd_VsfVsf(es_vec, zero_v_sf); + + HVX_Vector uintconvert = Q6_V_vsplat_R(0x80808080); + + // HVX_Vector expmask = Q6_V_vsplat_R(FLOAT_EXPONENT_MASK); + // HVX_Vector expbias = Q6_V_vsplat_R(FLOAT_EXPONENT_BIAS); + // HVX_Vector manmask = Q6_V_vsplat_R(FLOAT_MANTISA_MASK); + // HVX_Vector exp23 = Q6_V_vsplat_R(23 - 1); + // HVX_Vector exp0 = Q6_V_vsplat_R(0 - 1); + // HVX_Vector negone = Q6_V_vsplat_R(FLOAT_NEG_1); + HVX_Vector zero = Q6_V_vzero(); + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + // int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + + // ^2 sum + HVX_Vector sum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? 
sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + } + + float epsilon_ = 1e-6; + union { + float f; + uint32_t ui; + } sum_value; + sum_value.f = 0.0f; + + for (int32_t i = 64; i >= 4; i >>= 1) { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vlalign_VVR(sum, zero, i)); + } + + sum = Q6_Vsf_equals_Vqf32(sum); + sum_value.f = 1.0f / sqrtf(*((float *)&sum + 31) / size + epsilon_); + + // x * 1/rsqrt(sum) + iptr = (HVX_Vector *)input; + + sline1p = *iptr++; + sline2p = *iptr++; + sline3p = *iptr++; + sline4p = *iptr++; + + slinewp = *iptr2++; + + HVX_Vector irsqrt_vsf = Q6_V_vsplat_R(sum_value.ui); + HVX_Vector irsqrt_vqf32 = Q6_Vqf32_vadd_VsfVsf(irsqrt_vsf, Q6_V_vzero()); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; j += 4) { + + { + sline1c = *iptr++; + slinewc = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline1, slinew); + sline1 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + sout1 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline1, scale_vec); + sout1 = Q6_Vqf32_vadd_Vqf32Vqf32(sout1, es_vec); + sout1 = Q6_Vsf_equals_Vqf32(sout1); + sout1 = Q6_Vsf_vmin_VsfVsf(sout1, high_level_vec); + sout1 = Q6_Vsf_vmax_VsfVsf(sout1, low_level_vec); + sout1 = Q6_Vqf32_vmpy_VsfVsf(sout1, round_scale_vec); + sout1 = Q6_Vsf_equals_Vqf32(sout1); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout1, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout1, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout1, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout1, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout1, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout1, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); tsout1 = Q6_V_vmux_QVV(maneqzero, sout1, tsout1); + + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout1, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout1 = Q6_V_vmux_QVV(expgte23, sout1, tsout1); + // } + + sout1 = Q6_Vw_equals_Vsf(sout1); + sout1 = Q6_Vw_vasr_VwR(sout1, ROUND_2_SCALE); + // sout1 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout1, + // Q6_V_vzero()), 0); + + { + sline2c = *iptr++; + slinewc = *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = 
Q6_Vqf32_vmpy_VsfVsf(sline2, slinew); + sline2 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + sout2 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline2, scale_vec); + sout2 = Q6_Vqf32_vadd_Vqf32Vqf32(sout2, es_vec); + sout2 = Q6_Vsf_equals_Vqf32(sout2); + sout2 = Q6_Vsf_vmin_VsfVsf(sout2, high_level_vec); + sout2 = Q6_Vsf_vmax_VsfVsf(sout2, low_level_vec); + sout2 = Q6_Vqf32_vmpy_VsfVsf(sout2, round_scale_vec); + sout2 = Q6_Vsf_equals_Vqf32(sout2); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout2, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout2, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout2, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout2, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout2, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout2, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); tsout1 = Q6_V_vmux_QVV(maneqzero, sout2, tsout1); + + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout2, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout2 = Q6_V_vmux_QVV(expgte23, sout2, tsout1); + // } + + sout2 = Q6_Vw_equals_Vsf(sout2); + sout2 = Q6_Vw_vasr_VwR(sout2, ROUND_2_SCALE); + // sout2 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout2, + // Q6_V_vzero()), 0); + + { + sline3c = *iptr++; + slinewc = *iptr2++; + sline3 = Q6_V_valign_VVR(sline3c, sline3p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline3, slinew); + sline3 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + sout3 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline3, scale_vec); + sout3 = Q6_Vqf32_vadd_Vqf32Vqf32(sout3, es_vec); + sout3 = Q6_Vsf_equals_Vqf32(sout3); + sout3 = Q6_Vsf_vmin_VsfVsf(sout3, high_level_vec); + sout3 = Q6_Vsf_vmax_VsfVsf(sout3, low_level_vec); + sout3 = Q6_Vqf32_vmpy_VsfVsf(sout3, round_scale_vec); + sout3 = Q6_Vsf_equals_Vqf32(sout3); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout3, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout3, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout3, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout3, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout3, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout3, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 
= Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); tsout1 = Q6_V_vmux_QVV(maneqzero, sout3, tsout1); + + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout3, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout3 = Q6_V_vmux_QVV(expgte23, sout3, tsout1); + // } + + sout3 = Q6_Vw_equals_Vsf(sout3); + sout3 = Q6_Vw_vasr_VwR(sout3, ROUND_2_SCALE); + // sout3 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout3, + // Q6_V_vzero()), 0); + + { + sline4c = *iptr++; + slinewc = *iptr2++; + sline4 = Q6_V_valign_VVR(sline4c, sline4p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline4, slinew); + sline4 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + sout4 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline4, scale_vec); + sout4 = Q6_Vqf32_vadd_Vqf32Vqf32(sout4, es_vec); + sout4 = Q6_Vsf_equals_Vqf32(sout4); + sout4 = Q6_Vsf_vmin_VsfVsf(sout4, high_level_vec); + sout4 = Q6_Vsf_vmax_VsfVsf(sout4, low_level_vec); + sout4 = Q6_Vqf32_vmpy_VsfVsf(sout4, round_scale_vec); + sout4 = Q6_Vsf_equals_Vqf32(sout4); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout4, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout4, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout4, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout4, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout4, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout4, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, + // exppos_signneg); tsout1 = Q6_V_vmux_QVV(maneqzero, sout4, tsout1); + + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout4, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout4 = Q6_V_vmux_QVV(expgte23, sout4, tsout1); + // } + + sout4 = Q6_Vw_equals_Vsf(sout4); + sout4 = Q6_Vw_vasr_VwR(sout4, ROUND_2_SCALE); + // sout4 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout4, + // Q6_V_vzero()), 0); + + HVX_Vector reql_h = Q6_Vh_vpack_VwVw_sat(sout2, sout1); + HVX_Vector reqh_h = Q6_Vh_vpack_VwVw_sat(sout4, sout3); + HVX_Vector req_b = Q6_Vb_vpack_VhVh_sat(reqh_h, reql_h); + + *optr++ = Q6_Vb_vadd_VbVb(req_b, uintconvert); + + sline1p = sline1c; + sline2p = sline2c; + sline3p = sline3c; + sline4p = sline4c; + + slinewp = slinewc; + } + } + + return 0; +} + +int32_t hvx_rmsnorm_auint8_opt(float *restrict input, float *restrict weights, + uint8_t *restrict output, uint32_t size, + float scale) { + if ((input == NULL) || (output == NULL) || (size == 0)) { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)weights; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + HVX_Vector sline3p, sline3c, sline3; + 
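+  // Illustrative note on the fixed-point requantization used further below
+  // (the non-_opt variant above clamps in float instead). Per element the
+  // goal is, apart from the small es = 0.5 bias,
+  //   q = saturate_u8( round(v * scale) + 128 ),   v = x * w / rms
+  // To keep intermediates inside fp16/int16 range this is split up: v is
+  // scaled by (scale / 64) * 2^rsh * 2^15 in fp16, converted to int16, run
+  // through a rounding fractional multiply (x 64 / 2^15, i.e. / 512), leaving
+  // roughly v * scale * 2^rsh, biased by 128 * 2^rsh (vadj), and finally
+  // Q6_Vub_vasr_VhVhR_rnd_sat shifts right by rsh with rounding while
+  // saturating to unsigned bytes. rsh is derived from the exponent of
+  // scale / 64 so the fp16 product keeps headroom.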
HVX_Vector sline4p, sline4c, sline4; + HVX_Vector slinewp, slinewc, slinew; + + // HVX_Vector sout1, sout2, sout3, sout4; + // HVX_Vector low_level_vec, high_level_vec, scale_vec, es_vec, + // round_scale_vec; + + // float low_level = -128.0f; + // float high_level = 127.0f; + + // float es = 0.5f; + // low_level_vec = Q6_V_vsplat_R(float_to_bits(low_level)); + // high_level_vec = Q6_V_vsplat_R(float_to_bits(high_level)); + // scale_vec = Q6_V_vsplat_R(float_to_bits(scale)); + // es_vec = Q6_V_vsplat_R(float_to_bits(es)); + // round_scale_vec = Q6_V_vsplat_R(float_to_bits(ROUND_SCALSE)); + + // HVX_Vector zero_v_sf = Q6_V_vzero(); + // scale_vec = Q6_Vqf32_vadd_VsfVsf(scale_vec, zero_v_sf); + // es_vec = Q6_Vqf32_vadd_VsfVsf(es_vec, zero_v_sf); + + // HVX_Vector uintconvert = Q6_V_vsplat_R(0x80808080); + + // HVX_Vector expmask = Q6_V_vsplat_R(FLOAT_EXPONENT_MASK); + // HVX_Vector expbias = Q6_V_vsplat_R(FLOAT_EXPONENT_BIAS); + // HVX_Vector manmask = Q6_V_vsplat_R(FLOAT_MANTISA_MASK); + // HVX_Vector exp23 = Q6_V_vsplat_R(23 - 1); + // HVX_Vector exp0 = Q6_V_vsplat_R(0 - 1); + // HVX_Vector negone = Q6_V_vsplat_R(FLOAT_NEG_1); + HVX_Vector zero = Q6_V_vzero(); + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + // int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + + // ^2 sum + HVX_Vector sum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + } + + float epsilon_ = 1e-6; + union { + float f; + uint32_t ui; + } sum_value; + sum_value.f = 0.0f; + + for (int32_t i = 64; i >= 4; i >>= 1) { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vlalign_VVR(sum, zero, i)); + } + + sum = Q6_Vsf_equals_Vqf32(sum); + sum_value.f = 1.0f / sqrtf(*((float *)&sum + 31) / size + epsilon_); + + // x * 1/rsqrt(sum) + iptr = (HVX_Vector *)input; + + sline1p = *iptr++; + sline2p = *iptr++; + sline3p = *iptr++; + sline4p = *iptr++; + + slinewp = *iptr2++; + + HVX_Vector irsqrt_vsf = Q6_V_vsplat_R(sum_value.ui); + HVX_Vector irsqrt_vqf32 = Q6_Vqf32_vadd_VsfVsf(irsqrt_vsf, Q6_V_vzero()); + + float post_scale_flt = scale / 64.0f; + int scexp = flt_getexp(post_scale_flt); + int rsh = min_i32(-scexp, 7); // e.g. 
0.11 -> 0.88, rsh = 3 + float rsh_fac = flt_power2(rsh); + + int adj_bias = roundf_i32(128 * rsh_fac); + adj_bias = Q6_R_combine_RlRl(adj_bias, adj_bias); + + HVX_Vector zero_v_sf = Q6_V_vzero(); + float es = 0.5f; + HVX_Vector es_vec = Q6_V_vsplat_R(float_to_fp16s(es)); + es_vec = Q6_Vqf16_vadd_VhfVhf(es_vec, zero_v_sf); + + HVX_Vector vadj = Q6_V_vsplat_R(adj_bias); + HVX_Vector o_scale_vec = + Q6_V_vsplat_R(float_to_fp16s(post_scale_flt * rsh_fac * (1 << 15))); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; j += 4) { + + { + sline1c = *iptr++; + slinewc = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline1, slinew); + sline1 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + { + sline2c = *iptr++; + slinewc = *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline2, slinew); + sline2 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + HVX_Vector sline_low = + Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(sline2, sline1)); + sline_low = Q6_Vqf16_vadd_Vqf16Vqf16(sline_low, es_vec); + sline_low = Q6_Vqf16_vmpy_VhfVhf(sline_low, o_scale_vec); + sline_low = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_low)); + sline_low = Q6_Vh_vadd_VhVh_sat( + Q6_Vh_vmpy_VhRh_s1_rnd_sat(sline_low, 0x00400040), vadj); + + sline_low = Q6_Vh_vdeal_Vh(sline_low); + + { + sline3c = *iptr++; + slinewc = *iptr2++; + sline3 = Q6_V_valign_VVR(sline3c, sline3p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline3, slinew); + sline3 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + { + sline4c = *iptr++; + slinewc = *iptr2++; + sline4 = Q6_V_valign_VVR(sline4c, sline4p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline4, slinew); + sline4 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + HVX_Vector sline_high = + Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(sline4, sline3)); + sline_high = Q6_Vqf16_vadd_Vqf16Vqf16(sline_high, es_vec); + sline_high = Q6_Vqf16_vmpy_VhfVhf(sline_high, o_scale_vec); + sline_high = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_high)); + sline_high = Q6_Vh_vadd_VhVh_sat( + Q6_Vh_vmpy_VhRh_s1_rnd_sat(sline_high, 0x00400040), vadj); + + sline_high = Q6_Vh_vdeal_Vh(sline_high); + + HVX_Vector sout = Q6_Vub_vasr_VhVhR_rnd_sat(sline_high, sline_low, rsh); + sout = Q6_Vb_vdeal_Vb(sout); + *optr++ = sout; + + sline1p = sline1c; + sline2p = sline2c; + sline3p = sline3c; + sline4p = sline4c; + + slinewp = slinewc; + } + } + + return 0; +} + +template +GraphStatus rmsnormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights) + +{ + out_0.set_dims(in_0); + + // NHWC + + auto in_ptr = (float *)in_0.raw_data_const(); + auto weights_ptr = (float 
*)weights.raw_data_const(); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + DType dtype = out_0.get_dtype(); + + if (dtype == DType::Float32) { + + auto out_ptr = (float *)out_0.raw_data(); + + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // RMS + hvx_rmsnorm_af(in_ptr, weights_ptr, out_ptr, d_in); + + in_ptr += d_in; + out_ptr += d_in; + } + } + } + + } else if (dtype == DType::QUInt8) { + + auto out_ptr = (uint8_t *)out_0.raw_data(); + float scale_ = out_0.get_interface_scale(); + + scale_ = 1.0f / scale_; + + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // RMS + hvx_rmsnorm_auint8(in_ptr, weights_ptr, out_ptr, d_in, scale_); + + in_ptr += d_in; + out_ptr += d_in; + } + } + } + } + + return GraphStatus::Success; +} + +#else + +template +GraphStatus rmsnormImpl(TensorType &out_0, const TensorType &in_0, + const TensorType &weights) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + out_0.set_dims(in_0); + // NHWC + + float epsilon_ = 1e-6; + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // RMS + float sum_squares = 0.0f; + for (Idx d = 0; d < d_in; d++) { + float inval = in_0(b, h, w, d); + sum_squares += inval * inval; + } + + // debuglog("silu execute... sum_squares=(%f)", sum_squares); + + float rms = sqrtf(sum_squares / d_in + epsilon_); + debuglog("rms execute... sum_squares=(%f)", 1.0f / rms); + debuglog("rms execute... 
sum_squares=(%f)", sum_squares); + + for (Idx d = 0; d < d_in; d++) { + float inval = in_0(b, h, w, d); + float weight = weights(0, 0, 0, d); + + out_0(b, h, w, d) = inval * weight / rms; + } + } + } + } + + return GraphStatus::Success; +} + +#endif + +__attribute__((unused)) static float rmsnormCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_RMSNorm); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/SiLU.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/SiLU.cpp new file mode 100644 index 000000000..73f1ee050 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/SiLU.cpp @@ -0,0 +1,1425 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_SiLU); + +// op execute function declarations +template +GraphStatus siluImpl(TensorType &out_0, const TensorType &in_0); + +// forward declaration of sample cost function +static float siluCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((siluImpl), "SiLU") + */ +DEF_PACKAGE_OP((siluImpl), "SiLU") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((siluImpl), "SiLU", + * SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((siluImpl), + * "SiLU", siluCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) 
+ * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ +#ifndef REFERENCE_OP + +#include "hvx_internal.h" +#include "qhmath_hvx.h" +#include +#include + +#define BLOCK_SIZE (8 * 1024 / VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +static inline int32_t float_to_fp16s(float input) { + union { + int32_t i; + __fp16 f[2]; + } fp32 = {.f = {(__fp16)input, (__fp16)input}}; + return fp32.i; +} + +static HVX_INLINE_ALWAYS uint32_t float_to_bits(float x) { + union { + float f; + uint32_t i; + } fp32 = {.f = x}; + return fp32.i; +} + +/* Polynomial coefficients */ +static const float c0_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.1329913082916337, + 0.22308514882873062, + 0.347752862580421, + 0.4845759228057826, + 0.5724725619240282, + 0.5532613332075828, + 0.5041402176920755, + 0.4999998945071365, + 0.500005251569411, + 0.494975832882496, + 0.44426898861108216, + 0.42865769845972046, + 0.5186084804556764, + 0.6556781472810073, + 0.7780379623543565, + 0.8670752648575938, +}; +static const float c1_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0595948414501292, + 0.11153317908159224, + 0.19545701719511055, + 0.3058925677063833, + 0.3932668307015573, + 0.3630691859433203, + 0.26302954631996744, + 0.2499155333713503, + 0.24983690256810576, + 0.26551386754654915, + 0.3670764533308477, + 0.39196882072648825, + 0.3030372911476408, + 0.19296191313371913, + 0.11084562978488391, + 0.059559556604464964, +}; +static const float c2_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.010207999856103376, + 0.02144807112969563, + 0.04266485934992188, + 0.07616157468726052, + 0.10882760873715347, + 0.09125379784995667, + 0.013872106909816257, + -0.0008786208359828815, + 0.0011993845621092196, + -0.01645080326288375, + -0.09367947263571219, + -0.10827006684348266, + -0.07520301291634655, + -0.04198514892887826, + -0.021290356584896874, + -0.010200991240527542, +}; +static const float c3_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0007896351019423816, + 0.0018718593077865326, + 0.004259190313167949, + 0.008784166436796144, + 0.014228201960903939, + 0.009727536748893095, + 
-0.01721317464724529, + -0.023762851116001377, + -0.02424226654277249, + -0.01604104065157868, + 0.010376786273973133, + 0.014122038833203628, + 0.008641365746408176, + 0.004176981844803722, + 0.0018557930308154783, + 0.0007890167735032168, +}; +static const float c4_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.3213858349988003e-05, + 6.232838199801025e-05, + 0.0001632037964535633, + 0.0003928983460811959, + 0.0007341577078787206, + 0.0003053082875419616, + -0.003254838747910248, + -0.004021655986643196, + 0.004258314078650583, + 0.0030578644020607566, + -0.00037014803880675387, + -0.0007265964578827031, + -0.0003849331969038772, + -0.00015947916435728337, + -6.171511304866758e-05, + -2.319341439172678e-05, +}; + +/** + * @brief Polynomial approximation of x/(exp(-x)+1.0) function. + * @param[in] input Input array of elements in IEEE 32-bit floating-point + * format. + * @param[out] output Output array of elements in IEEE 32-bit floating-point + * format. + * @param[in] length Number of elements in input/output arrays. + * @return Returns 0 on successful execution. Otherwise -1. + */ +int32_t hvx_silu_af(float *restrict input, float *restrict output, + uint32_t size) { + HVX_Vector *input_v_ptr; + HVX_UVector *output_v_ptr; + HVX_Vector input_min_v_f; + HVX_Vector input_shifted_v_qf32; + HVX_Vector input_scaled_v_qf32; + HVX_Vector scale_v; + HVX_Vector input_v_qf32; + HVX_Vector const16_0_v_sf; + HVX_Vector zero_v_sf; + HVX_Vector mask_idx1_v, mask_idx2_v; + HVX_Vector tmp_v, idx1_v, idx2_v; + HVX_Vector output_v; + HVX_Vector slinep; + HVX_Vector slinec; + HVX_Vector sline; + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + int32_t leftover_size = leftover * sizeof(float); + HVX_DV c0_coeff_dv; + HVX_VectorPair c0_coeff_vp; + HVX_Vector c0_coeff_v; + HVX_DV c1_coeff_dv; + HVX_VectorPair c1_coeff_vp; + HVX_Vector c1_coeff_v; + HVX_DV c2_coeff_dv; + HVX_VectorPair c2_coeff_vp; + HVX_Vector c2_coeff_v; + HVX_DV c3_coeff_dv; + HVX_VectorPair c3_coeff_vp; + HVX_Vector c3_coeff_v; + HVX_DV c4_coeff_dv; + HVX_VectorPair c4_coeff_vp; + HVX_Vector c4_coeff_v; + + HVX_Vector f8, f_8; + + /* Check input arguments. Return error status if some argument has invalid + * value */ + if ((input == 0) || (output == 0) || (size == 0)) { + return -1; + } + + input_v_ptr = (HVX_Vector *)input; + output_v_ptr = (HVX_UVector *)output; + + f8 = Q6_V_vsplat_R(float_to_bits(8.0f)); + f_8 = Q6_V_vsplat_R(float_to_bits(-8.0f)); + + /* + * If input data is not aligned to HVX vector size, compose aligned vectors + * from data loaded in slinep and slinec + */ + slinep = *input_v_ptr++; + + /* + * Splat scale factor in order to be used later for finding indexes of + * coefficients. Scale factor is represented in IEEE 16-bit floating-point + * format and it is calculated using the following formula: scale_factor = + * (16.0 / (b0 - a0)) NOTE: Calculated value is slightly decreased in order to + * avoid out of bound indexes during VLUT lookup. + */ + scale_v = Q6_V_vsplat_R(0x3f7ffffe); + + /* + * Vector of zeroes used as neutral element in sf to qf32 conversions. + * NOTE: Some of conversions (i.e conversion of scale factor and coefficients) + * can be avoided in real-time, but this is not done in order to don't + * sacrify code readibility in expense of insignificant performance + * improvement. 
+ */ + zero_v_sf = Q6_V_vzero(); + + /* Mask for extracting only 4 bits of mantissa */ + mask_idx1_v = Q6_V_vsplat_R(0x0000000F); + mask_idx2_v = Q6_V_vsplat_R(0x00000010); + + /* 16.0 in IEEE 16-bit floating-point representation */ + const16_0_v_sf = Q6_V_vsplat_R(0x41800000); + + /* + * Prepare vector of input_min values, that is used later in shifting input + * range. input_min is low boundary of specified input range. + */ + input_min_v_f = Q6_V_vsplat_R(0xc1000000); + + /* Convert scale factor from sf to q32. Use the same vector for both formats + */ + scale_v = Q6_Vqf32_vadd_VsfVsf(scale_v, zero_v_sf); + + /* Load coefficients */ + c0_coeff_v = *((HVX_Vector *)(c0_coeffs)); + c1_coeff_v = *((HVX_Vector *)(c1_coeffs)); + c2_coeff_v = *((HVX_Vector *)(c2_coeffs)); + c3_coeff_v = *((HVX_Vector *)(c3_coeffs)); + c4_coeff_v = *((HVX_Vector *)(c4_coeffs)); + + /* Convert coefficients from sf to qf32 format. Use the same vector for both + * representations */ + c0_coeff_v = Q6_Vqf32_vadd_VsfVsf(c0_coeff_v, zero_v_sf); + c1_coeff_v = Q6_Vqf32_vadd_VsfVsf(c1_coeff_v, zero_v_sf); + c2_coeff_v = Q6_Vqf32_vadd_VsfVsf(c2_coeff_v, zero_v_sf); + c3_coeff_v = Q6_Vqf32_vadd_VsfVsf(c3_coeff_v, zero_v_sf); + c4_coeff_v = Q6_Vqf32_vadd_VsfVsf(c4_coeff_v, zero_v_sf); + + /* Split 32-bit coefficients to lower and upper part in order to obtain them + * later with VLUT16. */ + c0_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c0_coeff_v); + c1_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c1_coeff_v); + c2_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c2_coeff_v); + c3_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c3_coeff_v); + c4_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c4_coeff_v); + + /* + * Handle number of whole vectors in input data. + * Don't process last vector in order to avoid out-of-boundary load. + */ + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(input_v_ptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + /* Process one vector at a time */ + for (int32_t j = 0; j < block; ++j) { + slinec = *input_v_ptr++; + + /* Compose vector of input data from slinec and slinep */ + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v_qf32 = + Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry information + * about integer index. 
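+ * Illustrative worked example (not part of the original comment), assuming
+ * the [-8.0, 8.0] input range implied by input_min above: x = 0.5 shifts to
+ * 8.5, scales to ~8.5 and lands at ~24.5 = 0x41C40000 after adding 16.0.
+ * 0x41C40000 >> 19 = 0x838, & 0x0F = 8, | 0x10 = 24, so VLUT16 selects
+ * coefficient set 24 (c0 ~= 0.5000, c1 ~= 0.2498, ...), and the polynomial
+ * evaluates to ~0.6225, matching sigmoid(0.5).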
+ */ + input_scaled_v_qf32 = + Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + // x * sigmod + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(input_v_qf32, output_v); + + HVX_Vector out_v = Q6_Vsf_equals_Vqf32(output_v); + + HVX_VectorPred islf8 = Q6_Q_vcmp_gt_VsfVsf(sline, f8); + out_v = Q6_V_vmux_QVV(islf8, sline, out_v); + + HVX_VectorPred islf_8 = Q6_Q_vcmp_gt_VsfVsf(f_8, sline); + out_v = Q6_V_vmux_QVV(islf_8, zero_v_sf, out_v); + + /* Store results to the output buffer and convert from qf32 to sf */ + *((HVX_UVector *)(output_v_ptr++)) = out_v; + + /* Prepare slinep for next iteration */ + slinep = slinec; + } + } + + /* Handle last whole vector from input data */ + if (vectors_in_rounddown > 0) { + slinec = + is_aligned(input_v_ptr, VLEN) && leftover == 0 ? slinep : *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v_qf32 = + Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
+ */ + input_scaled_v_qf32 = + Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + // x * sigmod + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(input_v_qf32, output_v); + + HVX_Vector out_v = Q6_Vsf_equals_Vqf32(output_v); + + HVX_VectorPred islf8 = Q6_Q_vcmp_gt_VsfVsf(sline, f8); + out_v = Q6_V_vmux_QVV(islf8, sline, out_v); + + HVX_VectorPred islf_8 = Q6_Q_vcmp_gt_VsfVsf(f_8, sline); + out_v = Q6_V_vmux_QVV(islf_8, zero_v_sf, out_v); + + /* Convert from qf32 to sf, store output and go to handle leftover */ + *((HVX_UVector *)(output_v_ptr++)) = out_v; + + slinep = slinec; + } + + /* Handle leftover elements */ + if (leftover > 0) { + slinec = + (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep + : *input_v_ptr++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_qf32 = Q6_Vqf32_vsub_VsfVsf(sline, input_min_v_f); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v_qf32 = + Q6_Vqf32_vmpy_Vqf32Vqf32(input_shifted_v_qf32, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
+ */ + input_scaled_v_qf32 = + Q6_Vqf32_vadd_Vqf32Vsf(input_scaled_v_qf32, const16_0_v_sf); + + /* Convert back from qf32 to sf in order to extract integer index */ + tmp_v = Q6_Vsf_equals_Vqf32(input_scaled_v_qf32); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuw_vlsr_VuwR(tmp_v, 19); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from sf vector to qf32 vector for Horner's method*/ + input_v_qf32 = Q6_Vqf32_vadd_VsfVsf(sline, zero_v_sf); + + /* Perform evaluation of polynomial using Horner's method */ + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c3_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c2_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c1_coeff_vp)); + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(output_v, input_v_qf32); + output_v = Q6_Vqf32_vadd_Vqf32Vqf32(output_v, Q6_V_lo_W(c0_coeff_vp)); + + // x * sigmod + output_v = Q6_Vqf32_vmpy_Vqf32Vqf32(input_v_qf32, output_v); + + HVX_Vector out_v = Q6_Vsf_equals_Vqf32(output_v); + + HVX_VectorPred islf8 = Q6_Q_vcmp_gt_VsfVsf(sline, f8); + out_v = Q6_V_vmux_QVV(islf8, sline, out_v); + + HVX_VectorPred islf_8 = Q6_Q_vcmp_gt_VsfVsf(f_8, sline); + out_v = Q6_V_vmux_QVV(islf_8, zero_v_sf, out_v); + + /* Store output */ + vstu_variable(output_v_ptr, leftover_size, out_v); + } + + return 0; +} + +static const float fp16_c0_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.13239719960243818, + 0.2216255210749415, + 0.3447664743728659, + 0.48137452032585476, + 0.5716299228719798, + 0.5547323231605259, + 0.5046287748870234, + 0.4999985574626892, + 0.5000036514755082, + 0.49475652448004626, + 0.4441393352532763, + 0.428500379952032, + 0.5173297285470642, + 0.6541461039833616, + 0.7783931007462818, + 0.8678015179911097, +}; +static const float fp16_c1_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.05928005756790343, + 0.11063222460270064, + 0.1932879057003057, + 0.30302440212086995, + 0.3922924462181049, + 0.36546332659415875, + 0.2644148210990377, + 0.24989020912329707, + 0.2498532691910313, + 
0.2661055781198988, + 0.36728015359480604, + 0.39215270010450015, + 0.3041825601732039, + 0.1940762094668647, + 0.11061794856987572, + 0.059174800917353595, +}; +static const float fp16_c2_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.010145494303219278, + 0.02123968384425681, + 0.04207468332514667, + 0.07519946712591977, + 0.10840620196267145, + 0.09270738184406795, + 0.015322371881818012, + -0.0009948273994921822, + 0.0011544907060402412, + -0.017040517565094934, + -0.09379878876657094, + -0.10835043868732394, + -0.07558705272699548, + -0.04228875316413285, + -0.021235740718738055, + -0.010124599879590107, +}; +static const float fp16_c3_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0007841223015974933, + 0.001850453397354219, + 0.004187899308371771, + 0.008640952434084206, + 0.01414741414964877, + 0.010117749275618, + -0.01654848996354919, + -0.02395108399453624, + -0.024199111971064446, + -0.015783556879607072, + 0.010407672131558174, + 0.014137608186323335, + 0.008698510795258909, + 0.004213708431213342, + 0.0018499827774393985, + 0.0007822799742289481, +}; +static const float fp16_c4_coeffs[32] __attribute__((aligned(VLEN))) = { + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.3031641204975905e-05, + 6.150442488966733e-05, + 0.00015997783736818624, + 0.00038491646239693526, + 0.0007283649599237781, + 0.00034439150914392054, + -0.003142246198646662, + -0.004120389580321761, + 0.004246050162553198, + 0.0030162727520777893, + -0.00037312974308425725, + -0.0007277242855014247, + -0.00038811687679772674, + -0.0001611434776868886, + -6.14837984586862e-05, + -2.297076123375133e-05, +}; + +/** + * @brief Polynomial approximation of 1.0/(exp(-x)+1.0) function. + * @param[in] input Input array of elements in IEEE 16-bit floating-point + * format. + * @param[out] output Output array of elements in IEEE 16-bit floating-point + * format. + * @param[in] length Number of elements in input/output arrays. + * @return Returns 0 on successful execution. Otherwise -1. + */ +int32_t hvx_silu_ahf(__fp16 *restrict input, __fp16 *restrict output, + uint32_t size) { + HVX_Vector *input_v_ptr; + HVX_UVector *output_v_ptr; + HVX_Vector input_min_v_hf; + HVX_Vector input_shifted_v_hf; + HVX_Vector input_scaled_v; + HVX_VectorPair input_vp_qf32; + // HVX_Vector input_v_qf16; + HVX_Vector mask_idx1_v, mask_idx2_v; + HVX_Vector const16_0_v_hf; + HVX_Vector zero_v_hf, one_v_hf; + HVX_Vector tmp_v; + HVX_Vector idx1_v, idx2_v; + HVX_Vector scale_v; + HVX_DV output_dv; + // HVX_Vector output_v; + HVX_Vector slinep, slinec, sline; + HVX_Vector sout; + int32_t block, l2fetch_block; + int32_t leftover = size & 63; + int32_t vectors_in_rounddown = size / 64; + int32_t leftover_size = leftover * sizeof(__fp16); + HVX_DV c0_coeff_dv; + HVX_VectorPair c0_coeff_vp; + HVX_Vector c0_coeff_v; + HVX_DV c1_coeff_dv; + HVX_VectorPair c1_coeff_vp; + HVX_Vector c1_coeff_v; + HVX_DV c2_coeff_dv; + HVX_VectorPair c2_coeff_vp; + HVX_Vector c2_coeff_v; + HVX_DV c3_coeff_dv; + HVX_VectorPair c3_coeff_vp; + HVX_Vector c3_coeff_v; + HVX_DV c4_coeff_dv; + HVX_VectorPair c4_coeff_vp; + HVX_Vector c4_coeff_v; + + /* Check input arguments. 
Return error status if some argument has invalid + * value */ + if ((input == 0) || (output == 0) || (size == 0)) { + return -1; + } + + input_v_ptr = (HVX_Vector *)input; + output_v_ptr = (HVX_UVector *)output; + + /* + * If input data is not aligned to HVX vector size, compose aligned vectors + * from data loaded in slinep and slinec + */ + slinep = *input_v_ptr++; + + /* + * Splat scale factor in order to be used later for finding indexes of + * coefficients. Scale factor is represented in IEEE 16-bit floating-point + * format and it is calculated using the following formula: scale_factor = + * (convert_sf_to_hf) (16.0 / (b0 - a0)) NOTE: Calculated value is slightly + * decreased in order to avoid out of bound indexes during VLUT lookup. + */ + scale_v = Q6_Vh_vsplat_R(0x3bfe); + + /* Vector of ones used as mpy neutral element in conversions from hf vector to + * qf32 vector pair */ + one_v_hf = Q6_Vh_vsplat_R(0x3c00); + + /* + * Vector of zeroes used as neutral element in hf to qf16 conversions. + * NOTE: Some of conversions (i.e conversion of scale factor and coefficients) + * can be avoided in real-time, but this is not done in order to don't + * sacrify code readibility in expense of insignificant performance + * improvement. + */ + zero_v_hf = Q6_V_vzero(); + + /* Mask for extracting only 4 bits of mantissa */ + mask_idx1_v = Q6_Vh_vsplat_R(0x000F); + + mask_idx2_v = Q6_V_vsplat_R(0x00001010); + + /* 16.0 in IEEE 16-bit floating-point representation */ + const16_0_v_hf = Q6_Vh_vsplat_R(0x4c00); + + /* + * Prepare vector of input_min values, that is used later in shifting input + * range. input_min is low boundary of specified input range. + */ + input_min_v_hf = Q6_Vh_vsplat_R(0xc800); + + /* Convert scale factor from hf to q16. Use the same vector for both formats + */ + scale_v = Q6_Vqf16_vadd_VhfVhf(scale_v, zero_v_hf); + + /* Load coefficients */ + c0_coeff_v = *((HVX_Vector *)(fp16_c0_coeffs)); + c1_coeff_v = *((HVX_Vector *)(fp16_c1_coeffs)); + c2_coeff_v = *((HVX_Vector *)(fp16_c2_coeffs)); + c3_coeff_v = *((HVX_Vector *)(fp16_c3_coeffs)); + c4_coeff_v = *((HVX_Vector *)(fp16_c4_coeffs)); + + /* Convert coefficients from hf to qf32 format. Use the same vector for both + * representations */ + c0_coeff_v = Q6_Vqf32_vadd_VsfVsf(c0_coeff_v, zero_v_hf); + c1_coeff_v = Q6_Vqf32_vadd_VsfVsf(c1_coeff_v, zero_v_hf); + c2_coeff_v = Q6_Vqf32_vadd_VsfVsf(c2_coeff_v, zero_v_hf); + c3_coeff_v = Q6_Vqf32_vadd_VsfVsf(c3_coeff_v, zero_v_hf); + c4_coeff_v = Q6_Vqf32_vadd_VsfVsf(c4_coeff_v, zero_v_hf); + + /* Split 32-bit coefficients to lower and upper part in order to obtain them + * later with VLUT16. */ + c0_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c0_coeff_v); + c1_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c1_coeff_v); + c2_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c2_coeff_v); + c3_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c3_coeff_v); + c4_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c4_coeff_v); + + /* + * Handle number of whole vectors in input data. + * Don't process last vector in order to avoid out-of-boundary load. 
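+ * Blocking note (added for clarity; assumes the usual 128-byte HVX vectors,
+ * i.e. VLEN = 128): BLOCK_SIZE = 8 * 1024 / VLEN = 64 vectors, so each outer
+ * iteration processes up to 8 KB while l2fetch() prefetches the block
+ * L2FETCH_AHEAD (= 64) vectors ahead; l2fetch_block drops to <= 0 near the
+ * end, so nothing is prefetched past the input buffer.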
+ */ + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) { + l2fetch(input_v_ptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + /* Process one vector at a time */ + for (int32_t j = 0; j < block; ++j) { + slinec = *input_v_ptr++; + + /* Compose vector of input data from slinec and slinep */ + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + tmp_v = Q6_Vh_vdeal_Vh(sline); + + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from + * [0,16.0) to [16.0,32.0) in order to convert float indexes to integer + * values. Float values, represented in IEEE 754, in range [16.0,32.0] + * have the same exponent, which means 4 MSB of mantissa carry information + * about integer index. Use the same input_scaled_v vector for hf and qf16 + * representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, + Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, + Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, + Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, + Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, + Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, 
Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // input_v_qf16 = Q6_Vqf16_vmpy_VhfVhf(sline, one_v_hf); + + // output_v = Q6_Vqf16_vmpy_Vqf16Vhf(input_v_qf16, + // Q6_Vhf_equals_Wqf32(c4_coeff_vp)); output_v = + // Q6_Vqf16_vadd_Vqf16Vhf(output_v, Q6_Vhf_equals_Wqf32(c3_coeff_vp)); + // output_v = Q6_Vqf16_vmpy_Vqf16Vqf16(output_v, input_v_qf16); + // output_v = Q6_Vqf16_vadd_Vqf16Vhf(output_v, + // Q6_Vhf_equals_Wqf32(c2_coeff_vp)); output_v = + // Q6_Vqf16_vmpy_Vqf16Vqf16(output_v, input_v_qf16); output_v = + // Q6_Vqf16_vadd_Vqf16Vhf(output_v, Q6_Vhf_equals_Wqf32(c1_coeff_vp)); + // output_v = Q6_Vqf16_vmpy_Vqf16Vqf16(output_v, input_v_qf16); + // output_v = Q6_Vqf16_vadd_Vqf16Vhf(output_v, + // Q6_Vhf_equals_Wqf32(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + /* Store results to the output buffer and convert from qf16 to hf */ + *output_v_ptr++ = Q6_Vhf_equals_Wqf32(output_dv.VV); + // output_v = Q6_Vqf16_vmpy_Vqf16Vqf16(output_v, input_v_qf16); + // *output_v_ptr++ = Q6_Vhf_equals_Vqf16(output_v); + + /* Prepare slinep for next iteration */ + slinep = slinec; + } + } + + /* Handle last whole vector from input data */ + if (vectors_in_rounddown > 0) { + slinec = + is_aligned(input_v_ptr, VLEN) && leftover == 0 ? slinep : *input_v_ptr++; + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + tmp_v = Q6_Vh_vdeal_Vh(sline); + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
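+ * Hand-worked example for the fp16 path (added for illustration, assuming
+ * the [-8.0, 8.0] range set via input_min above): x = 0.5 shifts/scales to
+ * ~24.5, i.e. half-precision 0x4E20; 0x4E20 >> 6 = 0x138 and & 0x000F = 8,
+ * so segment 8 of 16 is chosen and the lookup reads entry 16 + 8 = 24 of the
+ * fp16_c* tables (c0 ~= 0.5000), giving ~0.6225 == sigmoid(0.5).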
+ * Use the same input_scaled_v vector for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + /* Convert from qf32 to hf, store output and go to handle leftover */ + *output_v_ptr++ = Q6_Vhf_equals_Wqf32(output_dv.VV); + + slinep = slinec; + } + + /* Handle leftover elements */ + if (leftover > 0) { + slinec = + 
(is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep + : *input_v_ptr++); + + sline = Q6_V_valign_VVR(slinec, slinep, (size_t)input); + tmp_v = Q6_Vh_vdeal_Vh(sline); + /* Shift input range from [input_min, input_max] to [0, input_max - + * input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* Scale shifted input range from [0, input_max - input_min] to [0,16.0) */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. + * Use the same input_scaled_v vector for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = + Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), + Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), + Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + 
Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = + Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = + Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + /* Convert from qf16 to hf */ + sout = Q6_Vhf_equals_Wqf32(output_dv.VV); + + /* Store output */ + vstu_variable(output_v_ptr, leftover_size, sout); + } + + return 0; +} + +#endif + +template +GraphStatus siluImpl(TensorType &out_0, const TensorType &in_0) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + +#ifdef REFERENCE_OP + debuglog("silu execute... inval=(%d)", in_0.get_dtype()); + debuglog("silu execute... inval=(%d)", out_0.get_dtype()); + + out_0.set_dims(in_0); + // NHWC + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // SiLU + for (Idx d = 0; d < d_in; d++) { + float inval = in_0(b, h, w, d); + float outval = 1 / (1 + expf(-inval)); + + debuglog("silu execute... inval=(%f)", inval); + debuglog("silu execute... outval=(%f)", outval); + + out_0(b, h, w, d) = inval * outval; + } + } + } + } + +#else + + // HVX Method -- FP32 Version + out_0.set_dims(in_0); + + DType dtype = in_0.get_dtype(); + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + size_t size = b_in * h_in * w_in * d_in; + + // Noticable size >= 128 + + // SiLU inval / (1 + expf(-inval)); + // sigmod 1.0/(exp(-x)+1.0) + // SiLU inval * sigmod + + if (dtype == DType::Float16) { + + // NHWC + auto in_ptr = (__fp16 *)in_0.raw_data_const(); + auto out_ptr = (__fp16 *)out_0.raw_data(); + hvx_silu_ahf(in_ptr, out_ptr, size); + + } else { + // NHWC + auto in_ptr = (float *)in_0.raw_data_const(); + auto out_ptr = (float *)out_0.raw_data(); + hvx_silu_af(in_ptr, out_ptr, size); + } + + return GraphStatus::Success; + +#endif + +#ifdef DEBUG + + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // SiLU + for (Idx d = 0; d < d_in; d++) { + float out_value = out_0(b, h, w, d); + debuglog("silu execute... 
outval=(%f)", out_value); + } + } + } + } + +#endif + + return GraphStatus::Success; +} + +__attribute__((unused)) static float siluCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_SiLU); diff --git a/nntrainer/npu/qnn/LLaMAPackage/src/ops/SplitInput.cpp b/nntrainer/npu/qnn/LLaMAPackage/src/ops/SplitInput.cpp new file mode 100644 index 000000000..832420ca1 --- /dev/null +++ b/nntrainer/npu/qnn/LLaMAPackage/src/ops/SplitInput.cpp @@ -0,0 +1,154 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_SplitInput); + +// op execute function declarations +template +GraphStatus splitinputImpl(TensorType &out_0, TensorType &out_1, + const TensorType &in_0, const TensorType1 &in_1, + const Tensor &num); + +// forward declaration of sample cost function +static float splitinputCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. + * DEF_PACKAGE_OP((splitinputImpl), "SplitInput") + */ +DEF_PACKAGE_OP((splitinputImpl), "SplitInput") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((splitinputImpl), "SplitInput", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((splitinputImpl), "SplitInput", splitinputCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) 
+ * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +template +GraphStatus splitinputImpl(TensorType &out_0, TensorType &out_1, + const TensorType &in_0, const TensorType1 &in_1, + const Tensor &num) { + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + + // default is two. + + size_t o_size = in_1(0, 0, 0, 0); + size_t x_size = in_1(0, 0, 0, 1); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + + const size_t dims_0[] = {b_in, o_size, w_in, d_in}; + const size_t dims_1[] = {b_in, x_size, w_in, d_in}; + + out_0.set_dims(dims_0); + out_1.set_dims(dims_1); + + DType dtype = in_0.get_dtype(); + uint32_t bitwidth = 4; + + if (dtype == DType::QUInt8 || dtype == DType::QInt8) { + + bitwidth = 1; + + } else if (dtype == DType::Float16) { + + bitwidth = 2; + } else if (dtype == DType::Float32) { + + bitwidth = 4; + } + + const uint8_t *in_ptr = (uint8_t *)in_0.raw_data_const(); + + uint8_t *out_ptr_0 = (uint8_t *)out_0.raw_data(); + uint8_t *out_ptr_1 = (uint8_t *)out_1.raw_data(); + + memcpy(out_ptr_0, in_ptr, b_in * o_size * w_in * d_in * bitwidth); + in_ptr += b_in * o_size * w_in * d_in * bitwidth; + + memcpy(out_ptr_1, in_ptr, b_in * x_size * w_in * d_in * bitwidth * 4); + + return GraphStatus::Success; +} + +__attribute__((unused)) static float splitinputCostFunc(const Op *op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_SplitInput); diff --git a/nntrainer/npu/qnn/Model/QnnModel.cpp b/nntrainer/npu/qnn/Model/QnnModel.cpp new file mode 100644 index 000000000..41c368154 --- /dev/null +++ b/nntrainer/npu/qnn/Model/QnnModel.cpp @@ -0,0 +1,668 @@ +//============================================================================== +// +// Copyright (c) 2019-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// +//============================================================================== + +#include +#include +#include +#include + +#include "QnnModel.hpp" +#include "QnnModelPal.hpp" +#include "QnnTypeMacros.hpp" + +#define FREE_MEMORY(ptr1, ptr2, ptr3) \ + do { \ + free(ptr1); \ + free(ptr2); \ + free(ptr3); \ + } while (0) + +namespace qnn_wrapper_api { + +ModelError_t QnnModel::initialize(const Qnn_BackendHandle_t &backendHandle, + const QNN_INTERFACE_VER_TYPE &qnnInterface, + const Qnn_ContextHandle_t &context, + const char *graphName, bool debug, + uint8_t doNodeValidations, + const QnnGraph_Config_t **graphConfigs) { + if (backendHandle == nullptr) { + PRINT_ERROR("QnnModel::initialize() nullptr passed as backend handle."); + return MODEL_CONTEXT_ERROR; + } + if (context == nullptr) { + PRINT_ERROR("QnnModel::initialize() nullptr passed as context handle."); + return MODEL_CONTEXT_ERROR; + } + if (graphName == nullptr) { + PRINT_ERROR("QnnModel::initialize() nullptr passed as graphName."); + return MODEL_GRAPH_ERROR; + } + + if (!m_graphName.empty()) { + // only one graph is allowed per QnnModel + PRINT_ERROR( + "QnnModel::initialize() model for graph %s already initialized.", + graphName); + return MODEL_GRAPH_ERROR; + } + + if (!m_doNodeValidations) { + PRINT_WARNING("Node validation disabled. Backend will not perform op " + "validation prior to adding Node. \n"); + } + + m_qnnInterface = qnnInterface; + m_backendHandle = backendHandle; + m_graphName = graphName; + m_debug = debug; + m_doNodeValidations = doNodeValidations; + + if (m_qnnInterface.graphCreate(context, graphName, graphConfigs, &m_graph) != + QNN_GRAPH_NO_ERROR || + m_graph == nullptr) { + PRINT_ERROR( + "QnnModel::initialize() not able to create graph in given context."); + return MODEL_GRAPH_ERROR; + } + + return MODEL_NO_ERROR; +} + +ModelError_t QnnModel::addTensor(const char *nodeName, Qnn_Tensor_t *tensor, + bool saveTensor) { + ModelError_t err; + if (!tensor) { + PRINT_ERROR("QnnModel::addTensor() NULL tensor pointer provided.\n"); + return MODEL_TENSOR_ERROR; + } + VALIDATE_TENSOR_VERSION((*tensor), err); + + // Verify tensor being added is not a duplicate + std::string mapEntry = std::string(QNN_TENSOR_GET_NAME(tensor)); + if (m_modelTensorsMap.find(mapEntry) != m_modelTensorsMap.end()) { + PRINT_ERROR("QnnModel::addTensor() creating tensor %s for node %s. 
Tensor " + "already exists.\n", + mapEntry.c_str(), nodeName); + + return MODEL_TENSOR_ERROR; + } + + const std::map dataTypeToSize = { + {QNN_DATATYPE_INT_8, 1}, {QNN_DATATYPE_INT_16, 2}, + {QNN_DATATYPE_INT_32, 4}, {QNN_DATATYPE_INT_64, 8}, + {QNN_DATATYPE_UINT_8, 1}, {QNN_DATATYPE_UINT_16, 2}, + {QNN_DATATYPE_UINT_32, 4}, {QNN_DATATYPE_UINT_64, 8}, + {QNN_DATATYPE_FLOAT_16, 2}, {QNN_DATATYPE_FLOAT_32, 4}, + {QNN_DATATYPE_BOOL_8, 1}, {QNN_DATATYPE_SFIXED_POINT_8, 1}, + {QNN_DATATYPE_SFIXED_POINT_16, 2}, {QNN_DATATYPE_SFIXED_POINT_32, 4}, + {QNN_DATATYPE_UFIXED_POINT_8, 1}, {QNN_DATATYPE_UFIXED_POINT_16, 2}, + {QNN_DATATYPE_UFIXED_POINT_32, 4}, + }; + + if (dataTypeToSize.find(QNN_TENSOR_GET_DATA_TYPE(tensor)) == + dataTypeToSize.end()) { + PRINT_ERROR("QnnModel::addTensor() invalid QNN data type provided, %u, for " + "tensor %s on node %s\n", + QNN_TENSOR_GET_DATA_TYPE(tensor), QNN_TENSOR_GET_NAME(tensor), + nodeName); + return MODEL_TENSOR_ERROR; + } + + // sanity check tensor data if addTensor used for static tensor + if (QNN_TENSOR_GET_TYPE(tensor) == QNN_TENSOR_TYPE_STATIC) { + if (QNN_TENSOR_GET_MEM_TYPE(tensor) != QNN_TENSORMEMTYPE_RAW) { + PRINT_ERROR("QnnModel::addTensor(): Expected raw memType in provided " + "static tensor %s for node %s", + mapEntry.c_str(), nodeName); + return MODEL_TENSOR_ERROR; + } + // verify size expressed by the dims matches the raw tensor size + uint32_t qnnTensorSize = std::accumulate( + QNN_TENSOR_GET_DIMENSIONS(tensor), + QNN_TENSOR_GET_DIMENSIONS(tensor) + QNN_TENSOR_GET_RANK(tensor), + (uint32_t)dataTypeToSize.find(QNN_TENSOR_GET_DATA_TYPE(tensor))->second, + std::multiplies()); + if (qnnTensorSize != QNN_TENSOR_GET_CLIENT_BUF(tensor).dataSize) { + PRINT_ERROR("QnnModel::addTensor(): Adding STATIC tensor, length " + "mismatch between clientBuf" + "size and tensor Dims(dim * rank * sizeof(datatype) for, " + "nodeName: %s, tensorName: %s." 
+                  "Got tensorSize: %d, tensor.clientBuf.dataSize: %d.\n",
+                  nodeName, QNN_TENSOR_GET_NAME(tensor), qnnTensorSize,
+                  QNN_TENSOR_GET_CLIENT_BUF(tensor).dataSize);
+      return MODEL_TENSOR_ERROR;
+    }
+  }
+
+  if (m_debug && QNN_TENSOR_GET_TYPE(tensor) == QNN_TENSOR_TYPE_NATIVE) {
+    // for debug, make all tensors accessible by client
+    QNN_TENSOR_SET_TYPE(tensor, QNN_TENSOR_TYPE_APP_READ);
+  }
+
+  if (m_qnnInterface.tensorCreateGraphTensor(m_graph, tensor) !=
+      QNN_TENSOR_NO_ERROR) {
+    PRINT_ERROR(
+      "QnnModel::addTensor() Creating tensor for node: %s, tensorName: %s.\n",
+      nodeName, QNN_TENSOR_GET_NAME(tensor));
+    return MODEL_TENSOR_ERROR;
+  }
+
+  if (saveTensor) {
+    Qnn_Tensor_t tensorCopy;
+    VALIDATE(deepCopyQnnTensors(*tensor, tensorCopy), err);
+
+    // save network input/outputs tensors to use for setting the Qnn graph's
+    // input and output tensors for populating GraphInfo_t for caller
+    if (QNN_TENSOR_GET_TYPE(tensor) == QNN_TENSOR_TYPE_APP_WRITE) {
+      m_modelInputTensors.push_back(tensorCopy);
+    } else if (QNN_TENSOR_GET_TYPE(tensor) == QNN_TENSOR_TYPE_APP_READ) {
+      m_modelOutputTensors.push_back(tensorCopy);
+    }
+
+    // save created tensors for later lookup to populate graph node construction
+    m_modelTensorsMap[mapEntry] = tensorCopy;
+  }
+
+  return MODEL_NO_ERROR;
+}
+
+ModelError_t QnnModel::addTensor(const char *nodeName, Qnn_Tensor_t tensor,
+                                 bool saveTensor) {
+  return addTensor(nodeName, &tensor, saveTensor);
+}
+
+ModelError_t QnnModel::getQnnTensor(const char *&nodeName,
+                                    const char *&tensorName,
+                                    Qnn_Tensor_t &tensor) {
+  std::string mapEntry = std::string(tensorName);
+  if (m_modelTensorsMap.find(tensorName) == m_modelTensorsMap.end()) {
+    PRINT_ERROR("QnnModel::getQnnTensor() tensor %s not found on node %s\n",
+                mapEntry.c_str(), nodeName);
+    return MODEL_TENSOR_ERROR;
+  }
+  tensor = m_modelTensorsMap[mapEntry];
+
+  return MODEL_NO_ERROR;
+}
+
+// overload for string tensorName
+ModelError_t QnnModel::getQnnTensor(std::string nodeName,
+                                    std::string tensorName,
+                                    Qnn_Tensor_t &tensor) {
+  if (m_modelTensorsMap.find(tensorName) == m_modelTensorsMap.end()) {
+    PRINT_ERROR("QnnModel::getQnnTensor() tensor %s not found on node %s\n",
+                tensorName.c_str(), nodeName.c_str());
+    return MODEL_TENSOR_ERROR;
+  }
+  tensor = m_modelTensorsMap[tensorName];
+
+  return MODEL_NO_ERROR;
+}
+
+ModelError_t QnnModel::addNode(Qnn_OpConfigVersion_t version, const char *name,
+                               const char *packageName, const char *type,
+                               Qnn_Param_t *params, uint32_t numOfParams,
+                               const char **inputNames, uint32_t numOfInputs,
+                               Qnn_Tensor_t *outputTensors,
+                               uint32_t numOfOutputs) {
+  ModelError_t nodeError;
+  Qnn_OpConfig_t opDefinition = QNN_OPCONFIG_INIT;
+  opDefinition.version = version;
+  VALIDATE_OP_CONFIG_VERSION((opDefinition), nodeError);
+
+  // populate Qnn param for node
+  Qnn_Param_t *nodeParams =
+    (Qnn_Param_t *)malloc(numOfParams * sizeof(Qnn_Param_t));
+
+  // populate input tensors for node
+  Qnn_Tensor_t *inputs =
+    (Qnn_Tensor_t *)malloc(numOfInputs * sizeof(Qnn_Tensor_t));
+
+  // populate output tensors of node
+  Qnn_Tensor_t *outputs =
+    (Qnn_Tensor_t *)malloc(numOfOutputs * sizeof(Qnn_Tensor_t));
+
+  if (nodeParams == nullptr || inputs == nullptr || outputs == nullptr) {
+    PRINT_ERROR("QnnModel::addNode() failed to allocate memory for creating "
+                "QNN OpConfig for node %s.\n",
+                name);
+    FREE_MEMORY(nodeParams, inputs, outputs);
+    return MODEL_MEMORY_ALLOCATE_ERROR;
+  }
+  uint32_t nodeParamsCounter = 0;
+  for (size_t i = 0; i < numOfParams; i++) {
+    switch (params[i].paramType) {
+    case QNN_PARAMTYPE_TENSOR: {
+      Qnn_Tensor_t &tensor = params[i].tensorParam;
+      // Note: set saveTensor to false as no need to save tensor beyond this
+      // function call for params
+      nodeError = addTensor(name, &tensor, false);
+      if (nodeError != MODEL_NO_ERROR) {
+        PRINT_ERROR("QnnModel::addNode() addTensor() failed for tensor param "
+                    "%s on node %s.\n",
+                    QNN_TENSOR_GET_NAME(tensor), name);
+        FREE_MEMORY(nodeParams, inputs, outputs);
+        return nodeError;
+      }
+      nodeParams[nodeParamsCounter].paramType = QNN_PARAMTYPE_TENSOR;
+      nodeParams[nodeParamsCounter].name = params[i].name;
+      nodeParams[nodeParamsCounter++].tensorParam = tensor;
+      break;
+    }
+    case QNN_PARAMTYPE_SCALAR: {
+      nodeParams[nodeParamsCounter].paramType = QNN_PARAMTYPE_SCALAR;
+      nodeParams[nodeParamsCounter].name = params[i].name;
+      nodeParams[nodeParamsCounter++].scalarParam = params[i].scalarParam;
+      break;
+    }
+    default: {
+      PRINT_ERROR("QnnModel::addNode() unknown param type passed for param %s "
+                  "on node %s.\n",
+                  params[i].name, name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return MODEL_PARAMS_ERROR;
+    }
+    }
+  }
+
+  size_t inputsCounter = 0;
+  for (size_t j = 0; j < numOfInputs; j++) {
+    nodeError = getQnnTensor(name, inputNames[j], inputs[inputsCounter++]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() getQnnTensor() failed for tensor %s on node %s.\n",
+        inputNames[j], name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+  }
+
+  size_t outputsCounter = 0;
+  m_modelOutputTensorMap[name] = {};
+  for (size_t k = 0; k < numOfOutputs; k++) {
+    // create node output tensors first
+    nodeError = addTensor(name, outputTensors[k]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() addTensor() failed for tensor %s on node %s\n",
+        QNN_TENSOR_GET_NAME(outputTensors[k]), name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+    const char *outTensorName = QNN_TENSOR_GET_NAME(outputTensors[k]);
+    m_modelOutputTensorMap[name].push_back(outTensorName);
+    nodeError = getQnnTensor(name, outTensorName, outputs[outputsCounter++]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() getQnnTensor() failed for tensor %s on node %s.\n",
+        outTensorName, name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+  }
+
+  // define and add node to graph
+  QNN_OP_CFG_SET_NAME(opDefinition, name);
+  QNN_OP_CFG_SET_PACKAGE_NAME(opDefinition, packageName);
+  QNN_OP_CFG_SET_TYPE_NAME(opDefinition, type);
+  QNN_OP_CFG_SET_PARAMS(opDefinition, numOfParams, nodeParams);
+  QNN_OP_CFG_SET_INPUTS(opDefinition, numOfInputs, inputs);
+  QNN_OP_CFG_SET_OUTPUTS(opDefinition, numOfOutputs, outputs);
+
+  if (m_doNodeValidations) {
+    auto validationStatus =
+      m_qnnInterface.backendValidateOpConfig(m_backendHandle, opDefinition);
+    if (validationStatus == QNN_BACKEND_ERROR_NOT_SUPPORTED) {
+      PRINT_DEBUG("QnnModel::addNode() validation API not supported.\n");
+    } else if (validationStatus != QNN_SUCCESS) {
+      PRINT_ERROR("QnnModel::addNode() validating node %s failed.\n", name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return MODEL_GRAPH_ERROR;
+    }
+  }
+
+  if (m_qnnInterface.graphAddNode(m_graph, opDefinition) !=
+      QNN_GRAPH_NO_ERROR) {
+    PRINT_ERROR("QnnModel::addNode() adding node %s failed.\n", name);
+    FREE_MEMORY(nodeParams, inputs, outputs);
+    return MODEL_GRAPH_ERROR;
+  }
+
+  FREE_MEMORY(nodeParams, inputs, outputs);
+  return MODEL_NO_ERROR;
+}
+
+// overload for vector of inputNames
+ModelError_t
+QnnModel::addNode(Qnn_OpConfigVersion_t version, const char *name,
+                  const char *packageName, const char *type,
+                  Qnn_Param_t *params, uint32_t numOfParams,
+                  std::vector<std::string> inputNames,
+                  uint32_t numOfInputs,
+                  Qnn_Tensor_t *outputTensors,
+                  uint32_t numOfOutputs) {
+  ModelError_t nodeError;
+  Qnn_OpConfig_t opDefinition = QNN_OPCONFIG_INIT;
+  opDefinition.version = version;
+  VALIDATE_OP_CONFIG_VERSION((opDefinition), nodeError);
+
+  // populate Qnn param for node
+  Qnn_Param_t *nodeParams =
+    (Qnn_Param_t *)malloc(numOfParams * sizeof(Qnn_Param_t));
+
+  // populate input tensors for node
+  Qnn_Tensor_t *inputs =
+    (Qnn_Tensor_t *)malloc(numOfInputs * sizeof(Qnn_Tensor_t));
+
+  // populate output tensors of node
+  Qnn_Tensor_t *outputs =
+    (Qnn_Tensor_t *)malloc(numOfOutputs * sizeof(Qnn_Tensor_t));
+
+  if (nodeParams == nullptr || inputs == nullptr || outputs == nullptr) {
+    PRINT_ERROR("QnnModel::addNode() failed to allocate memory for creating "
+                "QNN OpConfig for node %s.\n",
+                name);
+    FREE_MEMORY(nodeParams, inputs, outputs);
+    return MODEL_MEMORY_ALLOCATE_ERROR;
+  }
+  uint32_t nodeParamsCounter = 0;
+  for (size_t i = 0; i < numOfParams; i++) {
+    switch (params[i].paramType) {
+    case QNN_PARAMTYPE_TENSOR: {
+      Qnn_Tensor_t &tensor = params[i].tensorParam;
+      // Note: set saveTensor to false as no need to save tensor beyond this
+      // function call for params
+      nodeError = addTensor(name, &tensor, false);
+      if (nodeError != MODEL_NO_ERROR) {
+        PRINT_ERROR("QnnModel::addNode() addTensor() failed for tensor param "
+                    "%s on node %s.\n",
+                    QNN_TENSOR_GET_NAME(tensor), name);
+        FREE_MEMORY(nodeParams, inputs, outputs);
+        return nodeError;
+      }
+      nodeParams[nodeParamsCounter].paramType = QNN_PARAMTYPE_TENSOR;
+      nodeParams[nodeParamsCounter].name = params[i].name;
+      nodeParams[nodeParamsCounter++].tensorParam = tensor;
+      break;
+    }
+    case QNN_PARAMTYPE_SCALAR: {
+      nodeParams[nodeParamsCounter].paramType = QNN_PARAMTYPE_SCALAR;
+      nodeParams[nodeParamsCounter].name = params[i].name;
+      nodeParams[nodeParamsCounter++].scalarParam = params[i].scalarParam;
+      break;
+    }
+    default: {
+      PRINT_ERROR("QnnModel::addNode() unknown param type passed for param %s "
+                  "on node %s.\n",
+                  params[i].name, name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return MODEL_PARAMS_ERROR;
+    }
+    }
+  }
+
+  size_t inputsCounter = 0;
+  for (size_t j = 0; j < numOfInputs; j++) {
+    nodeError = getQnnTensor(name, inputNames[j], inputs[inputsCounter++]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() getQnnTensor() failed for tensor %s on node %s.\n",
+        inputNames[j].c_str(), name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+  }
+
+  size_t outputsCounter = 0;
+  m_modelOutputTensorMap[name] = {};
+  for (size_t k = 0; k < numOfOutputs; k++) {
+    // create node output tensors first
+    nodeError = addTensor(name, outputTensors[k]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() addTensor() failed for tensor %s on node %s\n",
+        QNN_TENSOR_GET_NAME(outputTensors[k]), name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+    const char *outTensorName = QNN_TENSOR_GET_NAME(outputTensors[k]);
+    m_modelOutputTensorMap[name].push_back(outTensorName);
+    nodeError = getQnnTensor(name, outTensorName, outputs[outputsCounter++]);
+    if (nodeError != MODEL_NO_ERROR) {
+      PRINT_ERROR(
+        "QnnModel::addNode() getQnnTensor() failed for tensor %s on node %s.\n",
+        outTensorName, name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return nodeError;
+    }
+  }
+
+  // define and add node to graph
+  QNN_OP_CFG_SET_NAME(opDefinition, name);
+  QNN_OP_CFG_SET_PACKAGE_NAME(opDefinition, packageName);
+  QNN_OP_CFG_SET_TYPE_NAME(opDefinition, type);
+  QNN_OP_CFG_SET_PARAMS(opDefinition, numOfParams, nodeParams);
+  QNN_OP_CFG_SET_INPUTS(opDefinition, numOfInputs, inputs);
+  QNN_OP_CFG_SET_OUTPUTS(opDefinition, numOfOutputs, outputs);
+
+  if (m_doNodeValidations) {
+    auto validationStatus =
+      m_qnnInterface.backendValidateOpConfig(m_backendHandle, opDefinition);
+    if (validationStatus == QNN_BACKEND_ERROR_NOT_SUPPORTED) {
+      PRINT_DEBUG("QnnModel::addNode() validation API not supported.\n");
+    } else if (validationStatus != QNN_SUCCESS) {
+      PRINT_ERROR("QnnModel::addNode() validating node %s failed.\n", name);
+      FREE_MEMORY(nodeParams, inputs, outputs);
+      return MODEL_GRAPH_ERROR;
+    }
+  }
+
+  if (m_qnnInterface.graphAddNode(m_graph, opDefinition) !=
+      QNN_GRAPH_NO_ERROR) {
+    PRINT_ERROR("QnnModel::addNode() adding node %s failed.\n", name);
+    FREE_MEMORY(nodeParams, inputs, outputs);
+    return MODEL_GRAPH_ERROR;
+  }
+
+  FREE_MEMORY(nodeParams, inputs, outputs);
+  return MODEL_NO_ERROR;
+}
+
+ModelError_t QnnModel::freeCachedTensors() {
+  ModelError_t err = MODEL_NO_ERROR;
+
+  // cleanup cached tensors
+  for (std::map<std::string, Qnn_Tensor_t>::iterator tensorIt =
+         m_modelTensorsMap.begin();
+       tensorIt != m_modelTensorsMap.end();) {
+    Qnn_Tensor_t &tensor = tensorIt->second;
+    if (QNN_TENSOR_GET_TYPE(tensor) != QNN_TENSOR_TYPE_APP_WRITE &&
+        QNN_TENSOR_GET_TYPE(tensor) != QNN_TENSOR_TYPE_APP_READ) {
+      VALIDATE(freeQnnTensor(tensor), err);
+      tensorIt = m_modelTensorsMap.erase(tensorIt);
+    } else {
+      tensorIt++;
+    }
+  }
+
+  return err;
+}
+
+ModelError_t QnnModel::finalize(Qnn_ProfileHandle_t profile,
+                                Qnn_SignalHandle_t signal) {
+  ModelError_t err;
+
+  // finalize the graph
+  if (m_qnnInterface.graphFinalize(m_graph, profile, signal) !=
+      QNN_GRAPH_NO_ERROR) {
+    PRINT_ERROR("QnnModel::finalize() finalizing graph failed.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  VALIDATE(freeCachedTensors(), err);
+
+  return err;
+}
+
+ModelError_t getGraphInfoFromModels(QnnModel *models, uint32_t numModels,
+                                    GraphInfoPtr_t **graphsInfo) {
+  ModelError_t err = MODEL_NO_ERROR;
+  if (models == nullptr || graphsInfo == nullptr || numModels <= 0) {
+    PRINT_ERROR("getGraphInfoFromModels() models and graphsInfo uninitialized "
+                "or number of models is "
+                "<= 0.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  *graphsInfo = (GraphInfo_t **)malloc(numModels * sizeof(GraphInfo_t *));
+  if (*graphsInfo == nullptr) {
+    PRINT_ERROR(
+      "getGraphInfoFromModels() graphsInfo malloc returned nullptr.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  GraphInfo_t *graphArr =
+    (GraphInfo_t *)malloc(numModels * sizeof(GraphInfo_t));
+  if (graphArr == nullptr) {
+    PRINT_ERROR("getGraphInfoFromModels() graphArr malloc returned nullptr.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  for (uint32_t i = 0; i < numModels; i++) {
+    QnnModel &model = models[i];
+    graphArr[i].graph = model.getQnnGraph();
+    graphArr[i].graphName =
+      strnDup(model.getQnnGraphName().c_str(), model.getQnnGraphName().size());
+    if (graphArr[i].graphName == nullptr) {
+      PRINT_ERROR("getGraphInfoFromModels() failed to construct graphName. "
+                  "Received nullptr.\n");
+      return MODEL_GRAPH_ERROR;
+    }
+
+    // allocate and add graph input/output TensorsWrapper. Note: no need to make
+    // deep copies of the tensor's pointer members as they are already allocated
+    // on heap in the addTensor function call.
+    std::vector<Qnn_Tensor_t> graphInputTensors = model.getGraphInputTensors();
+    size_t numInputTensors = graphInputTensors.size();
+    size_t inputTensorsSize = numInputTensors * sizeof(Qnn_Tensor_t);
+    graphArr[i].inputTensors = (Qnn_Tensor_t *)malloc(inputTensorsSize);
+    memscpy(graphArr[i].inputTensors, inputTensorsSize,
+            graphInputTensors.data(), inputTensorsSize);
+    graphArr[i].numInputTensors = (uint32_t)numInputTensors;
+    // allocate and add graph outputTensors
+    std::vector<Qnn_Tensor_t> graphOutputTensors =
+      model.getGraphOutputTensors();
+    size_t numOutputTensors = graphOutputTensors.size();
+    size_t outputTensorsSize = numOutputTensors * sizeof(Qnn_Tensor_t);
+    graphArr[i].outputTensors = (Qnn_Tensor_t *)malloc(outputTensorsSize);
+    memscpy(graphArr[i].outputTensors, outputTensorsSize,
+            graphOutputTensors.data(), outputTensorsSize);
+    graphArr[i].numOutputTensors = (uint32_t)numOutputTensors;
+
+    // have return object point to the populated graph struct
+    (*graphsInfo)[i] = graphArr + i;
+
+    // graph composition is complete by this stage, free any cached tensors
+    // still remaining
+    VALIDATE(model.freeCachedTensors(), err);
+  }
+
+  return err;
+}
+
+ModelError_t getSingleGraphInfoFromModel(QnnModel &model,
+                                         GraphInfoPtr_t *graphInfoPtr) {
+  ModelError_t err = MODEL_NO_ERROR;
+
+  *graphInfoPtr = (GraphInfo_t *)malloc(sizeof(GraphInfo_t));
+  auto graphInfo = *graphInfoPtr;
+  if (graphInfo == nullptr) {
+    PRINT_ERROR(
+      "getSingleGraphInfoFromModel() graphInfo malloc returned nullptr.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  graphInfo->graph = model.getQnnGraph();
+  graphInfo->graphName =
+    strnDup(model.getQnnGraphName().c_str(), model.getQnnGraphName().size());
+  if (graphInfo->graphName == nullptr) {
+    PRINT_ERROR("getSingleGraphInfoFromModel() failed to construct graphName. "
+                "Received nullptr.\n");
+    return MODEL_GRAPH_ERROR;
+  }
+
+  // allocate and add graph input/output TensorsWrapper. Note: no need to make
+  // deep copies of the tensor's pointer members as they are already allocated
+  // on heap in the addTensor function call.
+  std::vector<Qnn_Tensor_t> graphInputTensors = model.getGraphInputTensors();
+  size_t numInputTensors = graphInputTensors.size();
+  size_t inputTensorsSize = numInputTensors * sizeof(Qnn_Tensor_t);
+  graphInfo->inputTensors = (Qnn_Tensor_t *)malloc(inputTensorsSize);
+  memscpy(graphInfo->inputTensors, inputTensorsSize, graphInputTensors.data(),
+          inputTensorsSize);
+  graphInfo->numInputTensors = (uint32_t)numInputTensors;
+  // allocate and add graph outputTensors
+  std::vector<Qnn_Tensor_t> graphOutputTensors = model.getGraphOutputTensors();
+  size_t numOutputTensors = graphOutputTensors.size();
+  size_t outputTensorsSize = numOutputTensors * sizeof(Qnn_Tensor_t);
+  graphInfo->outputTensors = (Qnn_Tensor_t *)malloc(outputTensorsSize);
+  memscpy(graphInfo->outputTensors, outputTensorsSize,
+          graphOutputTensors.data(), outputTensorsSize);
+  graphInfo->numOutputTensors = (uint32_t)numOutputTensors;
+
+  // graph composition is complete by this stage, free any cached tensors
+  // still remaining
+  VALIDATE(model.freeCachedTensors(), err);
+  return err;
+}
+
+ModelError_t freeGraphsInfo(GraphInfoPtr_t **graphsInfo, uint32_t numGraphs) {
+  if (graphsInfo == nullptr || *graphsInfo == nullptr) {
+    PRINT_ERROR("freeGraphsInfo() invalid graphsInfo.");
+    return MODEL_TENSOR_ERROR;
+  }
+  for (uint32_t i = 0; i < numGraphs; i++) {
+    PRINT_INFO("Freeing graph in freeGraphsInfo");
+    free((*graphsInfo)[i]->graphName);
+    freeQnnTensors((*graphsInfo)[i]->inputTensors,
+                   (*graphsInfo)[i]->numInputTensors);
+    freeQnnTensors((*graphsInfo)[i]->outputTensors,
+                   (*graphsInfo)[i]->numOutputTensors);
+  }
+
+  free(**graphsInfo);
+  free(*graphsInfo);
+  *graphsInfo = nullptr;
+
+  return MODEL_NO_ERROR;
+}
+
+ModelError_t QnnModel::freeTensors() {
+
+  for (std::map<std::string, Qnn_Tensor_t>::iterator tensorIt =
+         m_modelTensorsMap.begin();
+       tensorIt != m_modelTensorsMap.end();) {
+    Qnn_Tensor_t &tensor = tensorIt->second;
+
+    tensorIt = m_modelTensorsMap.erase(tensorIt);
+  }
+
+  return MODEL_NO_ERROR;
+}
+
+ModelError_t QnnModel::clearGraph() {
+
+  m_modelInputTensors.resize(0);
+  m_modelOutputTensors.resize(0);
+
+  m_modelOutputTensorMap.clear();
+  m_graphName.clear();
+
+  return MODEL_NO_ERROR;
+}
+
+} // namespace qnn_wrapper_api
diff --git a/nntrainer/npu/qnn/Model/QnnModel.hpp b/nntrainer/npu/qnn/Model/QnnModel.hpp
new file mode 100644
index 000000000..288dc9075
--- /dev/null
+++ b/nntrainer/npu/qnn/Model/QnnModel.hpp
@@ -0,0 +1,280 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "QnnInterface.h"
+#include "QnnLog.h"
+#include "QnnModelPal.hpp"
+#include "WrapperUtils/QnnWrapperUtils.hpp"
+
+namespace qnn_wrapper_api {
+
+class QnnModel {
+public:
+  ~QnnModel() = default;
+
+  /**
+   * @brief Creates a Qnn Graph within given context.
+   *
+   * @param[in] backendHandle A handle to the QNN backend which will be
+   *            used to query the API symbols
+   *
+   * @param[in] qnnInterface the QNN backend interface to use
+   *
+   * @param[in] context A handle to the context where the model's graph would
+   *            be created.
+   *
+   * @param[in] graphName The name to use for creating a graph in the context
+   *            provided.
+ * + * @param[in] debug If flag is true, sets all tensors created in model to be + * QNN_TENSOR_TYPE_APP_READ, essentially overwriting what + * is set in Qnn_TensorType. + * + * @param[in] doNodeValidations If flag is set, all nodes added with addNode + * call will be validated by Backend + * + * @param[in] graphConfigs Array of graph configurations to use for creating + * the QNN Graph. Default: nullptr + * + */ + ModelError_t initialize(const Qnn_BackendHandle_t &backendHandle, + const QNN_INTERFACE_VER_TYPE &qnnInterface, + const Qnn_ContextHandle_t &context, + const char *graphName, bool debug, + uint8_t doNodeValidations = 1, + const QnnGraph_Config_t **graphConfigs = nullptr); + + /** + * @brief A wrapper function to create a tensor inside class's context graph. + * + * @param[in] nodeName Lookup name for node/layer + * + * @param[in] tensor A pointer to a struct containing information on the + * tensor + * + * @param[in] saveTensor Flag to indicate if tensor should be saved in object + * for later retrieval with class getter functions. + * + * @return Error code + * + */ + ModelError_t addTensor(const char *nodeName, Qnn_Tensor_t *tensor, + bool saveTensor = true); + + /** + * @brief A wrapper function to create a tensor inside class's context graph. + * + * @param[in] nodeName Lookup name for node/layer + * + * @param[in] tensor A struct containing information on the tensor + * + * @param[in] saveTensor Flag to indicate if tensor should be saved in object + * for later retrieval with class getter functions. + * + * @return Error code + * + */ + ModelError_t addTensor(const char *nodeName, Qnn_Tensor_t tensor, + bool saveTensor = true); + + /** + * @brief function to be used to query tensors created within this QnnModel + * instance + * + * @param[in] nodeName Lookup name for node/layer + * + * @param[in] tensorName Lookup name for tensor + * + * @param[out] tensor The corresponding Qnn_Tensor_t object for given tensor + * name. + * + * @return Error code + * + */ + ModelError_t getQnnTensor(const char *&nodeName, const char *&tensorName, + Qnn_Tensor_t &tensor); + ModelError_t getQnnTensor(std::string nodeName, std::string tensorName, + Qnn_Tensor_t &tensor); + + /** + * @brief A wrapper function to create a node in class's graph. + * + * @param[in] version The QNN version for Op_Config_t structure to use (e.g. + * QNN_OPCONFIG_VERSION_1) + * + * @param[in] name The node name to use (e.g. my_graph_conv_1) + * + * @param[in] packageName The node package name (e.g. qti.aisw) + * + * @param[in] type The QNN_OP_QNN_OP_H node type (e.g. QNN_OP_ARGMAX) + * + * @param[in] params A struct object containing all the params for the node to + * be added. For tensorParam case. The tensor will be created within the + * function and the data will be retrieved from the binary blob to set the + * tensor data. + * + * @param[in] numOfParams The number of elements in above params object + * + * @param[in] inputNames List of tensor names for inputs to node. Note: the + * corresponding qnn tensor objects must be created within this instance prior + * to being listed as input to a node + * + * @param[in] numOfInputs The number of elements in above inputNames object + * + * @param[in] outputTensors List of Qnn_Tensor_t objects for outputs from + * node. Note1: the corresponding qnn tensor objects will be created in + * function and must not already exist. 
+   * Note2: the output names must be unique per graph
+   *
+   * @param[in] numOfOutputs The number of elements in above outputs object
+   *
+   * @return Error code
+   *
+   */
+  ModelError_t addNode(Qnn_OpConfigVersion_t version, const char *name,
+                       const char *packageName, const char *type,
+                       Qnn_Param_t *params, uint32_t numOfParams,
+                       const char **inputNames, uint32_t numOfInputs,
+                       Qnn_Tensor_t *outputTensors, uint32_t numOfOutputs);
+  // overload for vector of inputNames
+  ModelError_t addNode(Qnn_OpConfigVersion_t version, const char *name,
+                       const char *packageName, const char *type,
+                       Qnn_Param_t *params, uint32_t numOfParams,
+                       std::vector<std::string> inputNames,
+                       uint32_t numOfInputs, Qnn_Tensor_t *outputTensors,
+                       uint32_t numOfOutputs);
+
+  /**
+   * @brief A wrapper function to return model's graph
+   *
+   * @return The Qnn graph object
+   *
+   */
+  Qnn_GraphHandle_t getQnnGraph() { return m_graph; }
+
+  /**
+   * @brief A wrapper function to return model's graphName
+   *
+   * @return The Qnn graph object's name
+   *
+   */
+  std::string getQnnGraphName() { return m_graphName; }
+
+  /**
+   * @brief A wrapper function to return model's graph input tensors
+   *
+   * @return vector of Qnn_Tensor_t objects
+   *
+   */
+  std::vector<Qnn_Tensor_t> getGraphInputTensors() {
+    return m_modelInputTensors;
+  }
+
+  /**
+   * @brief A wrapper function to return model's graph output tensors
+   *
+   * @return vector of Qnn_Tensor_t objects
+   *
+   */
+  std::vector<Qnn_Tensor_t> getGraphOutputTensors() {
+    return m_modelOutputTensors;
+  }
+
+  /**
+   * @brief A wrapper function to return graph's output tensors->op mapping
+   *
+   * @return map of std::string, std::vector<std::string>
+   *
+   */
+  std::map<std::string, std::vector<std::string>> getOutputTensorMap() {
+    return m_modelOutputTensorMap;
+  }
+
+  /**
+   * @brief A wrapper function to finalize model's graph which includes calling
+   * backend finalize on graph.
+   *
+   * @return Error code
+   *
+   */
+  ModelError_t finalize(Qnn_ProfileHandle_t profile = nullptr,
+                        Qnn_SignalHandle_t signal = nullptr);
+
+  /**
+   * @brief Removes saved Qnn_Tensor_t objects and frees memory
+   *        Note: Cleanup doesn't apply to input/output tensors, as they are
+   *        needed for the execute call after this class finishes graph
+   *        construction. The user of this API is expected to free those.
+   *
+   * @return Error code
+   */
+  ModelError_t freeCachedTensors();
+
+  ModelError_t freeTensors();
+
+  ModelError_t clearGraph();
+
+private:
+  Qnn_GraphHandle_t m_graph = nullptr;
+  std::string m_graphName;
+  bool m_debug =
+    false; // flag to indicate if requested graph is to be run in debug mode
+           // (i.e. all intermediate tensors will be accessible to client)
+  // flag to indicate whether all addNode calls need to be validated
+  bool m_doNodeValidations = true;
+
+  std::vector<Qnn_Tensor_t> m_modelInputTensors;
+  std::vector<Qnn_Tensor_t> m_modelOutputTensors;
+  // keeps track of graph tensors to enable creating Qnn nodes from tensor names
+  std::map<std::string, Qnn_Tensor_t> m_modelTensorsMap;
+  std::map<std::string, std::vector<std::string>> m_modelOutputTensorMap;
+
+  // Qnn Backend Interface Api
+  QNN_INTERFACE_VER_TYPE m_qnnInterface;
+  Qnn_BackendHandle_t m_backendHandle;
+
+}; // QNN_MODEL_CLASS
+
+/**
+ * @brief A helper function to convert QnnModel objects to Graph struct for
+ * qnn_model c interface
+ * @param[in] models List of QnnModel objects
+ * @param[in] numModels The number of elements in above models object
+ *
+ * @param[out] graphsInfo The corresponding array of Graph object for each of
+ * the above model objects (note: this function will malloc memory needed to
+ * store the struct objects. The following free shall be invoked when the
+ * objects are no longer needed).
+ *
+ * @return Error code
+ *
+ */
+ModelError_t getGraphInfoFromModels(QnnModel *models, uint32_t numModels,
+                                    GraphInfoPtr_t **graphsInfo);
+ModelError_t getSingleGraphInfoFromModel(QnnModel &model,
+                                         GraphInfoPtr_t *graphInfoPtr);
+
+/**
+ * @brief A helper function to free memory malloced for communicating the Graph
+ * for a model(s)
+ * @param[in] graphsInfo Pointer pointing to location of graph objects
+ * @param[in] numGraphs The number of graph objects the above pointer is
+ * pointing to
+ *
+ * @return Error code
+ *
+ */
+ModelError_t freeGraphsInfo(GraphInfoPtr_t **graphsInfo, uint32_t numGraphs);
+} // namespace qnn_wrapper_api
diff --git a/nntrainer/npu/qnn/Model/QnnModelPal.cpp b/nntrainer/npu/qnn/Model/QnnModelPal.cpp
new file mode 100644
index 000000000..7858b29de
--- /dev/null
+++ b/nntrainer/npu/qnn/Model/QnnModelPal.cpp
@@ -0,0 +1,29 @@
+//==============================================================================
+//
+// Copyright (c) 2021-2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "QnnModelPal.hpp"
+
+namespace qnn_wrapper_api {
+void *dlSym(void *handle, const char *symbol) {
+  if (handle == DL_DEFAULT) {
+    return ::dlsym(RTLD_DEFAULT, symbol);
+  }
+
+  return ::dlsym(handle, symbol);
+}
+
+char *dlError(void) { return ::dlerror(); }
+
+char *strnDup(const char *source, size_t maxlen) {
+  return ::strndup(source, maxlen);
+}
+} // namespace qnn_wrapper_api
diff --git a/nntrainer/npu/qnn/Model/QnnModelPal.hpp b/nntrainer/npu/qnn/Model/QnnModelPal.hpp
new file mode 100644
index 000000000..e72273f43
--- /dev/null
+++ b/nntrainer/npu/qnn/Model/QnnModelPal.hpp
@@ -0,0 +1,54 @@
+//==============================================================================
+//
+// Copyright (c) 2021-2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+namespace qnn_wrapper_api {
+
+// specify this address to distinguish from NULL pointer
+#define DL_DEFAULT (void *)(0x4)
+
+//---------------------------------------------------------------------------
+/// @brief
+///   obtain address of a symbol in a shared object or executable
+/// @handle
+///   a handle of a dynamically loaded shared object returned by dlopen
+/// @symbol
+///   a null-terminated symbol name
+/// @return
+///   On success, return the address associated with symbol
+///   On error, NULL
+//---------------------------------------------------------------------------
+void *dlSym(void *handle, const char *symbol);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   obtain error diagnostic for functions in the dl-family APIs.
+/// @return
+///   returns a human-readable, null-terminated string describing the most
+///   recent error that occurred from a call to one of the functions in the
+///   dl-family APIs.
+///
+//---------------------------------------------------------------------------
+char *dlError(void);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Returns a pointer to a null-terminated byte string, which contains copies
+///   of at most maxlen bytes from the string pointed to by source. If the null
+///   terminator is not encountered in the first maxlen bytes, it is added to
+///   the duplicated string.
+/// @source
+///   Null-terminated source string.
+/// @maxlen
+///   Max number of bytes to copy from source
+/// @return
+///   A pointer to the newly allocated string, or a null pointer if an error
+///   occurred.
+///
+//---------------------------------------------------------------------------
+char *strnDup(const char *source, size_t maxlen);
+} // namespace qnn_wrapper_api
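
Reviewer note: a minimal usage sketch, not part of the patch, of how converter-generated model code is expected to drive the QnnModel wrapper added above. The graph, tensor, node and op names ("graph_relu", "in_0", "out_0", "relu_0", "Relu", "qti.aisw") are hypothetical placeholders, Qnn_Tensor_t field population is elided, QNN_TENSOR_INIT and QNN_OPCONFIG_VERSION_1 are assumed to come from the QNN SDK headers, and error handling is reduced to early returns.

#include "QnnModel.hpp"

using namespace qnn_wrapper_api;

// Compose a one-node graph in the given context and hand its GraphInfo_t
// back to the caller for execution.
ModelError_t composeExampleGraph(Qnn_BackendHandle_t backend,
                                 const QNN_INTERFACE_VER_TYPE &iface,
                                 Qnn_ContextHandle_t context,
                                 GraphInfoPtr_t *graphInfo) {
  QnnModel model;

  // One graph per QnnModel instance; node validation stays enabled.
  ModelError_t err = model.initialize(backend, iface, context, "graph_relu",
                                      /*debug=*/false,
                                      /*doNodeValidations=*/1);
  if (err != MODEL_NO_ERROR) return err;

  // Graph input: an APP_WRITE tensor named "in_0". Populating the remaining
  // Qnn_Tensor_t fields (data type, rank, dimensions, ...) is omitted here.
  Qnn_Tensor_t in0 = QNN_TENSOR_INIT;
  // ... set name "in_0", QNN_TENSOR_TYPE_APP_WRITE, data type, rank, dims ...
  err = model.addTensor("relu_0", &in0); // saved for lookup by name below
  if (err != MODEL_NO_ERROR) return err;

  // Node output: an APP_READ tensor that addNode() creates and registers.
  Qnn_Tensor_t out0 = QNN_TENSOR_INIT;
  // ... set name "out_0", QNN_TENSOR_TYPE_APP_READ, data type, rank, dims ...

  // Relu takes no parameters, so an empty parameter list is passed.
  const char *inputs[] = {"in_0"};
  err = model.addNode(QNN_OPCONFIG_VERSION_1, "relu_0", "qti.aisw", "Relu",
                      /*params=*/nullptr, /*numOfParams=*/0, inputs,
                      /*numOfInputs=*/1, &out0, /*numOfOutputs=*/1);
  if (err != MODEL_NO_ERROR) return err;

  // Finalize on the backend, then export the composed graph for the caller.
  err = model.finalize();
  if (err != MODEL_NO_ERROR) return err;

  return getSingleGraphInfoFromModel(model, graphInfo);
}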