diff --git a/nntrainer/layers/cl_layers/concat_cl.cpp b/nntrainer/layers/cl_layers/concat_cl.cpp index 0d230a526f..5b04a48100 100644 --- a/nntrainer/layers/cl_layers/concat_cl.cpp +++ b/nntrainer/layers/cl_layers/concat_cl.cpp @@ -24,44 +24,6 @@ #include #include -std::string concat_cl_axis3_kernel_fp16_ = - R"( - #pragma OPENCL EXTENSION cl_khr_fp16 : enable - __kernel void concat_cl_axis3_fp16(__global const half* in1, - __global const half* in2, - __global half* out, - const int batch_size, - const int channels, - const int height, - const int width1, - const int width2) { - int global_id = get_global_id(0); - - int total_width = width1 + width2; - - int width = total_width; - - // 4D space coordinates - int w = global_id % total_width; - int h = (global_id / total_width) % height; - int c = (global_id / (total_width * height)) % channels; - int b = global_id / (total_width * height * channels); - - int output_index = ((b * channels + c) * height + h) * total_width + w; - - // Determining if the index is in in1 or in2 - if (w < width1) { - // in1 index calculation - int input1_index = ((b * channels + c) * height + h) * width1 + w; - out[output_index] = in1[input1_index]; - - } else { - // in2 index calculation - int input2_index = ((b * channels + c) * height + h) * width2 + (w - width1); - out[output_index] = in2[input2_index]; - } -})"; - std::string concat_cl_axis3_kernel_ = R"(__kernel void concat_cl_axis3(__global const float* in1, __global const float* in2, @@ -98,40 +60,6 @@ std::string concat_cl_axis3_kernel_ = } })"; -std::string concat_cl_axis2_kernel_fp16_ = - R"(__kernel void concat_cl_axis2_fp16(__global const half* in1, - __global const half* in2, - __global half* out, - const int batch_size, - const int channels, - const int height1, - const int height2, - const int width) { - - int total_height = height1 + height2; - int global_id = get_global_id(0); - - // Calculate the coordinates in the 4D space - int w = global_id % width; - int h = 
(global_id / width) % total_height; - int c = (global_id / (width * total_height)) % channels; - int b = global_id / (width * total_height * channels); - - // Calculate the offset for the current batch, channel, and width in the output tensor - int output_index = ((b * channels + c) * total_height + h) * width + w; - - if (h < height1) { - // Index within input1 - int input1_index = ((b * channels + c) * height1 + h) * width + w; - out[output_index] = in1[input1_index]; - } else { - // Index within input2 - int input2_index = ((b * channels + c) * height2 + (h - height1)) * width + w; - out[output_index] = in2[input2_index]; - } - -})"; - std::string concat_cl_axis2_kernel_ = R"(__kernel void concat_cl_axis2(__global const float* in1, __global const float* in2, @@ -166,39 +94,6 @@ std::string concat_cl_axis2_kernel_ = })"; -std::string concat_cl_axis1_kernel_fp16_ = - R"(__kernel void concat_cl_axis1_fp16(__global const half* in1, - __global const half* in2, - __global half* out, - const int batch_size, - const int channels1, - const int channels2, - const int height, - const int width) { - int global_id = get_global_id(0); - - int total_channels = channels1 + channels2; - - // Calculate the coordinates in the 4D space - int w = global_id % width; - int h = (global_id / width) % height; - int c = (global_id / (width * height)) % total_channels; - int b = global_id / (width * height * total_channels); - - // Calculate the offset for the current batch, height, and width in the output tensor - int output_index = ((b * total_channels + c) * height + h) * width + w; - - if (c < channels1) { - // Index within input1 - int input1_index = ((b * channels1 + c) * height + h) * width + w; - out[output_index] = in1[input1_index]; - } else { - // Index within input2 - int input2_index = ((b * channels2 + (c - channels1)) * height + h) * width + w; - out[output_index] = in2[input2_index]; - } -})"; - std::string concat_cl_axis1_kernel_ = R"(__kernel void concat_cl_axis1(__global 
const float* in1, __global const float* in2, @@ -303,11 +198,8 @@ void ConcatLayerCl::incremental_forwarding(RunLayerContext &context, } opencl::Kernel ConcatLayerCl::kernel_concat_axis3; -opencl::Kernel ConcatLayerCl::kernel_concat_axis3_fp16; opencl::Kernel ConcatLayerCl::kernel_concat_axis2; -opencl::Kernel ConcatLayerCl::kernel_concat_axis2_fp16; opencl::Kernel ConcatLayerCl::kernel_concat_axis1; -opencl::Kernel ConcatLayerCl::kernel_concat_axis1_fp16; void ConcatLayerCl::ConcatProcess(Tensor const &in1, Tensor const &in2, Tensor &result) { @@ -321,7 +213,6 @@ void ConcatLayerCl::ConcatProcess(Tensor const &in1, Tensor const &in2, input1_height = dim1.height(); input1_channels = dim1.channel(); input1_width = dim1.width(); - input2_height = dim2.height(); input2_channels = dim2.channel(); input2_width = dim2.width(); @@ -477,117 +368,6 @@ void ConcatLayerCl::concat_cl_axis3(const float *matAdata, } while (false); } -void ConcatLayerCl::concat_cl_axis3_fp16( - const _FP16 *matAdata, const _FP16 *vecXdata, _FP16 *vecYdata, - unsigned int input1_batch_size, unsigned int input1_channels, - unsigned int input1_height, unsigned int input1_width, - unsigned int input2_width) { - - bool result = false; - - do { - ClContext::SharedPtrClKernel kernel_concat_ptr = - cl_context_ref.registerClKernel(concat_cl_axis3_kernel_fp16_, - "concat_cl_axis3_fp16"); - if (!kernel_concat_ptr) { - break; - } - - int dim = int(input1_batch_size * input1_channels * input1_height * - (input1_width + input2_width)); - - opencl::Buffer inputA(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input1_height * input1_width, - true, nullptr); - - opencl::Buffer inputX(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input1_height * input2_width, - true, nullptr); - - opencl::Buffer inOutY(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input1_height * (input1_width + 
input2_width), - true, nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); - if (!result) { - break; - } - - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); - if (!result) { - break; - } - - result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(6, &input1_width, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(7, &input2_width, sizeof(int)); - if (!result) { - break; - } - - const int work_groups_count[3] = {dim, 1, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - result = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_concat_ptr, work_groups_count, work_group_size); - if (!result) { - break; - } - - result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - } while (false); -} - void ConcatLayerCl::concat_cl_axis2(const float *matAdata, const float *vecXdata, float *vecYdata, unsigned int input1_batch_size, @@ -701,117 +481,6 @@ void ConcatLayerCl::concat_cl_axis2(const float *matAdata, } while (false); } -void ConcatLayerCl::concat_cl_axis2_fp16( - const _FP16 *matAdata, const _FP16 *vecXdata, _FP16 
*vecYdata, - unsigned int input1_batch_size, unsigned int input1_channels, - unsigned int input1_width, unsigned int input1_height, - unsigned int input2_height) { - - bool result = false; - - do { - ClContext::SharedPtrClKernel kernel_concat_ptr = - cl_context_ref.registerClKernel(concat_cl_axis2_kernel_fp16_, - "concat_cl_axis2_fp16"); - if (!kernel_concat_ptr) { - break; - } - - int dim = int(input1_batch_size * input1_channels * input1_width * - (input1_height + input2_height)); - - opencl::Buffer inputA(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input1_height * input1_width, - true, nullptr); - - opencl::Buffer inputX(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input2_height * input1_width, - true, nullptr); - - opencl::Buffer inOutY(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - (input1_height + input2_height) * input1_width, - true, nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); - if (!result) { - break; - } - - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); - if (!result) { - break; - } - - result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int)); - if (!result) { - break; - } - 
- result = - kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int)); - if (!result) { - break; - } - - const int work_groups_count[3] = {dim, 1, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - result = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_concat_ptr, work_groups_count, work_group_size); - if (!result) { - break; - } - - result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - } while (false); -} - void ConcatLayerCl::concat_cl_axis1(const float *matAdata, const float *vecXdata, float *vecYdata, unsigned int input1_batch_size, @@ -925,117 +594,6 @@ void ConcatLayerCl::concat_cl_axis1(const float *matAdata, } while (false); } -void ConcatLayerCl::concat_cl_axis1_fp16( - const _FP16 *matAdata, const _FP16 *vecXdata, _FP16 *vecYdata, - unsigned int input1_batch_size, unsigned int input1_height, - unsigned int input1_width, unsigned int input1_channels, - unsigned int input2_channels) { - - bool result = false; - - do { - ClContext::SharedPtrClKernel kernel_concat_ptr = - cl_context_ref.registerClKernel(concat_cl_axis1_kernel_fp16_, - "concat_cl_axis1_fp16"); - if (!kernel_concat_ptr) { - break; - } - - int dim = int(input1_batch_size * input1_width * input1_height * - (input1_channels + input2_channels)); - - opencl::Buffer inputA(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_channels * - input1_height * input1_width, - true, nullptr); - - opencl::Buffer inputX(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input2_channels * - input1_height * input1_width, - true, nullptr); - - opencl::Buffer inOutY(cl_context_ref.context_inst_, - sizeof(_FP16) * input1_batch_size * input1_width * - input1_height * (input1_channels + input2_channels), - true, nullptr); - - result = 
inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); - if (!result) { - break; - } - - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); - if (!result) { - break; - } - - result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(5, &input2_channels, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(6, &input1_height, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int)); - if (!result) { - break; - } - - const int work_groups_count[3] = {dim, 1, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - result = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_concat_ptr, work_groups_count, work_group_size); - if (!result) { - break; - } - - result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - } while (false); -} - void ConcatLayerCl::calcDerivative(RunLayerContext &context) { // /** // * @todo skipping calcDerivative, support yet to be added diff --git a/nntrainer/layers/cl_layers/concat_cl.h b/nntrainer/layers/cl_layers/concat_cl.h index ce8599409e..ad52b48e32 100644 --- a/nntrainer/layers/cl_layers/concat_cl.h +++ b/nntrainer/layers/cl_layers/concat_cl.h @@ -106,12 +106,15 @@ class 
ConcatLayerCl : public Layer { inline static const std::string type = "concat"; - static opencl::Kernel kernel_concat_axis3; - static opencl::Kernel kernel_concat_axis3_fp16; - static opencl::Kernel kernel_concat_axis2; - static opencl::Kernel kernel_concat_axis2_fp16; static opencl::Kernel kernel_concat_axis1; + static opencl::Kernel kernel_concat_axis2; + static opencl::Kernel kernel_concat_axis3; + +#ifdef ENABLE_FP16 static opencl::Kernel kernel_concat_axis1_fp16; + static opencl::Kernel kernel_concat_axis2_fp16; + static opencl::Kernel kernel_concat_axis3_fp16; +#endif /** * @brief Process data and dimensions for concat @@ -174,6 +177,7 @@ class ConcatLayerCl : public Layer { unsigned int input2_channels); #ifdef ENABLE_FP16 + /** * @brief concat computation for axis 3 fp16 * @param[in] matAdata fp16 * for Input Tensor A @@ -231,6 +235,7 @@ class ConcatLayerCl : public Layer { unsigned int input1_channels, unsigned int input2_channelst); #endif + private: std::tuple concat_props; }; diff --git a/nntrainer/layers/cl_layers/concat_cl_fp16.cpp b/nntrainer/layers/cl_layers/concat_cl_fp16.cpp new file mode 100644 index 0000000000..ec90f66793 --- /dev/null +++ b/nntrainer/layers/cl_layers/concat_cl_fp16.cpp @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Niket Agarwal + * + * @file concat_cl_fp16.cpp + * @date 2 July 2024 + * @brief Implementation of Concat Layer + * @see https://github.com/nnstreamer/nntrainer + * @author Niket Agarwal + * @author Eunju Yang + * @bug No known bugs except for NYI items + * + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +std::string concat_cl_axis3_kernel_fp16_ = + R"( + #pragma OPENCL EXTENSION cl_khr_fp16 : enable + __kernel void concat_cl_axis3_fp16(__global const half* in1, + __global const half* in2, + __global half* out, + const int batch_size, + const int channels, + const int height, + const int width1, + const int
width2) { + int global_id = get_global_id(0); + + int total_width = width1 + width2; + + int width = total_width; + + // 4D space coordinates + int w = global_id % total_width; + int h = (global_id / total_width) % height; + int c = (global_id / (total_width * height)) % channels; + int b = global_id / (total_width * height * channels); + + int output_index = ((b * channels + c) * height + h) * total_width + w; + + // Determining if the index is in in1 or in2 + if (w < width1) { + // in1 index calculation + int input1_index = ((b * channels + c) * height + h) * width1 + w; + out[output_index] = in1[input1_index]; + + } else { + // in2 index calculation + int input2_index = ((b * channels + c) * height + h) * width2 + (w - width1); + out[output_index] = in2[input2_index]; + } +})"; + +std::string concat_cl_axis2_kernel_fp16_ = + R"(__kernel void concat_cl_axis2_fp16(__global const half* in1, + __global const half* in2, + __global half* out, + const int batch_size, + const int channels, + const int height1, + const int height2, + const int width) { + + int total_height = height1 + height2; + int global_id = get_global_id(0); + + // Calculate the coordinates in the 4D space + int w = global_id % width; + int h = (global_id / width) % total_height; + int c = (global_id / (width * total_height)) % channels; + int b = global_id / (width * total_height * channels); + + // Calculate the offset for the current batch, channel, and width in the output tensor + int output_index = ((b * channels + c) * total_height + h) * width + w; + + if (h < height1) { + // Index within input1 + int input1_index = ((b * channels + c) * height1 + h) * width + w; + out[output_index] = in1[input1_index]; + } else { + // Index within input2 + int input2_index = ((b * channels + c) * height2 + (h - height1)) * width + w; + out[output_index] = in2[input2_index]; + } + +})"; + +std::string concat_cl_axis1_kernel_fp16_ = + R"(__kernel void concat_cl_axis1_fp16(__global const half* in1, + __global 
const half* in2, + __global half* out, + const int batch_size, + const int channels1, + const int channels2, + const int height, + const int width) { + int global_id = get_global_id(0); + + int total_channels = channels1 + channels2; + + // Calculate the coordinates in the 4D space + int w = global_id % width; + int h = (global_id / width) % height; + int c = (global_id / (width * height)) % total_channels; + int b = global_id / (width * height * total_channels); + + // Calculate the offset for the current batch, height, and width in the output tensor + int output_index = ((b * total_channels + c) * height + h) * width + w; + + if (c < channels1) { + // Index within input1 + int input1_index = ((b * channels1 + c) * height + h) * width + w; + out[output_index] = in1[input1_index]; + } else { + // Index within input2 + int input2_index = ((b * channels2 + (c - channels1)) * height + h) * width + w; + out[output_index] = in2[input2_index]; + } +})"; + +namespace nntrainer { + +static constexpr size_t SINGLE_INOUT_IDX = 0; +static constexpr size_t INPUT_IDX_1 = 0; +static constexpr size_t INPUT_IDX_2 = 1; + +opencl::Kernel ConcatLayerCl::kernel_concat_axis3_fp16; +opencl::Kernel ConcatLayerCl::kernel_concat_axis2_fp16; +opencl::Kernel ConcatLayerCl::kernel_concat_axis1_fp16; + +void ConcatLayerCl::concat_cl_axis3_fp16(const _FP16 *matAdata, + const _FP16 *vecXdata, _FP16 *vecYdata, + unsigned int input1_batch_size, + unsigned int input1_channels, + unsigned int input1_height, + unsigned int input1_width, + unsigned int input2_width) { + + bool result = false; + + do { + ClContext::SharedPtrClKernel kernel_concat_ptr = + cl_context_ref.registerClKernel(concat_cl_axis3_kernel_fp16_, + "concat_cl_axis3_fp16"); + if (!kernel_concat_ptr) { + break; + } + + int dim = int(input1_batch_size * input1_channels * input1_height * + (input1_width + input2_width)); + + opencl::Buffer inputA(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + 
input1_height * input1_width, + true, nullptr); + + opencl::Buffer inputX(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + input1_height * input2_width, + true, nullptr); + + opencl::Buffer inOutY(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + input1_height * (input1_width + input2_width), + true, nullptr); + + result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); + if (!result) { + break; + } + + result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(6, &input1_width, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(7, &input2_width, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = {dim, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_concat_ptr, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + } 
while (false); +} + +void ConcatLayerCl::concat_cl_axis2_fp16(const _FP16 *matAdata, + const _FP16 *vecXdata, _FP16 *vecYdata, + unsigned int input1_batch_size, + unsigned int input1_channels, + unsigned int input1_width, + unsigned int input1_height, + unsigned int input2_height) { + + bool result = false; + + do { + ClContext::SharedPtrClKernel kernel_concat_ptr = + cl_context_ref.registerClKernel(concat_cl_axis2_kernel_fp16_, + "concat_cl_axis2_fp16"); + if (!kernel_concat_ptr) { + break; + } + + int dim = int(input1_batch_size * input1_channels * input1_width * + (input1_height + input2_height)); + + opencl::Buffer inputA(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + input1_height * input1_width, + true, nullptr); + + opencl::Buffer inputX(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + input2_height * input1_width, + true, nullptr); + + opencl::Buffer inOutY(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + (input1_height + input2_height) * input1_width, + true, nullptr); + + result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); + if (!result) { + break; + } + + result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); + if (!result) { + break; + 
} + + result = + kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = {dim, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_concat_ptr, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + } while (false); +} + +void ConcatLayerCl::concat_cl_axis1_fp16(const _FP16 *matAdata, + const _FP16 *vecXdata, _FP16 *vecYdata, + unsigned int input1_batch_size, + unsigned int input1_height, + unsigned int input1_width, + unsigned int input1_channels, + unsigned int input2_channels) { + + bool result = false; + + do { + ClContext::SharedPtrClKernel kernel_concat_ptr = + cl_context_ref.registerClKernel(concat_cl_axis1_kernel_fp16_, + "concat_cl_axis1_fp16"); + if (!kernel_concat_ptr) { + break; + } + + int dim = int(input1_batch_size * input1_width * input1_height * + (input1_channels + input2_channels)); + + opencl::Buffer inputA(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_channels * + input1_height * input1_width, + true, nullptr); + + opencl::Buffer inputX(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input2_channels * + input1_height * input1_width, + true, nullptr); + + opencl::Buffer inOutY(cl_context_ref.context_inst_, + sizeof(_FP16) * input1_batch_size * input1_width * + input1_height * (input1_channels + input2_channels), + true, nullptr); + + result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); + if (!result) { + break; + } + + result = 
inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(5, &input2_channels, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(6, &input1_height, sizeof(int)); + if (!result) { + break; + } + + result = + kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = {dim, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_concat_ptr, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + } while (false); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/layers/cl_layers/meson.build b/nntrainer/layers/cl_layers/meson.build index fbfd46961b..57fb942bc9 100644 --- a/nntrainer/layers/cl_layers/meson.build +++ b/nntrainer/layers/cl_layers/meson.build @@ -7,6 +7,15 @@ cl_layer_sources = [ 'concat_cl.cpp', ] +if get_option('enable-fp16') + cl_layer_sources += [ + 'swiglu_cl_fp16.cpp', + 'reshape_cl_fp16.cpp', + 'rmsnorm_layer_cl_fp16.cpp', + 'concat_cl_fp16.cpp', + ] +endif + foreach s 
: cl_layer_sources nntrainer_sources += meson.current_source_dir() / s endforeach diff --git a/nntrainer/layers/cl_layers/reshape_cl.cpp b/nntrainer/layers/cl_layers/reshape_cl.cpp index 724e38c549..5e48bbeb02 100644 --- a/nntrainer/layers/cl_layers/reshape_cl.cpp +++ b/nntrainer/layers/cl_layers/reshape_cl.cpp @@ -17,21 +17,6 @@ #include #include -std::string copy_cl_kernel_fp16_ = - R"( - #pragma OPENCL EXTENSION cl_khr_fp16 : enable - __kernel void copy_cl_fp16(__global const half* input, - __global half* output, - const int batchsize, - const int channels, - const int height, - const int width) { - - int i= get_global_id(0); - output[i] = input[i]; - -})"; - std::string copy_cl_kernel_ = R"(__kernel void copy_cl(__global const float* input, __global float* output, @@ -99,7 +84,6 @@ void ReshapeLayerCl::incremental_forwarding(RunLayerContext &context, } opencl::Kernel ReshapeLayerCl::kernel_copy; -opencl::Kernel ReshapeLayerCl::kernel_copy_fp16; void ReshapeLayerCl::ReshapeProcess(Tensor const &input, Tensor &output) { @@ -127,89 +111,6 @@ void ReshapeLayerCl::ReshapeProcess(Tensor const &input, Tensor &output) { } } -void ReshapeLayerCl::copy_cl_fp16(const _FP16 *input, _FP16 *res, - unsigned int input_batch_size, - unsigned int input_channels, - unsigned int input_height, - unsigned int input_width) { - - bool result = false; - - do { - ClContext::SharedPtrClKernel kernel_copy_ptr = - cl_context_ref.registerClKernel(copy_cl_kernel_fp16_, "copy_cl_fp16"); - if (!kernel_copy_ptr) { - break; - } - - size_t dim_size = sizeof(_FP16) * input_batch_size * input_height * - input_width * input_channels; - - opencl::Buffer inputA(cl_context_ref.context_inst_, dim_size, true, - nullptr); - - opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim_size, true, - nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, input); - if (!result) { - break; - } - - result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res); - if (!result) { - 
break; - } - - result = kernel_copy_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_copy_ptr->SetKernelArguments(1, &inOutRes, sizeof(cl_mem)); - if (!result) { - break; - } - - result = - kernel_copy_ptr->SetKernelArguments(2, &input_batch_size, sizeof(int)); - if (!result) { - break; - } - - result = - kernel_copy_ptr->SetKernelArguments(3, &input_channels, sizeof(int)); - if (!result) { - break; - } - - result = kernel_copy_ptr->SetKernelArguments(4, &input_height, sizeof(int)); - if (!result) { - break; - } - - result = kernel_copy_ptr->SetKernelArguments(5, &input_width, sizeof(int)); - if (!result) { - break; - } - - const int work_groups_count[3] = {(int)dim_size, 1, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - result = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_copy_ptr, work_groups_count, work_group_size); - if (!result) { - break; - } - - result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res); - if (!result) { - break; - } - - } while (false); -} - void ReshapeLayerCl::copy_cl(const float *input, float *res, unsigned int input_batch_size, unsigned int input_channels, diff --git a/nntrainer/layers/cl_layers/reshape_cl.h b/nntrainer/layers/cl_layers/reshape_cl.h index 679214189e..9c83beb360 100644 --- a/nntrainer/layers/cl_layers/reshape_cl.h +++ b/nntrainer/layers/cl_layers/reshape_cl.h @@ -106,7 +106,10 @@ class ReshapeLayerCl : public Layer { inline static const std::string type = "reshape"; static opencl::Kernel kernel_copy; + +#ifdef ENABLE_FP16 static opencl::Kernel kernel_copy_fp16; +#endif /** * @brief Process data and dimensions for reshape operation diff --git a/nntrainer/layers/cl_layers/reshape_cl_fp16.cpp b/nntrainer/layers/cl_layers/reshape_cl_fp16.cpp new file mode 100644 index 0000000000..9936bc4ef1 --- /dev/null +++ b/nntrainer/layers/cl_layers/reshape_cl_fp16.cpp @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * 
Copyright (C) 2024 Niket Agarwal + * + * @file reshape_cl_fp16.cpp + * @date 18 June 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Niket Agarwal + * @author Eunju Yang + * @bug No known bugs except for NYI items + * @brief This is Reshape GPU Layer Implementation + */ + +#include +#include +#include +#include +#include +#include + +std::string copy_cl_kernel_fp16_ = + R"( + #pragma OPENCL EXTENSION cl_khr_fp16 : enable + __kernel void copy_cl_fp16(__global const half* input, + __global half* output, + const int batchsize, + const int channels, + const int height, + const int width) { + + int i= get_global_id(0); + output[i] = input[i]; + +})"; + +namespace nntrainer { + +opencl::Kernel ReshapeLayerCl::kernel_copy_fp16; + +void ReshapeLayerCl::copy_cl_fp16(const _FP16 *input, _FP16 *res, + unsigned int input_batch_size, + unsigned int input_channels, + unsigned int input_height, + unsigned int input_width) { + + bool result = false; + + do { + ClContext::SharedPtrClKernel kernel_copy_ptr = + cl_context_ref.registerClKernel(copy_cl_kernel_fp16_, "copy_cl_fp16"); + if (!kernel_copy_ptr) { + break; + } + + size_t dim_size = sizeof(_FP16) * input_batch_size * input_height * + input_width * input_channels; + + opencl::Buffer inputA(cl_context_ref.context_inst_, dim_size, true, + nullptr); + + opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim_size, true, + nullptr); + + result = inputA.WriteData(cl_context_ref.command_queue_inst_, input); + if (!result) { + break; + } + + result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res); + if (!result) { + break; + } + + result = kernel_copy_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_copy_ptr->SetKernelArguments(1, &inOutRes, sizeof(cl_mem)); + if (!result) { + break; + } + + result = + kernel_copy_ptr->SetKernelArguments(2, &input_batch_size, sizeof(int)); + if (!result) { + break; + } + + result = + 
kernel_copy_ptr->SetKernelArguments(3, &input_channels, sizeof(int)); + if (!result) { + break; + } + + result = kernel_copy_ptr->SetKernelArguments(4, &input_height, sizeof(int)); + if (!result) { + break; + } + + result = kernel_copy_ptr->SetKernelArguments(5, &input_width, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = {(int)dim_size, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_copy_ptr, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res); + if (!result) { + break; + } + + } while (false); +} + +} /* namespace nntrainer */ + diff --git a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp index 2df9caf29e..137e94a90c 100644 --- a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp +++ b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp @@ -20,41 +20,6 @@ #include #include -std::string rmsnorm_cl_kernel_fp16_ = - R"( - #pragma OPENCL EXTENSION cl_khr_fp16 : enable - __kernel void rmsnorm_cl_fp16( - __global const half *input, // Input tensor - __global half *output, // Output tensor - __global const half *alpha, // Alpha values (one for each width) - half epsilon, - int B, // Number of batches - int C, // Number of channels - int H, // Height of feature map - int W // Width of feature map -) { - int global_id = get_global_id(0); // Get the global work item index - - // Compute the corresponding batch, height, and channel indices - int n = global_id / C; // Batch index - int c = global_id % C; // Height index - int h = get_global_id(1); // Channel index - int index = ((n * C + c) * H + h) * W; - - // Calculate RMS norm for the current channel, height, and batch - half sum_squares = 0.0f; - for (int j = 0; j < W; ++j) { - sum_squares += input[index+j] * input[index+j]; - } - sum_squares /= W; - half rms_norm = 
sqrt(sum_squares + epsilon); - // Each work item processes all width elements for its specific n, h, c - for (int w = 0; w < W; ++w) { - output[index+w] = (input[index+w] / rms_norm) * alpha[w]; - } -} -)"; - std::string rmsnorm_cl_kernel_ = R"(__kernel void rmsnorm_cl( __global const float *input, // Input tensor @@ -124,7 +89,6 @@ void RMSNormLayerCl::forwarding(RunLayerContext &context, bool training) { } opencl::Kernel RMSNormLayerCl::kernel_rmsnorm; -opencl::Kernel RMSNormLayerCl::kernel_rmsnorm_fp16; void RMSNormLayerCl::rmsnormProcess(Tensor const &input, Tensor &result, Tensor const &gamma, const float epsilon) { @@ -219,96 +183,6 @@ void RMSNormLayerCl::rmsnormProcess(Tensor const &input, Tensor &result, } while (false); } -void RMSNormLayerCl::rmsnormProcess_fp16(Tensor const &input, Tensor &result, - Tensor const &gamma, - const float epsilon) { - - bool ret = false; - int dim1 = input.batch() * input.height() * input.width() * input.channel(); - CREATE_IF_EMPTY_DIMS(result, input.batch(), input.channel(), input.height(), - input.width(), input.getTensorType()); - int b = input.batch(); - int c = input.channel(); - int h = input.height(); - int w = input.width(); - do { - ClContext::SharedPtrClKernel kernel_rmsnorm_ptr = - cl_context_ref.registerClKernel(rmsnorm_cl_kernel_fp16_, - "rmsnorm_cl_fp16"); - if (!kernel_rmsnorm_ptr) { - break; - } - opencl::Buffer inputbuf(cl_context_ref.context_inst_, - dim1 * sizeof(cl_half), true, nullptr); - - opencl::Buffer gammabuf(cl_context_ref.context_inst_, - input.width() * sizeof(cl_half), true, nullptr); - opencl::Buffer resultbuf(cl_context_ref.context_inst_, - dim1 * sizeof(cl_half), true, nullptr); - - const _FP16 *data = input.getData<_FP16>(); - _FP16 *rdata = result.getData<_FP16>(); - const _FP16 *gdata = gamma.getData<_FP16>(); - ret = inputbuf.WriteData(cl_context_ref.command_queue_inst_, data); - if (!ret) { - break; - } - - ret = gammabuf.WriteData(cl_context_ref.command_queue_inst_, gdata); - if (!ret) 
{ - break; - } - ret = kernel_rmsnorm_ptr->SetKernelArguments(0, &inputbuf, sizeof(cl_mem)); - if (!ret) { - break; - } - ret = kernel_rmsnorm_ptr->SetKernelArguments(1, &resultbuf, sizeof(cl_mem)); - if (!ret) { - break; - } - - ret = kernel_rmsnorm_ptr->SetKernelArguments(2, &gammabuf, sizeof(cl_mem)); - if (!ret) { - break; - } - ret = kernel_rmsnorm_ptr->SetKernelArguments(4, &b, sizeof(int)); - if (!ret) { - break; - } - - ret = kernel_rmsnorm_ptr->SetKernelArguments(3, &epsilon, sizeof(cl_half)); - if (!ret) { - break; - } - - ret = kernel_rmsnorm_ptr->SetKernelArguments(5, &c, sizeof(int)); - if (!ret) { - break; - } - ret = kernel_rmsnorm_ptr->SetKernelArguments(6, &h, sizeof(int)); - if (!ret) { - break; - } - ret = kernel_rmsnorm_ptr->SetKernelArguments(7, &w, sizeof(int)); - if (!ret) { - break; - } - const int work_groups_count[3] = {b * c, h, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - ret = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_rmsnorm_ptr, work_groups_count, work_group_size); - if (!ret) { - break; - } - - ret = resultbuf.ReadData(cl_context_ref.command_queue_inst_, rdata); - if (!ret) { - break; - } - } while (false); -} - void RMSNormLayerCl::incremental_forwarding(nntrainer::RunLayerContext &context, unsigned int from, unsigned int to, bool training) { @@ -339,7 +213,11 @@ void RMSNormLayerCl::incremental_forwarding(nntrainer::RunLayerContext &context, if (in_step.getDataType() == ml::train::TensorDim::DataType::FP32) { rmsnormProcess(in, out, gamma, epsilon); } else { +#ifdef ENABLE_FP16 rmsnormProcess_fp16(in, out, gamma, epsilon); +#else + throw std::invalid_argument("Error: enable-fp16 is not enabled"); +#endif } } diff --git a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h index 43f942ea1e..a3b123dc6e 100644 --- a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h +++ b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h @@ -119,7 +119,10 @@ class 
RMSNormLayerCl : public LayerImpl { const std::string getType() const override { return RMSNormLayerCl::type; }; static opencl::Kernel kernel_rmsnorm; + +#ifdef ENABLE_FP16 static opencl::Kernel kernel_rmsnorm_fp16; +#endif /** * @brief Process data and dimensions for rms norm operation diff --git a/nntrainer/layers/cl_layers/rmsnorm_layer_cl_fp16.cpp b/nntrainer/layers/cl_layers/rmsnorm_layer_cl_fp16.cpp new file mode 100644 index 0000000000..b788d877cd --- /dev/null +++ b/nntrainer/layers/cl_layers/rmsnorm_layer_cl_fp16.cpp @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Thummala Pallavi + * + * @file rmsnorm_layer_fp16_cl.cpp + * @date 8 June 2024 + * @brief This is RMSNorm Layer Class for Neural Network with + * OpenCl implementation + * @see https://github.com/nnstreamer/nntrainer + * @author Thummala Pallavi + * @author Eunju Yang + * @bug No known bugs except for NYI items + * + */ + + +#include +#include +#include +#include +#include +#include +#include + +std::string rmsnorm_cl_kernel_fp16_ = + R"( + #pragma OPENCL EXTENSION cl_khr_fp16 : enable + __kernel void rmsnorm_cl_fp16( + __global const half *input, // Input tensor + __global half *output, // Output tensor + __global const half *alpha, // Alpha values (one for each width) + half epsilon, + int B, // Number of batches + int C, // Number of channels + int H, // Height of feature map + int W // Width of feature map +) { + int global_id = get_global_id(0); // Get the global work item index + + // Compute the corresponding batch, height, and channel indices + int n = global_id / C; // Batch index + int c = global_id % C; // Height index + int h = get_global_id(1); // Channel index + int index = ((n * C + c) * H + h) * W; + + // Calculate RMS norm for the current channel, height, and batch + half sum_squares = 0.0f; + for (int j = 0; j < W; ++j) { + sum_squares += input[index+j] * input[index+j]; + } + sum_squares /= W; + half rms_norm = sqrt(sum_squares + epsilon); 
+ // Each work item processes all width elements for its specific n, h, c + for (int w = 0; w < W; ++w) { + output[index+w] = (input[index+w] / rms_norm) * alpha[w]; + } +} +)"; + +namespace nntrainer { + +opencl::Kernel RMSNormLayerCl::kernel_rmsnorm_fp16; + +void RMSNormLayerCl::rmsnormProcess_fp16(Tensor const &input, Tensor &result, + Tensor const &gamma, + const float epsilon) { + + bool ret = false; + int dim1 = input.batch() * input.height() * input.width() * input.channel(); + CREATE_IF_EMPTY_DIMS(result, input.batch(), input.channel(), input.height(), + input.width(), input.getTensorType()); + int b = input.batch(); + int c = input.channel(); + int h = input.height(); + int w = input.width(); + do { + ClContext::SharedPtrClKernel kernel_rmsnorm_ptr = + cl_context_ref.registerClKernel(rmsnorm_cl_kernel_fp16_, + "rmsnorm_cl_fp16"); + if (!kernel_rmsnorm_ptr) { + break; + } + opencl::Buffer inputbuf(cl_context_ref.context_inst_, + dim1 * sizeof(cl_half), true, nullptr); + + opencl::Buffer gammabuf(cl_context_ref.context_inst_, + input.width() * sizeof(cl_half), true, nullptr); + opencl::Buffer resultbuf(cl_context_ref.context_inst_, + dim1 * sizeof(cl_half), true, nullptr); + + const _FP16 *data = input.getData<_FP16>(); + _FP16 *rdata = result.getData<_FP16>(); + const _FP16 *gdata = gamma.getData<_FP16>(); + ret = inputbuf.WriteData(cl_context_ref.command_queue_inst_, data); + if (!ret) { + break; + } + + ret = gammabuf.WriteData(cl_context_ref.command_queue_inst_, gdata); + if (!ret) { + break; + } + ret = kernel_rmsnorm_ptr->SetKernelArguments(0, &inputbuf, sizeof(cl_mem)); + if (!ret) { + break; + } + ret = kernel_rmsnorm_ptr->SetKernelArguments(1, &resultbuf, sizeof(cl_mem)); + if (!ret) { + break; + } + + ret = kernel_rmsnorm_ptr->SetKernelArguments(2, &gammabuf, sizeof(cl_mem)); + if (!ret) { + break; + } + ret = kernel_rmsnorm_ptr->SetKernelArguments(4, &b, sizeof(int)); + if (!ret) { + break; + } + + ret = kernel_rmsnorm_ptr->SetKernelArguments(3, 
&epsilon, sizeof(cl_half)); + if (!ret) { + break; + } + + ret = kernel_rmsnorm_ptr->SetKernelArguments(5, &c, sizeof(int)); + if (!ret) { + break; + } + ret = kernel_rmsnorm_ptr->SetKernelArguments(6, &h, sizeof(int)); + if (!ret) { + break; + } + ret = kernel_rmsnorm_ptr->SetKernelArguments(7, &w, sizeof(int)); + if (!ret) { + break; + } + const int work_groups_count[3] = {b * c, h, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + ret = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_rmsnorm_ptr, work_groups_count, work_group_size); + if (!ret) { + break; + } + + ret = resultbuf.ReadData(cl_context_ref.command_queue_inst_, rdata); + if (!ret) { + break; + } + } while (false); +} + +} // namespace nntrainer + diff --git a/nntrainer/layers/cl_layers/swiglu_cl.cpp b/nntrainer/layers/cl_layers/swiglu_cl.cpp index c9c1891e44..ca991b2354 100644 --- a/nntrainer/layers/cl_layers/swiglu_cl.cpp +++ b/nntrainer/layers/cl_layers/swiglu_cl.cpp @@ -13,15 +13,6 @@ #include "swiglu_cl.h" #include -std::string swiglu_cl_kernel_fp16_ = - R"( - #pragma OPENCL EXTENSION cl_khr_fp16 : enable - __kernel void swiglu_cl_fp16(__global const half *in1, __global const half *in2, __global half *out) { - int i = get_global_id(0); - half swish = in1[i] * exp(in1[i]) / (1 + exp(in1[i])); - out[i] = swish * in2[i]; -})"; - std::string swiglu_cl_kernel_ = R"(__kernel void swiglu_cl(__global const float *in1, __global const float *in2, __global float *out) { int i = get_global_id(0); @@ -64,7 +55,6 @@ void SwiGLULayerCl::incremental_forwarding(RunLayerContext &context, } opencl::Kernel SwiGLULayerCl::kernel_swiglu; -opencl::Kernel SwiGLULayerCl::kernel_swiglu_fp16; void SwiGLULayerCl::swigluProcess(Tensor const &in1, Tensor const &in2, Tensor &result) { @@ -104,84 +94,14 @@ void SwiGLULayerCl::swiglu_cl(const float *matAdata, const float *vecXdata, } int dim = int(dim1 * dim2); - opencl::Buffer inputA(cl_context_ref.context_inst_, sizeof(float) * dim1 * dim2, true, 
- nullptr); - - opencl::Buffer inputX(cl_context_ref.context_inst_, sizeof(float) * dim1 * dim2, true, - nullptr); - - opencl::Buffer inOutY(cl_context_ref.context_inst_, sizeof(float) * dim1 * dim2, true, - nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); - if (!result) { - break; - } - - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); - if (!result) { - break; - } - - result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - result = kernel_swiglu_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_swiglu_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); - if (!result) { - break; - } - - result = kernel_swiglu_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); - if (!result) { - break; - } - - const int work_groups_count[3] = {dim, 1, 1}; - const int work_group_size[3] = {32, 32, 1}; // test-value - - result = cl_context_ref.command_queue_inst_.DispatchCommand( - kernel_swiglu_ptr, work_groups_count, work_group_size); - if (!result) { - break; - } - - result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); - if (!result) { - break; - } - - } while (false); -} - -void SwiGLULayerCl::swiglu_cl_fp16(const _FP16 *matAdata, - const _FP16 *vecXdata, _FP16 *vecYdata, - unsigned int dim1, unsigned int dim2){ - - bool result = false; - - do { - ClContext::SharedPtrClKernel kernel_swiglu_ptr = - cl_context_ref.registerClKernel(swiglu_cl_kernel_fp16_, "swiglu_cl_fp16"); - if (!kernel_swiglu_ptr) { - break; - } - - int dim = int(dim1 * dim2); - opencl::Buffer inputA(cl_context_ref.context_inst_, sizeof(_FP16) * dim1 * dim2, true, - nullptr); + opencl::Buffer inputA(cl_context_ref.context_inst_, + sizeof(float) * dim1 * dim2, true, nullptr); - opencl::Buffer inputX(cl_context_ref.context_inst_, sizeof(_FP16) * dim1 * dim2, true, - nullptr); + opencl::Buffer inputX(cl_context_ref.context_inst_, + 
sizeof(float) * dim1 * dim2, true, nullptr); - opencl::Buffer inOutY(cl_context_ref.context_inst_, sizeof(_FP16) * dim1 * dim2, true, - nullptr); + opencl::Buffer inOutY(cl_context_ref.context_inst_, + sizeof(float) * dim1 * dim2, true, nullptr); result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); if (!result) { diff --git a/nntrainer/layers/cl_layers/swiglu_cl.h b/nntrainer/layers/cl_layers/swiglu_cl.h index 215d817c1e..51f388bd88 100644 --- a/nntrainer/layers/cl_layers/swiglu_cl.h +++ b/nntrainer/layers/cl_layers/swiglu_cl.h @@ -79,7 +79,7 @@ class SwiGLULayerCl final : public Layer { * @copydoc Layer::exportTo(Exporter &exporter, ExportMethods method) */ void exportTo(Exporter &exporter, - const ml::train::ExportMethods &method) const override {}; + const ml::train::ExportMethods &method) const override{}; /** * @copydoc Layer::getType() @@ -94,7 +94,10 @@ class SwiGLULayerCl final : public Layer { inline static const std::string type = "swiglu"; static opencl::Kernel kernel_swiglu; + +#ifdef ENABLE_FP16 static opencl::Kernel kernel_swiglu_fp16; +#endif std::tuple swiglu_props; /**< swiglu layer properties : unit - number of output neurons */ diff --git a/nntrainer/layers/cl_layers/swiglu_cl_fp16.cpp b/nntrainer/layers/cl_layers/swiglu_cl_fp16.cpp new file mode 100644 index 0000000000..3fe2d1f701 --- /dev/null +++ b/nntrainer/layers/cl_layers/swiglu_cl_fp16.cpp @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * + * @file swiglu_cl.cpp + * @date 6th June 2024 + * @brief Implementation of SwiGLU activation function + * @see https://github.com/nnstreamer/nntrainer + * @author Niket Agarwal + * @bug No known bugs except for NYI items + * + */ + +#include "swiglu_cl.h" +#include + +std::string swiglu_cl_kernel_fp16_ = + R"( + #pragma OPENCL EXTENSION cl_khr_fp16 : enable + __kernel void swiglu_cl_fp16(__global const half *in1, __global const half *in2, __global half *out) { + int i = get_global_id(0); + half swish = in1[i] * 
exp(in1[i]) / (1 + exp(in1[i])); + out[i] = swish * in2[i]; +})"; + +namespace nntrainer { + +void SwiGLULayerCl::swiglu_cl_fp16(const _FP16 *matAdata, const _FP16 *vecXdata, + _FP16 *vecYdata, unsigned int dim1, + unsigned int dim2) { + + bool result = false; + + do { + ClContext::SharedPtrClKernel kernel_swiglu_ptr = + cl_context_ref.registerClKernel(swiglu_cl_kernel_fp16_, "swiglu_cl_fp16"); + if (!kernel_swiglu_ptr) { + break; + } + + int dim = int(dim1 * dim2); + opencl::Buffer inputA(cl_context_ref.context_inst_, + sizeof(_FP16) * dim1 * dim2, true, nullptr); + + opencl::Buffer inputX(cl_context_ref.context_inst_, + sizeof(_FP16) * dim1 * dim2, true, nullptr); + + opencl::Buffer inOutY(cl_context_ref.context_inst_, + sizeof(_FP16) * dim1 * dim2, true, nullptr); + + result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); + if (!result) { + break; + } + + result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + result = kernel_swiglu_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_swiglu_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_swiglu_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + if (!result) { + break; + } + + const int work_groups_count[3] = {dim, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = cl_context_ref.command_queue_inst_.DispatchCommand( + kernel_swiglu_ptr, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + } while (false); +} + +} // namespace nntrainer + diff --git a/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp b/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp index 
c1284b0a9c..b4c3978dee 100644 --- a/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp +++ b/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp @@ -16,7 +16,7 @@ namespace nntrainer { -void rotary_emb_cl(__fp16 *in, __fp16 *out, +void rotary_emb_cl(_FP16 *in, _FP16 *out, std::vector> freqs_cos, std::vector> freqs_sin, std::vector cos_, std::vector sin_, diff --git a/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp b/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp index 44d0d023c7..9235605eeb 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp +++ b/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp @@ -10,6 +10,7 @@ * @bug No known bugs except for NYI items * */ +#ifdef ENABLE_FP16 #include #include @@ -402,3 +403,5 @@ void sscal_cl(_FP16 *X, const unsigned int N, const float alpha){ } while (false); } } // namespace nntrainer + +#endif