Skip to content

Commit

Permalink
[Blas] copy functionality for signed int8 data type
Browse files Browse the repository at this point in the history
This pull request aims at adding the functionality to copy the int8 data type into other types such as int8, fp16, and fp32.
Please note that this implementation follows the intrinsic used for copying uint8 values.
By including this feature, we gain more flexibility in handling different data types, which will contribute to overall system performance improvement.

**Self-evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test:   [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Donghyeon Jeong <[email protected]>
  • Loading branch information
djeong20 committed Dec 23, 2024
1 parent 712f6c3 commit ccf0906
Show file tree
Hide file tree
Showing 7 changed files with 206 additions and 15 deletions.
75 changes: 65 additions & 10 deletions nntrainer/tensor/blas_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,21 +263,34 @@ static void copy_int4_to_fp16(const unsigned int N, const uint8_t *X,

// NOTE(review): this span is rendered from a diff and appears to interleave
// pre-change lines (incy/incx, the throw-on-stride branch, the bare #else)
// with post-change lines (inc_x/inc_y, the strided fallback loop, the
// duplicated loop body). Reconcile against the post-commit file before
// relying on it as-is.
static void copy_int8_to_fp16(const unsigned int N, const uint8_t *X,
const int incX, _FP16 *Y, const int incY) {
// absolute strides for the scalar fallback loop
unsigned int incy = abs(incY);
unsigned int incx = abs(incX);
unsigned int inc_x = abs(incX);
unsigned int inc_y = abs(incY);

#if (defined USE__FP16 && USE_NEON)
// fast path: vectorized NEON copy, contiguous (stride 1) data only
if (incX == 1 && incY == 1) {
nntrainer::neon::copy_int8_to_fp16(N, X, Y);
} else {
throw std::invalid_argument(
"Error: incX == 1 && incY == 1 is supported only");
return;
}
#else
#endif
// scalar widening copy (uint8 -> _FP16), honoring absolute strides
for (unsigned int idx = 0; idx < N; idx++) {
Y[idx] = X[idx];
Y[idx * inc_y] = X[idx * inc_x];
}
}

/**
 * @brief  Widening element-wise copy from signed int8 to half precision.
 * @param[in]  N    number of elements to copy
 * @param[in]  X    int8_t source vector
 * @param[in]  incX source increment (magnitude is used as the stride)
 * @param[out] Y    _FP16 destination vector
 * @param[in]  incY destination increment (magnitude is used as the stride)
 */
static void copy_int8_to_fp16(const unsigned int N, const int8_t *X,
                              const int incX, _FP16 *Y, const int incY) {
#if (defined USE__FP16 && USE_NEON)
  // Contiguous data can go through the vectorized NEON kernel.
  if (incX == 1 && incY == 1) {
    nntrainer::neon::copy_int8_to_fp16(N, X, Y);
    return;
  }
#endif
  // Scalar fallback: strided widening copy; negative increments are
  // treated by magnitude, matching the other copy helpers in this file.
  const unsigned int stride_x = abs(incX);
  const unsigned int stride_y = abs(incY);
  for (unsigned int i = 0; i < N; ++i) {
    Y[i * stride_y] = X[i * stride_x];
  }
}

void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) {
Expand Down Expand Up @@ -423,6 +436,11 @@ void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
copy_int8_to_fp16(N, X, incX, Y, incY);
}

/**
 * @brief  Public entry point: copy signed int8 vector X into _FP16 vector Y.
 *
 * Thin wrapper that forwards to the file-local int8 -> fp16 copy kernel.
 */
void scopy_int8_to_float16(const unsigned int N, const int8_t *X,
                           const int incX, _FP16 *Y, const int incY) {
  copy_int8_to_fp16(N, X, incX, Y, incY);
}

static void ele_mul_fallback(const unsigned int N, const _FP16 *X,
const _FP16 *Y, _FP16 *Z, float alpha, float beta,
unsigned int i_stride, unsigned int o_stride) {
Expand Down Expand Up @@ -901,6 +919,22 @@ void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,
#endif
}

/**
 * @brief  Copy signed int8 vector X into signed int8 vector Y.
 * @param[in]  N    number of elements to copy
 * @param[in]  X    int8_t source vector
 * @param[in]  incX source increment (magnitude is used as the stride)
 * @param[out] Y    int8_t destination vector
 * @param[in]  incY destination increment (magnitude is used as the stride)
 */
void scopy(const unsigned int N, const int8_t *X, const int incX, int8_t *Y,
           const int incY) {
#ifdef USE_NEON
  // Unit-stride copies are handled by the vectorized NEON path.
  if (incX == 1 && incY == 1) {
    nntrainer::neon::copy_int8(N, X, Y);
    return;
  }
#endif
  // Generic strided copy; negative increments are treated by magnitude,
  // consistent with the other scopy overloads in this file.
  const unsigned int stride_x = abs(incX);
  const unsigned int stride_y = abs(incY);
  for (unsigned int i = 0; i < N; ++i) {
    Y[i * stride_y] = X[i * stride_x];
  }
}

void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
const int incX, float *Y, const int incY) {
#ifdef USE_NEON
Expand All @@ -915,13 +949,34 @@ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,

// NOTE(review): this span is rendered from a diff and appears to interleave
// the pre-change body (unconditional NEON call under #ifdef, the bare #else,
// the unstrided loop) with the post-change body (inc_x/inc_y, the guarded
// NEON call with early return, the strided loop). Reconcile against the
// post-commit file before relying on it as-is.
void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
const int incX, float *Y, const int incY) {
// absolute strides for the scalar fallback loop
unsigned int inc_x = abs(incX);
unsigned int inc_y = abs(incY);

#ifdef USE_NEON
nntrainer::neon::copy_int8_to_fp32(N, X, Y);
#else
if (incX == 1 && incY == 1) {
nntrainer::neon::copy_int8_to_fp32(N, X, Y);
return;
}
#endif
// scalar widening copy (uint8 -> float), honoring absolute strides
for (unsigned int idx = 0; idx < N; idx++) {
Y[idx] = X[idx];
Y[idx * inc_y] = X[idx * inc_x];
}
}

/**
 * @brief  Widening element-wise copy from signed int8 to single precision.
 * @param[in]  N    number of elements to copy
 * @param[in]  X    int8_t source vector
 * @param[in]  incX source increment (magnitude is used as the stride)
 * @param[out] Y    float destination vector
 * @param[in]  incY destination increment (magnitude is used as the stride)
 */
void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
                           const int incX, float *Y, const int incY) {
#ifdef USE_NEON
  // Contiguous data goes through the vectorized NEON conversion kernel.
  if (incX == 1 && incY == 1) {
    nntrainer::neon::copy_int8_to_fp32(N, X, Y);
    return;
  }
#endif
  // Scalar fallback: every int8 value is exactly representable as float.
  const unsigned int stride_x = abs(incX);
  const unsigned int stride_y = abs(incY);
  for (unsigned int i = 0; i < N; ++i) {
    Y[i * stride_y] = X[i * stride_x];
  }
}

float snrm2(const int N, const float *X, const int incX) {
Expand Down
28 changes: 28 additions & 0 deletions nntrainer/tensor/blas_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,15 @@ void scopy_int4_to_float16(const unsigned int N, const uint8_t *X,
void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
const int incX, _FP16 *Y, const int incY);

/**
* @brief copy function : Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y __fp16 * for Vector Y
*/
void scopy_int8_to_float16(const unsigned int N, const int8_t *X,
const int incX, _FP16 *Y, const int incY);

/**
* @brief sdot computation : sum of all X * Y
* @param[in] N number of elements in Y
Expand Down Expand Up @@ -274,6 +283,16 @@ void scopy(const unsigned int N, const float *X, const int incX, float *Y,
*/
void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,
const int intY);

/**
* @brief copy function : Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y int8_t * for Vector Y
*/
void scopy(const unsigned int N, const int8_t *X, const int incX, int8_t *Y,
const int intY);

/**
* @brief copy function : Y = X
* @param[in] N number of elements in X
Expand All @@ -292,6 +311,15 @@ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
const int incX, float *Y, const int intY);

/**
* @brief copy function : Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y float * for Vector Y
*/
void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
const int incX, float *Y, const int intY);

/**
* @brief sdot computation : sum of all X * Y
* @param[in] N number of elements in Y
Expand Down
86 changes: 86 additions & 0 deletions nntrainer/tensor/blas_neon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,22 @@ void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y) {
}
}

/**
 * @brief  NEON copy of N contiguous signed int8 elements from X to Y.
 *
 * Processes 16 lanes per iteration, then 8, then a scalar tail.
 */
// NOTE(review): the original note here ("int8 Tensor and int4 Tensor share
// the same memory offset") was copied from the uint8 copy_int8_or_int4
// routine; it does not apply to this plain signed-int8 copy.
void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y) {
unsigned int idx = 0;
// 16-element vector copy
for (; N - idx >= 16; idx += 16) {
int8x16_t batch = vld1q_s8(&X[idx]);
vst1q_s8(&Y[idx], batch);
}
// 8-element vector copy for the mid-size remainder
for (; N - idx >= 8; idx += 8) {
int8x8_t batch = vld1_s8(&X[idx]);
vst1_s8(&Y[idx], batch);
}
// scalar tail (fewer than 8 elements left)
for (; N - idx >= 1; ++idx) {
Y[idx] = X[idx];
}
}

void sine(const unsigned int N, float *X, float *Y, float alpha) {
unsigned int i = 0;
for (; N - i >= 4; i += 4) {
Expand Down Expand Up @@ -1469,6 +1485,34 @@ void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y) {
}
}

/**
 * @brief  NEON widening copy of N contiguous signed int8 elements into fp16.
 *
 * Widens s8 -> s16 with vmovl_s8, then converts s16 -> f16. Every int8
 * value (-128..127) is exactly representable in __fp16, so the conversion
 * is lossless.
 */
void copy_int8_to_fp16(const unsigned int N, const int8_t *X, __fp16 *Y) {
unsigned int idx = 0;
// 16 elements per iteration: split into low/high 8-lane halves
for (; (N - idx) >= 16; idx += 16) {
int8x16_t batch = vld1q_s8(&X[idx]);
int8x8_t low = vget_low_s8(batch);
int8x8_t high = vget_high_s8(batch);

// convert to s16
int16x8_t batch_low_s16 = vmovl_s8(low);
int16x8_t batch_high_s16 = vmovl_s8(high);

// TODO(review): original comment referenced vcvt_f32_s32 (copied from the
// fp32 path); the conversion here is vcvtq_f16_s16 — confirm intent.
vst1q_f16(&Y[idx], vcvtq_f16_s16(batch_low_s16));
vst1q_f16(&Y[idx + 8], vcvtq_f16_s16(batch_high_s16));
}
// 8 elements per iteration for the mid-size remainder
for (; (N - idx) >= 8; idx += 8) {
int8x8_t batch = vld1_s8(&X[idx]);

// convert to s16
int16x8_t batch_s16 = vmovl_s8(batch);
vst1q_f16(&Y[idx], vcvtq_f16_s16(batch_s16));
}
// scalar tail (fewer than 8 elements left)
for (; (N - idx) >= 1; ++idx) {
Y[idx] = X[idx];
}
}

void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) {
unsigned int idx = 0;
for (; (N - idx) >= 16; idx += 16) {
Expand Down Expand Up @@ -1511,6 +1555,48 @@ void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) {
}
}

/**
 * @brief  NEON widening copy of N contiguous signed int8 elements into fp32.
 *
 * Widens s8 -> s16 -> s32 with vmovl, then converts s32 -> f32. Every int8
 * value (-128..127) is exactly representable in float, so the conversion is
 * lossless.
 */
void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y) {
unsigned int idx = 0;
// 16 elements per iteration: split into low/high halves, widen twice
for (; (N - idx) >= 16; idx += 16) {
int8x16_t batch = vld1q_s8(&X[idx]);
int8x8_t low = vget_low_s8(batch);
int8x8_t high = vget_high_s8(batch);

// convert to s16
int16x8_t batch_low_s16 = vmovl_s8(low);
int16x8_t batch_high_s16 = vmovl_s8(high);

// convert to s32
int32x4_t batch_low_s32_low = vmovl_s16(vget_low_s16(batch_low_s16));
int32x4_t batch_low_s32_high = vmovl_s16(vget_high_s16(batch_low_s16));
int32x4_t batch_high_s32_low = vmovl_s16(vget_low_s16(batch_high_s16));
int32x4_t batch_high_s32_high = vmovl_s16(vget_high_s16(batch_high_s16));

// todo : experiment with vcvt_f32_s32_ bitwise operation w.r.t.
// time/accuracy
vst1q_f32(&Y[idx], vcvtq_f32_s32(batch_low_s32_low));
vst1q_f32(&Y[idx + 4], vcvtq_f32_s32(batch_low_s32_high));
vst1q_f32(&Y[idx + 8], vcvtq_f32_s32(batch_high_s32_low));
vst1q_f32(&Y[idx + 12], vcvtq_f32_s32(batch_high_s32_high));
}
// 8 elements per iteration for the mid-size remainder
for (; (N - idx) >= 8; idx += 8) {
int8x8_t batch = vld1_s8(&X[idx]);

// convert to s16
int16x8_t batch_s16 = vmovl_s8(batch);

// convert to s32
int32x4_t batch_s32_low = vmovl_s16(vget_low_s16(batch_s16));
int32x4_t batch_s32_high = vmovl_s16(vget_high_s16(batch_s16));

vst1q_f32(&Y[idx], vcvtq_f32_s32(batch_s32_low));
vst1q_f32(&Y[idx + 4], vcvtq_f32_s32(batch_s32_high));
}
// scalar tail (fewer than 8 elements left)
for (; (N - idx) >= 1; ++idx) {
Y[idx] = X[idx];
}
}

void copy_fp16_to_fp32(const unsigned int N, const __fp16 *X, float *Y) {
unsigned int idx = 0;

Expand Down
24 changes: 24 additions & 0 deletions nntrainer/tensor/blas_neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,29 @@ void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y);
*/
void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y);

/**
* @brief copy function with neon: Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y float * for Vector Y
*/
void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y);

/**
* @brief copy function with neon: Y = X
* @param[in] N number of elements in X
* @param[in] X uint8_t * for Vector X
* @param[in] Y uint8_t * for Vector Y
*/
void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y);

/**
* @brief copy function with neon: Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y int8_t * for Vector Y
*/
void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y);
/**
* @brief sine with neon: Y = sin(alpha * X)
* @param[in] N number of elements in X
Expand Down Expand Up @@ -311,6 +327,14 @@ void copy_int4_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y);
*/
void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y);

/**
* @brief copy function with neon: Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y __fp16 * for Vector Y
*/
void copy_int8_to_fp16(const unsigned int N, const int8_t *X, __fp16 *Y);

/**
* @brief copy function with neon: Y = X
* @param[in] N number of elements in X
Expand Down
4 changes: 1 addition & 3 deletions nntrainer/tensor/char_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,9 +361,7 @@ void CharTensor::copy(const void *buf) {
}

/// @todo need to optimize
for (unsigned int i = 0; i < size(); ++i) {
((int8_t *)getData())[i] = ((int8_t *)buf)[i];
}
scopy(size(), (int8_t *)buf, 1, (int8_t *)getData(), 1);
}

void CharTensor::save_quantization_info(std::ostream &file) {
Expand Down
2 changes: 1 addition & 1 deletion nntrainer/tensor/float_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -764,7 +764,7 @@ void FloatTensor::copyData(const Tensor &from) {
#endif
break;
case ml::train::TensorDim::DataType::QINT8:
scopy_int8_to_float32(from.size(), from.getData<uint8_t>(), 1,
scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
(float *)getData(), 1);
break;
default:
Expand Down
2 changes: 1 addition & 1 deletion nntrainer/tensor/half_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -983,7 +983,7 @@ void HalfTensor::copyData(const Tensor &from) {
copy(from.getData<_FP16>());
break;
case ml::train::TensorDim::DataType::QINT8:
scopy_int8_to_float16(from.size(), from.getData<uint8_t>(), 1,
scopy_int8_to_float16(from.size(), from.getData<int8_t>(), 1,
(_FP16 *)getData(), 1);
break;
default:
Expand Down

0 comments on commit ccf0906

Please sign in to comment.