Skip to content

Commit

Permalink
[Blas] copy functionality for signed int8 data type
Browse files Browse the repository at this point in the history
This pull request aims at adding the functionality to copy the int8 data type into other types such as int8, fp16, and fp32.
Please note that this implementation follows the intrinsic used for copying uint8 values.
By including this feature, we gain more flexibility in handling different data types, which will contribute to overall system performance improvement.

**Self-evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test:   [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Donghyeon Jeong <[email protected]>
  • Loading branch information
djeong20 committed Dec 23, 2024
1 parent 712f6c3 commit ccf0906
Show file tree
Hide file tree
Showing 7 changed files with 206 additions and 15 deletions.
75 changes: 65 additions & 10 deletions nntrainer/tensor/blas_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,21 +263,34 @@ static void copy_int4_to_fp16(const unsigned int N, const uint8_t *X,

// NOTE(review): this span is rendered from a diff and appears to interleave
// pre-change lines (incy/incx, the throw-on-stride branch, the bare #else)
// with post-change lines (inc_x/inc_y, the strided fallback loop, the
// duplicated loop body). Reconcile against the post-commit file before
// relying on it as-is.
static void copy_int8_to_fp16(const unsigned int N, const uint8_t *X,
const int incX, _FP16 *Y, const int incY) {
// absolute strides for the scalar fallback loop
unsigned int incy = abs(incY);
unsigned int incx = abs(incX);
unsigned int inc_x = abs(incX);
unsigned int inc_y = abs(incY);

#if (defined USE__FP16 && USE_NEON)
// fast path: vectorized NEON copy, contiguous (stride 1) data only
if (incX == 1 && incY == 1) {
nntrainer::neon::copy_int8_to_fp16(N, X, Y);
} else {
throw std::invalid_argument(
"Error: incX == 1 && incY == 1 is supported only");
return;
}
#else
#endif
// scalar widening copy (uint8 -> _FP16), honoring absolute strides
for (unsigned int idx = 0; idx < N; idx++) {
Y[idx] = X[idx];
Y[idx * inc_y] = X[idx * inc_x];
}
}

/**
 * @brief  Widening element-wise copy from signed int8 to half precision.
 * @param[in]  N    number of elements to copy
 * @param[in]  X    int8_t source vector
 * @param[in]  incX source increment (magnitude is used as the stride)
 * @param[out] Y    _FP16 destination vector
 * @param[in]  incY destination increment (magnitude is used as the stride)
 */
static void copy_int8_to_fp16(const unsigned int N, const int8_t *X,
                              const int incX, _FP16 *Y, const int incY) {
#if (defined USE__FP16 && USE_NEON)
  // Contiguous data can go through the vectorized NEON kernel.
  if (incX == 1 && incY == 1) {
    nntrainer::neon::copy_int8_to_fp16(N, X, Y);
    return;
  }
#endif
  // Scalar fallback: strided widening copy; negative increments are
  // treated by magnitude, matching the other copy helpers in this file.
  const unsigned int stride_x = abs(incX);
  const unsigned int stride_y = abs(incY);
  for (unsigned int i = 0; i < N; ++i) {
    Y[i * stride_y] = X[i * stride_x];
  }
}

void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) {
Expand Down Expand Up @@ -423,6 +436,11 @@ void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
copy_int8_to_fp16(N, X, incX, Y, incY);
}

/**
 * @brief  Public entry point: copy signed int8 vector X into _FP16 vector Y.
 *
 * Thin wrapper that forwards to the file-local int8 -> fp16 copy kernel.
 */
void scopy_int8_to_float16(const unsigned int N, const int8_t *X,
                           const int incX, _FP16 *Y, const int incY) {
  copy_int8_to_fp16(N, X, incX, Y, incY);
}

static void ele_mul_fallback(const unsigned int N, const _FP16 *X,
const _FP16 *Y, _FP16 *Z, float alpha, float beta,
unsigned int i_stride, unsigned int o_stride) {
Expand Down Expand Up @@ -901,6 +919,22 @@ void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,
#endif
}

/**
 * @brief  Copy signed int8 vector X into signed int8 vector Y.
 * @param[in]  N    number of elements to copy
 * @param[in]  X    int8_t source vector
 * @param[in]  incX source increment (magnitude is used as the stride)
 * @param[out] Y    int8_t destination vector
 * @param[in]  incY destination increment (magnitude is used as the stride)
 */
void scopy(const unsigned int N, const int8_t *X, const int incX, int8_t *Y,
           const int incY) {
#ifdef USE_NEON
  // Unit-stride copies are handled by the vectorized NEON path.
  if (incX == 1 && incY == 1) {
    nntrainer::neon::copy_int8(N, X, Y);
    return;
  }
#endif
  // Generic strided copy; negative increments are treated by magnitude,
  // consistent with the other scopy overloads in this file.
  const unsigned int stride_x = abs(incX);
  const unsigned int stride_y = abs(incY);
  for (unsigned int i = 0; i < N; ++i) {
    Y[i * stride_y] = X[i * stride_x];
  }
}

void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
const int incX, float *Y, const int incY) {
#ifdef USE_NEON
Expand All @@ -915,13 +949,34 @@ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,

// NOTE(review): this span is rendered from a diff and appears to interleave
// the pre-change body (unconditional NEON call under #ifdef, the bare #else,
// the unstrided loop) with the post-change body (inc_x/inc_y, the guarded
// NEON call with early return, the strided loop). Reconcile against the
// post-commit file before relying on it as-is.
void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
const int incX, float *Y, const int incY) {
// absolute strides for the scalar fallback loop
unsigned int inc_x = abs(incX);
unsigned int inc_y = abs(incY);

#ifdef USE_NEON
nntrainer::neon::copy_int8_to_fp32(N, X, Y);
#else
if (incX == 1 && incY == 1) {
nntrainer::neon::copy_int8_to_fp32(N, X, Y);
return;
}
#endif
// scalar widening copy (uint8 -> float), honoring absolute strides
for (unsigned int idx = 0; idx < N; idx++) {
Y[idx] = X[idx];
Y[idx * inc_y] = X[idx * inc_x];
}
}

/**
 * @brief  Widening element-wise copy from signed int8 to single precision.
 * @param[in]  N    number of elements to copy
 * @param[in]  X    int8_t source vector
 * @param[in]  incX source increment (magnitude is used as the stride)
 * @param[out] Y    float destination vector
 * @param[in]  incY destination increment (magnitude is used as the stride)
 */
void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
                           const int incX, float *Y, const int incY) {
#ifdef USE_NEON
  // Contiguous data goes through the vectorized NEON conversion kernel.
  if (incX == 1 && incY == 1) {
    nntrainer::neon::copy_int8_to_fp32(N, X, Y);
    return;
  }
#endif
  // Scalar fallback: every int8 value is exactly representable as float.
  const unsigned int stride_x = abs(incX);
  const unsigned int stride_y = abs(incY);
  for (unsigned int i = 0; i < N; ++i) {
    Y[i * stride_y] = X[i * stride_x];
  }
}

float snrm2(const int N, const float *X, const int incX) {
Expand Down
28 changes: 28 additions & 0 deletions nntrainer/tensor/blas_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,15 @@ void scopy_int4_to_float16(const unsigned int N, const uint8_t *X,
void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
const int incX, _FP16 *Y, const int incY);

/**
* @brief copy function : Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y __fp16 * for Vector Y
*/
void scopy_int8_to_float16(const unsigned int N, const int8_t *X,
const int incX, _FP16 *Y, const int incY);

/**
* @brief sdot computation : sum of all X * Y
* @param[in] N number of elements in Y
Expand Down Expand Up @@ -274,6 +283,16 @@ void scopy(const unsigned int N, const float *X, const int incX, float *Y,
*/
void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,
const int intY);

/**
* @brief copy function : Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y int8_t * for Vector Y
*/
void scopy(const unsigned int N, const int8_t *X, const int incX, int8_t *Y,
const int intY);

/**
* @brief copy function : Y = X
* @param[in] N number of elements in X
Expand All @@ -292,6 +311,15 @@ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
const int incX, float *Y, const int intY);

/**
* @brief copy function : Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y float * for Vector Y
*/
void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
const int incX, float *Y, const int intY);

/**
* @brief sdot computation : sum of all X * Y
* @param[in] N number of elements in Y
Expand Down
86 changes: 86 additions & 0 deletions nntrainer/tensor/blas_neon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,22 @@ void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y) {
}
}

/**
 * @brief  NEON copy of N contiguous signed int8 elements from X to Y.
 *
 * Processes 16 lanes per iteration, then 8, then a scalar tail.
 */
// NOTE(review): the original note here ("int8 Tensor and int4 Tensor share
// the same memory offset") was copied from the uint8 copy_int8_or_int4
// routine; it does not apply to this plain signed-int8 copy.
void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y) {
unsigned int idx = 0;
// 16-element vector copy
for (; N - idx >= 16; idx += 16) {
int8x16_t batch = vld1q_s8(&X[idx]);
vst1q_s8(&Y[idx], batch);
}
// 8-element vector copy for the mid-size remainder
for (; N - idx >= 8; idx += 8) {
int8x8_t batch = vld1_s8(&X[idx]);
vst1_s8(&Y[idx], batch);
}
// scalar tail (fewer than 8 elements left)
for (; N - idx >= 1; ++idx) {
Y[idx] = X[idx];
}
}

void sine(const unsigned int N, float *X, float *Y, float alpha) {
unsigned int i = 0;
for (; N - i >= 4; i += 4) {
Expand Down Expand Up @@ -1469,6 +1485,34 @@ void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y) {
}
}

/**
 * @brief  NEON widening copy of N contiguous signed int8 elements into fp16.
 *
 * Widens s8 -> s16 with vmovl_s8, then converts s16 -> f16. Every int8
 * value (-128..127) is exactly representable in __fp16, so the conversion
 * is lossless.
 */
void copy_int8_to_fp16(const unsigned int N, const int8_t *X, __fp16 *Y) {
unsigned int idx = 0;
// 16 elements per iteration: split into low/high 8-lane halves
for (; (N - idx) >= 16; idx += 16) {
int8x16_t batch = vld1q_s8(&X[idx]);
int8x8_t low = vget_low_s8(batch);
int8x8_t high = vget_high_s8(batch);

// convert to s16
int16x8_t batch_low_s16 = vmovl_s8(low);
int16x8_t batch_high_s16 = vmovl_s8(high);

// TODO(review): original comment referenced vcvt_f32_s32 (copied from the
// fp32 path); the conversion here is vcvtq_f16_s16 — confirm intent.
vst1q_f16(&Y[idx], vcvtq_f16_s16(batch_low_s16));
vst1q_f16(&Y[idx + 8], vcvtq_f16_s16(batch_high_s16));
}
// 8 elements per iteration for the mid-size remainder
for (; (N - idx) >= 8; idx += 8) {
int8x8_t batch = vld1_s8(&X[idx]);

// convert to s16
int16x8_t batch_s16 = vmovl_s8(batch);
vst1q_f16(&Y[idx], vcvtq_f16_s16(batch_s16));
}
// scalar tail (fewer than 8 elements left)
for (; (N - idx) >= 1; ++idx) {
Y[idx] = X[idx];
}
}

void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) {
unsigned int idx = 0;
for (; (N - idx) >= 16; idx += 16) {
Expand Down Expand Up @@ -1511,6 +1555,48 @@ void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) {
}
}

/**
 * @brief  NEON widening copy of N contiguous signed int8 elements into fp32.
 *
 * Widens s8 -> s16 -> s32 with vmovl, then converts s32 -> f32. Every int8
 * value (-128..127) is exactly representable in float, so the conversion is
 * lossless.
 */
void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y) {
unsigned int idx = 0;
// 16 elements per iteration: split into low/high halves, widen twice
for (; (N - idx) >= 16; idx += 16) {
int8x16_t batch = vld1q_s8(&X[idx]);
int8x8_t low = vget_low_s8(batch);
int8x8_t high = vget_high_s8(batch);

// convert to s16
int16x8_t batch_low_s16 = vmovl_s8(low);
int16x8_t batch_high_s16 = vmovl_s8(high);

// convert to s32
int32x4_t batch_low_s32_low = vmovl_s16(vget_low_s16(batch_low_s16));
int32x4_t batch_low_s32_high = vmovl_s16(vget_high_s16(batch_low_s16));
int32x4_t batch_high_s32_low = vmovl_s16(vget_low_s16(batch_high_s16));
int32x4_t batch_high_s32_high = vmovl_s16(vget_high_s16(batch_high_s16));

// todo : experiment with vcvt_f32_s32_ bitwise operation w.r.t.
// time/accuracy
vst1q_f32(&Y[idx], vcvtq_f32_s32(batch_low_s32_low));
vst1q_f32(&Y[idx + 4], vcvtq_f32_s32(batch_low_s32_high));
vst1q_f32(&Y[idx + 8], vcvtq_f32_s32(batch_high_s32_low));
vst1q_f32(&Y[idx + 12], vcvtq_f32_s32(batch_high_s32_high));
}
// 8 elements per iteration for the mid-size remainder
for (; (N - idx) >= 8; idx += 8) {
int8x8_t batch = vld1_s8(&X[idx]);

// convert to s16
int16x8_t batch_s16 = vmovl_s8(batch);

// convert to s32
int32x4_t batch_s32_low = vmovl_s16(vget_low_s16(batch_s16));
int32x4_t batch_s32_high = vmovl_s16(vget_high_s16(batch_s16));

vst1q_f32(&Y[idx], vcvtq_f32_s32(batch_s32_low));
vst1q_f32(&Y[idx + 4], vcvtq_f32_s32(batch_s32_high));
}
// scalar tail (fewer than 8 elements left)
for (; (N - idx) >= 1; ++idx) {
Y[idx] = X[idx];
}
}

void copy_fp16_to_fp32(const unsigned int N, const __fp16 *X, float *Y) {
unsigned int idx = 0;

Expand Down
24 changes: 24 additions & 0 deletions nntrainer/tensor/blas_neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,29 @@ void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y);
*/
void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y);

/**
* @brief copy function with neon: Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y float * for Vector Y
*/
void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y);

/**
* @brief copy function with neon: Y = X
* @param[in] N number of elements in X
* @param[in] X uint8_t * for Vector X
* @param[in] Y uint8_t * for Vector Y
*/
void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y);

/**
* @brief copy function with neon: Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y int8_t * for Vector Y
*/
void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y);
/**
* @brief sine with neon: Y = sin(alpha * X)
* @param[in] N number of elements in X
Expand Down Expand Up @@ -311,6 +327,14 @@ void copy_int4_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y);
*/
void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y);

/**
* @brief copy function with neon: Y = X
* @param[in] N number of elements in X
* @param[in] X int8_t * for Vector X
* @param[in] Y __fp16 * for Vector Y
*/
void copy_int8_to_fp16(const unsigned int N, const int8_t *X, __fp16 *Y);

/**
* @brief copy function with neon: Y = X
* @param[in] N number of elements in X
Expand Down
4 changes: 1 addition & 3 deletions nntrainer/tensor/char_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,9 +361,7 @@ void CharTensor::copy(const void *buf) {
}

/// @todo need to optimize
for (unsigned int i = 0; i < size(); ++i) {
((int8_t *)getData())[i] = ((int8_t *)buf)[i];
}
scopy(size(), (int8_t *)buf, 1, (int8_t *)getData(), 1);
}

void CharTensor::save_quantization_info(std::ostream &file) {
Expand Down
2 changes: 1 addition & 1 deletion nntrainer/tensor/float_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -764,7 +764,7 @@ void FloatTensor::copyData(const Tensor &from) {
#endif
break;
case ml::train::TensorDim::DataType::QINT8:
scopy_int8_to_float32(from.size(), from.getData<uint8_t>(), 1,
scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
(float *)getData(), 1);
break;
default:
Expand Down
2 changes: 1 addition & 1 deletion nntrainer/tensor/half_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -983,7 +983,7 @@ void HalfTensor::copyData(const Tensor &from) {
copy(from.getData<_FP16>());
break;
case ml::train::TensorDim::DataType::QINT8:
scopy_int8_to_float16(from.size(), from.getData<uint8_t>(), 1,
scopy_int8_to_float16(from.size(), from.getData<int8_t>(), 1,
(_FP16 *)getData(), 1);
break;
default:
Expand Down

0 comments on commit ccf0906

Please sign in to comment.