nnstreamer · skykongkong8 · Jan 16, 2025
@@ -979,13 +979,18 @@ void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
   }
 }
 
+/// @brief Element-by-element (scalar) int16_t to float conversion.
+/// @param[in] N number of elements to convert
+/// @param[in] X source array, read at indices [0, N)
+/// @param[out] Y destination array, written at indices [0, N)
+static inline void copy_s16_fp32_fallback(const unsigned int N,
+                                          const int16_t *X, float *Y) {
+  const int16_t *src = X;
+  float *dst = Y;
+  // Walk both arrays in lockstep; nothing is touched when N is 0.
+  for (unsigned int remaining = N; remaining > 0; --remaining) {
+    *dst++ = static_cast<float>(*src++);
+  }
+}
+
 void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
+  // Converts N int16_t values from X into floats in Y.
+  // Exactly one implementation may run per call. Without the #else, a
+  // USE_NEON build would call the NEON routine and then run the scalar
+  // conversion over the same N elements a second time.
 #ifdef USE_NEON
   nntrainer::neon::copy_s16_fp32(N, X, Y);
-#endif
-  for (unsigned int idx = 0; idx < N; ++idx) {
-    Y[idx] = (float)X[idx];
-  }
+#else
+  copy_s16_fp32_fallback(N, X, Y);
+#endif
 }
 
 float snrm2(const int N, const float *X, const int incX) {

@@ -1598,8 +1598,23 @@ void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y) {
 }
 
 void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
-  /// @todo implement int16_t to fp32
   unsigned int idx = 0;
+  for (; (N - idx) >= 8; idx += 8) {
+    int16x8_t batch = vld1q_s16(&X[idx]);
+    int16x4_t low = vget_low_s16(batch);
+    int16x4_t high = vget_high_s16(batch);
+
+    // widen to s32
+    int32x4_t low_s32 = vmovl_s16(low);
+    int32x4_t high_s32 = vmovl_s16(high);
+
+    // convert to f32
+    float32x4_t low_f32 = vcvtq_f32_s32(low_s32);
+    float32x4_t high_f32 = vcvtq_f32_s32(high_s32);
+
+    vst1q_f32(&Y[idx], low_f32);
+    vst1q_f32(&Y[idx + 4], high_f32);
+  }
   for (; (N - idx) >= 1; ++idx) {
     Y[idx] = X[idx];
   }