From d5d22de38cb02f806fc8116254150942dee71d19 Mon Sep 17 00:00:00 2001
From: "-T.K.-"
Date: Fri, 13 Dec 2024 15:58:27 -0800
Subject: [PATCH] ADD: add 1d softmax

---
 CMakeLists.txt                         |  1 +
 converter/src/torchconverter/tracer.py |  4 ++++
 src/cpu/softmax.c                      | 30 ++++++++++++++++++++++++++
 tests/generate_test.py                 |  3 +++
 tests/src/generated.c                  | 30 ++++++++++++++++++++++++++
 5 files changed, 68 insertions(+)
 create mode 100644 src/cpu/softmax.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 221efdf..a41172b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -102,6 +102,7 @@ set(cpu_impl
   ./src/cpu/mul.c
   ./src/cpu/mulscalar.c
   ./src/cpu/relu.c
+  ./src/cpu/softmax.c
   ./src/cpu/tanh.c
 )
 
diff --git a/converter/src/torchconverter/tracer.py b/converter/src/torchconverter/tracer.py
index 340ebbf..300548c 100644
--- a/converter/src/torchconverter/tracer.py
+++ b/converter/src/torchconverter/tracer.py
@@ -322,6 +322,10 @@ def handle_call_module(self, n: torch.fx.node.Node, out: torch.Tensor):
         elif type(module) == torch.nn.Tanh:
             self.add_uninitialized_tensor(layer_name, out)
             self.add_forward_call("nn_tanh{dim}d_{dtype}", out, layer_name, input_names)
+
+        elif type(module) == torch.nn.Softmax:
+            self.add_uninitialized_tensor(layer_name, out)
+            self.add_forward_call("nn_softmax{dim}d_{dtype}", out, layer_name, input_names)
 
         # Linear Layers
         elif type(module) == torch.nn.Linear:
diff --git a/src/cpu/softmax.c b/src/cpu/softmax.c
new file mode 100644
index 0000000..247c727
--- /dev/null
+++ b/src/cpu/softmax.c
@@ -0,0 +1,30 @@
+#include "nn.h"
+
+
+__attribute__((weak)) void nn_softmax1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x) {
+  nn_assert(y->shape[0] == x->shape[0], "Cannot apply softmax to tensors of different shapes");
+
+  size_t n = y->shape[0];
+  float sum = 0.0f;
+  for (size_t i = 0; i < n; i += 1) {
+    sum += expf(as_f32(x->data[i]));
+  }
+
+  for (size_t i = 0; i < n; i += 1) {
+    y->data[i] = as_f16(expf(as_f32(x->data[i])) / sum);
+  }
+}
+
+__attribute__((weak)) void nn_softmax1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x) {
+  nn_assert(y->shape[0] == x->shape[0], "Cannot apply softmax to tensors of different shapes");
+
+  size_t n = y->shape[0];
+  float sum = 0.0f;
+  for (size_t i = 0; i < n; i += 1) {
+    sum += expf(x->data[i]);
+  }
+
+  for (size_t i = 0; i < n; i += 1) {
+    y->data[i] = expf(x->data[i]) / sum;
+  }
+}
diff --git a/tests/generate_test.py b/tests/generate_test.py
index 182bc3b..96abd38 100644
--- a/tests/generate_test.py
+++ b/tests/generate_test.py
@@ -300,6 +300,9 @@ def generate(self, out_file: str):
     # Tanh
     t.add_test("nn_tanh2d_f32", lambda x: torch.nn.functional.tanh(x), [("x", t.rand((7, 7))) ])
 
+    # Softmax
+    t.add_test("nn_softmax1d_f32", lambda x: torch.nn.functional.softmax(x, dim=0), [("x", t.rand((7, )))])
+
     t.generate(out_file)
diff --git a/tests/src/generated.c b/tests/src/generated.c
index 74c86ee..06b2549 100644
--- a/tests/src/generated.c
+++ b/tests/src/generated.c
@@ -1103,4 +1103,34 @@ int main() {
     // nn_free_tensor_data(actual);
   }
 
+  {
+    printf("nn_softmax1d_f32: ");
+
+    // [ 1.463511 -2.0732946 2.5087662 -0.44954896 0.43284953 -3.458044 -4.46862
+    Tensor1D_F32 x = {
+      .shape = { 7 },
+      .data = (float *)((uint8_t[]){ 0x54,0x54,0xbb,0x3f,0xdc,0xb0,0x4,0xc0,0xa0,0x8f,0x20,0x40,0x48,0x2b,0xe6,0xbe,0x74,0x9e,0xdd,0x3e,0x98,0x50,0x5d,0xc0,0x0,0xff,0x8e,0xc0 })
+    };
+
+
+    // [2.2791658e-01 6.6337758e-03 6.4822310e-01 3.3646863e-02 8.1313998e-02 1.661008
+    Tensor1D_F32 golden = {
+      .shape = { 7 },
+      .data = (float *)((uint8_t[]){ 0xf7,0x62,0x69,0x3e,0x25,0x60,0xd9,0x3b,0xf3,0xf1,0x25,0x3f,0x4b,0xd1,0x9,0x3d,0xf4,0x87,0xa6,0x3d,0x2f,0xb6,0xd9,0x3a,0x32,0x7f,0x1e,0x3a })
+    };
+    //
+    Tensor1D_F32 actual = {
+      .shape = { 7 },
+      .data = (float *)malloc(sizeof(float) * 7)
+    };
+
+    cycles = read_cycles();
+    nn_softmax1d_f32(&actual, &x);
+    cycles = read_cycles() - cycles;
+    printf("%s (%lu cycles)\n", nn_equals1d_f32(&golden, &actual, 1e-4) ? "PASS" : "FAIL", cycles);
+
+
+    // nn_free_tensor_data(actual);
+  }
+
 }
\ No newline at end of file
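
Not part of the patch itself: a minimal usage sketch of the new f32 kernel, for reviewers who want to try it outside the generated test harness. It relies only on names that appear in the diff above (Tensor1D_F32, its .shape/.data fields, nn_softmax1d_f32); the example logits and the assumption that "nn.h" is on the include path are illustrative, not taken from the repository.

#include <stdio.h>
#include <stdlib.h>

#include "nn.h"

int main() {
  // Any real-valued logits work; softmax normalizes them into probabilities.
  float logits[3] = { 1.0f, 2.0f, 3.0f };

  Tensor1D_F32 x = { .shape = { 3 }, .data = logits };
  Tensor1D_F32 y = { .shape = { 3 }, .data = (float *)malloc(sizeof(float) * 3) };

  nn_softmax1d_f32(&y, &x);

  // Output sums to 1: roughly 0.0900, 0.2447, 0.6652 for these inputs.
  for (size_t i = 0; i < 3; i += 1) {
    printf("%f\n", y.data[i]);
  }

  free(y.data);
  return 0;
}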