From a3fd8d384f29e0febc4a348a27f2c68cde4582e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=91=D1=80=D0=B0=D0=BD=D0=B8=D0=BC=D0=B8=D1=80=20=D0=9A?=
 =?UTF-8?q?=D0=B0=D1=80=D0=B0=D1=9F=D0=B8=D1=9B?=
 <branimirkaradzic@gmail.com>
Date: Thu, 6 Feb 2020 08:31:17 -0800
Subject: [PATCH] Added cmpneq.

---
 include/bx/inline/simd128_langext.inl |  8 ++++++
 include/bx/inline/simd128_neon.inl    | 40 +++++++++++++--------------
 include/bx/inline/simd128_ref.inl     | 11 ++++++++
 include/bx/inline/simd128_sse.inl     |  6 ++++
 include/bx/inline/simd_ni.inl         |  9 ++++++
 include/bx/simd_t.h                   |  3 ++
 6 files changed, 56 insertions(+), 21 deletions(-)

diff --git a/include/bx/inline/simd128_langext.inl b/include/bx/inline/simd128_langext.inl
index d48e55376..a0a66affd 100644
--- a/include/bx/inline/simd128_langext.inl
+++ b/include/bx/inline/simd128_langext.inl
@@ -346,6 +346,14 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf);
 		return result;
 	}
 
+	template<> // Explicit specialization of simd_cmpneq ("compare not equal") for the langext backend.
+	BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmpneq(simd128_langext_t _a, simd128_langext_t _b)
+	{
+		simd128_langext_t result;
+		result.vi = _a.vf != _b.vf; // Compare through the .vf members; the comparison result is written through .vi.
+		return result; // NOTE(review): presumably an all-ones / zero mask per lane (compiler vector-extension semantics) -- confirm.
+	}
+
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmplt(simd128_langext_t _a, simd128_langext_t _b)
 	{
diff --git a/include/bx/inline/simd128_neon.inl b/include/bx/inline/simd128_neon.inl
index 5dfce0de5..dd1ebb0bd 100644
--- a/include/bx/inline/simd128_neon.inl
+++ b/include/bx/inline/simd128_neon.inl
@@ -9,23 +9,15 @@
 
 namespace bx
 {
-#if BX_COMPILER_CLANG
 
-#define SHUFFLE_A(_a,  _i0, _i1, _i2, _i3)	\
-__builtin_shufflevector(_a, _a, _i0, _i1, _i2, _i3 )
-#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3)	\
-__builtin_shufflevector(_a, _b, _i0, _i1, _i2, _i3 )
-	
+#if BX_COMPILER_CLANG // Clang build: the shuffle macros expand to __builtin_shufflevector.
+#	define SHUFFLE_A(_a,  _i0, _i1, _i2, _i3)     __builtin_shufflevector(_a, _a, _i0, _i1, _i2, _i3 ) // One source vector: _a is passed as both operands.
+#	define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) __builtin_shufflevector(_a, _b, _i0, _i1, _i2, _i3 ) // Two source vectors: _a and _b.
 #else
-
-#define SHUFFLE_A(_a,  _i0, _i1, _i2, _i3)	\
-__builtin_shuffle(_a, (uint32x4_t){ _i0, _i1, _i2, _i3 })
-#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3)	\
-__builtin_shuffle(_a, _b, (uint32x4_t){ _i0, _i1, _i2, _i3 })
-
+#	define SHUFFLE_A(_a,  _i0, _i1, _i2, _i3)     __builtin_shuffle(_a, (uint32x4_t){ _i0, _i1, _i2, _i3 }) // Non-Clang build: __builtin_shuffle with the four indices packed into a uint32x4_t.
+#	define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) __builtin_shuffle(_a, _b, (uint32x4_t){ _i0, _i1, _i2, _i3 }) // Two-source form: _a and _b plus the same kind of index vector.
 #endif
 
-	
 #define ELEMx 0
 #define ELEMy 1
 #define ELEMz 2
@@ -291,11 +283,17 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 		return result;
 	}
 
+	template<> // Explicit specialization of simd_cmpneq ("compare not equal") for the NEON backend.
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpneq(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return simd_cmpneq_ni(_a, _b); // Delegates to the generic helper, which applies simd_not to simd_cmpeq(_a, _b).
+	}
+
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmplt(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		const uint32x4_t tmp    = vcltq_f32(_a, _b);
-		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+		const uint32x4_t tmp        = vcltq_f32(_a, _b); // "Less than" compare of _a against _b; the intrinsic's result is held as uint32x4_t.
+		const simd128_neon_t result = vreinterpretq_f32_u32(tmp); // Relabel the u32 result as an f32 vector so it can be returned as simd128_neon_t.
 
 		return result;
 	}
@@ -303,8 +301,8 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmple(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		const uint32x4_t tmp    = vcleq_f32(_a, _b);
-		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+		const uint32x4_t tmp        = vcleq_f32(_a, _b); // "Less than or equal" compare of _a against _b; the intrinsic's result is held as uint32x4_t.
+		const simd128_neon_t result = vreinterpretq_f32_u32(tmp); // Relabel the u32 result as an f32 vector so it can be returned as simd128_neon_t.
 
 		return result;
 	}
@@ -312,8 +310,8 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpgt(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		const uint32x4_t tmp    = vcgtq_f32(_a, _b);
-		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+		const uint32x4_t tmp        = vcgtq_f32(_a, _b); // "Greater than" compare of _a against _b; the intrinsic's result is held as uint32x4_t.
+		const simd128_neon_t result = vreinterpretq_f32_u32(tmp); // Relabel the u32 result as an f32 vector so it can be returned as simd128_neon_t.
 
 		return result;
 	}
@@ -321,8 +319,8 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpge(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		const uint32x4_t tmp    = vcgeq_f32(_a, _b);
-		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+		const uint32x4_t tmp        = vcgeq_f32(_a, _b); // "Greater than or equal" compare of _a against _b; the intrinsic's result is held as uint32x4_t.
+		const simd128_neon_t result = vreinterpretq_f32_u32(tmp); // Relabel the u32 result as an f32 vector so it can be returned as simd128_neon_t.
 
 		return result;
 	}
diff --git a/include/bx/inline/simd128_ref.inl b/include/bx/inline/simd128_ref.inl
index b99976e29..7b91af517 100644
--- a/include/bx/inline/simd128_ref.inl
+++ b/include/bx/inline/simd128_ref.inl
@@ -396,6 +396,17 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf);
 		return result;
 	}
 
+	template<> // Explicit specialization of simd_cmpneq ("compare not equal") for the scalar reference backend.
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpneq(simd128_ref_t _a, simd128_ref_t _b)
+	{
+		simd128_ref_t result;
+		result.ixyzw[0] = _a.fxyzw[0] != _b.fxyzw[0] ? 0xffffffff : 0x0; // Entry 0: 0xffffffff when the two fxyzw values differ, 0x0 when they compare equal.
+		result.ixyzw[1] = _a.fxyzw[1] != _b.fxyzw[1] ? 0xffffffff : 0x0; // Entry 1: same rule.
+		result.ixyzw[2] = _a.fxyzw[2] != _b.fxyzw[2] ? 0xffffffff : 0x0; // Entry 2: same rule.
+		result.ixyzw[3] = _a.fxyzw[3] != _b.fxyzw[3] ? 0xffffffff : 0x0; // Entry 3: same rule.
+		return result; // All four ixyzw entries have been assigned; each is either 0xffffffff or 0x0.
+	}
+
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmplt(simd128_ref_t _a, simd128_ref_t _b)
 	{
diff --git a/include/bx/inline/simd128_sse.inl b/include/bx/inline/simd128_sse.inl
index 2fe09e33c..79f185bbc 100644
--- a/include/bx/inline/simd128_sse.inl
+++ b/include/bx/inline/simd128_sse.inl
@@ -308,6 +308,12 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf);
 		return _mm_cmpeq_ps(_a, _b);
 	}
 
+	template<> // Explicit specialization of simd_cmpneq ("compare not equal") for the SSE backend.
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpneq(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_cmpneq_ps(_a, _b); // Single call to the SSE "not equal" compare intrinsic; its result is returned unchanged.
+	}
+
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmplt(simd128_sse_t _a, simd128_sse_t _b)
 	{
diff --git a/include/bx/inline/simd_ni.inl b/include/bx/inline/simd_ni.inl
index 69d5110f4..499e54f26 100644
--- a/include/bx/inline/simd_ni.inl
+++ b/include/bx/inline/simd_ni.inl
@@ -124,6 +124,15 @@ namespace bx
 		return result;
 	}
 
+	template<typename Ty> // Generic "compare not equal" helper, built only from simd_cmpeq and simd_not on Ty.
+	BX_SIMD_INLINE Ty simd_cmpneq_ni(Ty _a, Ty _b)
+	{
+		const Ty tmp0   = simd_cmpeq(_a, _b); // Equality compare of _a and _b.
+		const Ty result = simd_not(tmp0); // Invert the equality result to obtain "not equal".
+
+		return result;
+	}
+
 	template<typename Ty>
 	BX_SIMD_INLINE Ty simd_min_ni(Ty _a, Ty _b)
 	{
diff --git a/include/bx/simd_t.h b/include/bx/simd_t.h
index 0b0756731..5e5fefa07 100644
--- a/include/bx/simd_t.h
+++ b/include/bx/simd_t.h
@@ -196,6 +196,9 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw);
 	template<typename Ty>
 	Ty simd_cmpeq(Ty _a, Ty _b);
 
+	template<typename Ty> // Declaration only; this patch adds explicit specializations for the langext, NEON, ref and SSE vector types.
+	Ty simd_cmpneq(Ty _a, Ty _b);
+
 	template<typename Ty>
 	Ty simd_cmplt(Ty _a, Ty _b);